@agentmemory/agentmemory 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. package/.claude-plugin/marketplace.json +14 -0
  2. package/.github/workflows/ci.yml +22 -0
  3. package/.github/workflows/publish.yml +28 -0
  4. package/AGENTS.md +113 -0
  5. package/LICENSE +190 -0
  6. package/README.md +828 -0
  7. package/assets/banner.png +0 -0
  8. package/assets/demo.gif +0 -0
  9. package/assets/demo.mp4 +0 -0
  10. package/benchmark/QUALITY.md +73 -0
  11. package/benchmark/REAL-EMBEDDINGS.md +67 -0
  12. package/benchmark/SCALE.md +110 -0
  13. package/benchmark/dataset.ts +293 -0
  14. package/benchmark/quality-eval.ts +643 -0
  15. package/benchmark/real-embeddings-eval.ts +405 -0
  16. package/benchmark/scale-eval.ts +398 -0
  17. package/dist/cli.d.mts +1 -0
  18. package/dist/cli.mjs +137 -0
  19. package/dist/cli.mjs.map +1 -0
  20. package/dist/docker-compose.yml +14 -0
  21. package/dist/hooks/notification.d.mts +1 -0
  22. package/dist/hooks/notification.mjs +45 -0
  23. package/dist/hooks/notification.mjs.map +1 -0
  24. package/dist/hooks/post-tool-failure.d.mts +1 -0
  25. package/dist/hooks/post-tool-failure.mjs +45 -0
  26. package/dist/hooks/post-tool-failure.mjs.map +1 -0
  27. package/dist/hooks/post-tool-use.d.mts +1 -0
  28. package/dist/hooks/post-tool-use.mjs +53 -0
  29. package/dist/hooks/post-tool-use.mjs.map +1 -0
  30. package/dist/hooks/pre-compact.d.mts +1 -0
  31. package/dist/hooks/pre-compact.mjs +50 -0
  32. package/dist/hooks/pre-compact.mjs.map +1 -0
  33. package/dist/hooks/pre-tool-use.d.mts +1 -0
  34. package/dist/hooks/pre-tool-use.mjs +69 -0
  35. package/dist/hooks/pre-tool-use.mjs.map +1 -0
  36. package/dist/hooks/prompt-submit.d.mts +1 -0
  37. package/dist/hooks/prompt-submit.mjs +40 -0
  38. package/dist/hooks/prompt-submit.mjs.map +1 -0
  39. package/dist/hooks/session-end.d.mts +1 -0
  40. package/dist/hooks/session-end.mjs +61 -0
  41. package/dist/hooks/session-end.mjs.map +1 -0
  42. package/dist/hooks/session-start.d.mts +1 -0
  43. package/dist/hooks/session-start.mjs +42 -0
  44. package/dist/hooks/session-start.mjs.map +1 -0
  45. package/dist/hooks/stop.d.mts +1 -0
  46. package/dist/hooks/stop.mjs +33 -0
  47. package/dist/hooks/stop.mjs.map +1 -0
  48. package/dist/hooks/subagent-start.d.mts +1 -0
  49. package/dist/hooks/subagent-start.mjs +43 -0
  50. package/dist/hooks/subagent-start.mjs.map +1 -0
  51. package/dist/hooks/subagent-stop.d.mts +1 -0
  52. package/dist/hooks/subagent-stop.mjs +45 -0
  53. package/dist/hooks/subagent-stop.mjs.map +1 -0
  54. package/dist/hooks/task-completed.d.mts +1 -0
  55. package/dist/hooks/task-completed.mjs +46 -0
  56. package/dist/hooks/task-completed.mjs.map +1 -0
  57. package/dist/iii-config.yaml +51 -0
  58. package/dist/index.d.mts +2 -0
  59. package/dist/index.mjs +13776 -0
  60. package/dist/index.mjs.map +1 -0
  61. package/dist/src-QxitMPfJ.mjs +13775 -0
  62. package/dist/src-QxitMPfJ.mjs.map +1 -0
  63. package/dist/standalone.d.mts +1 -0
  64. package/dist/standalone.mjs +1155 -0
  65. package/dist/standalone.mjs.map +1 -0
  66. package/dist/transformers-BX_tgxdO.mjs +38684 -0
  67. package/dist/transformers-BX_tgxdO.mjs.map +1 -0
  68. package/dist/transformers-KMm1i9no.mjs +38683 -0
  69. package/dist/transformers-KMm1i9no.mjs.map +1 -0
  70. package/docker-compose.yml +14 -0
  71. package/iii-config.yaml +51 -0
  72. package/package.json +59 -0
  73. package/plugin/.claude-plugin/plugin.json +10 -0
  74. package/plugin/hooks/hooks.json +77 -0
  75. package/plugin/scripts/diagnostics.mjs +551 -0
  76. package/plugin/scripts/notification.mjs +45 -0
  77. package/plugin/scripts/post-tool-failure.mjs +45 -0
  78. package/plugin/scripts/post-tool-use.mjs +53 -0
  79. package/plugin/scripts/pre-compact.mjs +50 -0
  80. package/plugin/scripts/pre-tool-use.mjs +69 -0
  81. package/plugin/scripts/prompt-submit.mjs +40 -0
  82. package/plugin/scripts/session-end.mjs +61 -0
  83. package/plugin/scripts/session-start.mjs +42 -0
  84. package/plugin/scripts/stop.mjs +33 -0
  85. package/plugin/scripts/subagent-start.mjs +43 -0
  86. package/plugin/scripts/subagent-stop.mjs +45 -0
  87. package/plugin/scripts/task-completed.mjs +46 -0
  88. package/plugin/skills/forget/SKILL.md +32 -0
  89. package/plugin/skills/recall/SKILL.md +18 -0
  90. package/plugin/skills/remember/SKILL.md +25 -0
  91. package/plugin/skills/session-history/SKILL.md +17 -0
  92. package/src/auth.ts +12 -0
  93. package/src/cli.ts +159 -0
  94. package/src/config.ts +221 -0
  95. package/src/eval/metrics-store.ts +65 -0
  96. package/src/eval/quality.ts +51 -0
  97. package/src/eval/schemas.ts +124 -0
  98. package/src/eval/self-correct.ts +28 -0
  99. package/src/eval/validator.ts +31 -0
  100. package/src/functions/actions.ts +288 -0
  101. package/src/functions/audit.ts +61 -0
  102. package/src/functions/auto-forget.ts +169 -0
  103. package/src/functions/branch-aware.ts +169 -0
  104. package/src/functions/cascade.ts +80 -0
  105. package/src/functions/checkpoints.ts +209 -0
  106. package/src/functions/claude-bridge.ts +161 -0
  107. package/src/functions/compress.ts +194 -0
  108. package/src/functions/consolidate.ts +212 -0
  109. package/src/functions/consolidation-pipeline.ts +258 -0
  110. package/src/functions/context.ts +169 -0
  111. package/src/functions/crystallize.ts +293 -0
  112. package/src/functions/dedup.ts +57 -0
  113. package/src/functions/diagnostics.ts +785 -0
  114. package/src/functions/enrich.ts +132 -0
  115. package/src/functions/evict.ts +163 -0
  116. package/src/functions/export-import.ts +508 -0
  117. package/src/functions/facets.ts +248 -0
  118. package/src/functions/file-index.ts +106 -0
  119. package/src/functions/flow-compress.ts +214 -0
  120. package/src/functions/frontier.ts +196 -0
  121. package/src/functions/governance.ts +131 -0
  122. package/src/functions/graph-retrieval.ts +277 -0
  123. package/src/functions/graph.ts +275 -0
  124. package/src/functions/leases.ts +216 -0
  125. package/src/functions/lessons.ts +253 -0
  126. package/src/functions/mesh.ts +434 -0
  127. package/src/functions/migrate.ts +165 -0
  128. package/src/functions/observe.ts +144 -0
  129. package/src/functions/obsidian-export.ts +310 -0
  130. package/src/functions/patterns.ts +138 -0
  131. package/src/functions/privacy.ts +39 -0
  132. package/src/functions/profile.ts +155 -0
  133. package/src/functions/query-expansion.ts +186 -0
  134. package/src/functions/relations.ts +237 -0
  135. package/src/functions/remember.ts +162 -0
  136. package/src/functions/retention.ts +235 -0
  137. package/src/functions/routines.ts +289 -0
  138. package/src/functions/search.ts +80 -0
  139. package/src/functions/sentinels.ts +417 -0
  140. package/src/functions/signals.ts +186 -0
  141. package/src/functions/sketches.ts +274 -0
  142. package/src/functions/sliding-window.ts +257 -0
  143. package/src/functions/smart-search.ts +115 -0
  144. package/src/functions/snapshot.ts +219 -0
  145. package/src/functions/summarize.ts +155 -0
  146. package/src/functions/team.ts +147 -0
  147. package/src/functions/temporal-graph.ts +476 -0
  148. package/src/functions/timeline.ts +138 -0
  149. package/src/functions/verify.ts +117 -0
  150. package/src/health/monitor.ts +110 -0
  151. package/src/health/thresholds.ts +73 -0
  152. package/src/hooks/notification.ts +52 -0
  153. package/src/hooks/post-tool-failure.ts +58 -0
  154. package/src/hooks/post-tool-use.ts +62 -0
  155. package/src/hooks/pre-compact.ts +60 -0
  156. package/src/hooks/pre-tool-use.ts +72 -0
  157. package/src/hooks/prompt-submit.ts +46 -0
  158. package/src/hooks/session-end.ts +71 -0
  159. package/src/hooks/session-start.ts +48 -0
  160. package/src/hooks/stop.ts +39 -0
  161. package/src/hooks/subagent-start.ts +49 -0
  162. package/src/hooks/subagent-stop.ts +54 -0
  163. package/src/hooks/task-completed.ts +54 -0
  164. package/src/index.ts +342 -0
  165. package/src/mcp/in-memory-kv.ts +61 -0
  166. package/src/mcp/server.ts +1455 -0
  167. package/src/mcp/standalone.ts +177 -0
  168. package/src/mcp/tools-registry.ts +769 -0
  169. package/src/mcp/transport.ts +91 -0
  170. package/src/prompts/compression.ts +67 -0
  171. package/src/prompts/consolidation.ts +48 -0
  172. package/src/prompts/graph-extraction.ts +35 -0
  173. package/src/prompts/summary.ts +38 -0
  174. package/src/prompts/xml.ts +26 -0
  175. package/src/providers/agent-sdk.ts +34 -0
  176. package/src/providers/anthropic.ts +35 -0
  177. package/src/providers/circuit-breaker.ts +82 -0
  178. package/src/providers/embedding/cohere.ts +46 -0
  179. package/src/providers/embedding/gemini.ts +54 -0
  180. package/src/providers/embedding/index.ts +39 -0
  181. package/src/providers/embedding/local.ts +52 -0
  182. package/src/providers/embedding/openai.ts +45 -0
  183. package/src/providers/embedding/openrouter.ts +51 -0
  184. package/src/providers/embedding/voyage.ts +46 -0
  185. package/src/providers/fallback-chain.ts +31 -0
  186. package/src/providers/index.ts +84 -0
  187. package/src/providers/openrouter.ts +71 -0
  188. package/src/providers/resilient.ts +37 -0
  189. package/src/state/hybrid-search.ts +295 -0
  190. package/src/state/index-persistence.ts +63 -0
  191. package/src/state/keyed-mutex.ts +18 -0
  192. package/src/state/kv.ts +33 -0
  193. package/src/state/schema.ts +71 -0
  194. package/src/state/search-index.ts +245 -0
  195. package/src/state/stemmer.ts +104 -0
  196. package/src/state/synonyms.ts +63 -0
  197. package/src/state/vector-index.ts +130 -0
  198. package/src/telemetry/setup.ts +116 -0
  199. package/src/triggers/api.ts +1904 -0
  200. package/src/triggers/events.ts +71 -0
  201. package/src/types.ts +769 -0
  202. package/src/version.ts +1 -0
  203. package/src/viewer/index.html +2497 -0
  204. package/src/viewer/server.ts +207 -0
  205. package/src/xenova.d.ts +3 -0
  206. package/test/actions.test.ts +490 -0
  207. package/test/audit.test.ts +108 -0
  208. package/test/auto-forget.test.ts +188 -0
  209. package/test/cascade.test.ts +277 -0
  210. package/test/checkpoints.test.ts +493 -0
  211. package/test/circuit-breaker.test.ts +107 -0
  212. package/test/claude-bridge.test.ts +178 -0
  213. package/test/confidence.test.ts +247 -0
  214. package/test/consistency.test.ts +61 -0
  215. package/test/consolidation-pipeline.test.ts +251 -0
  216. package/test/crystallize.test.ts +521 -0
  217. package/test/diagnostics.test.ts +638 -0
  218. package/test/embedding-provider.test.ts +49 -0
  219. package/test/enrich.test.ts +209 -0
  220. package/test/eval.test.ts +300 -0
  221. package/test/export-import.test.ts +251 -0
  222. package/test/facets.test.ts +448 -0
  223. package/test/fallback-chain.test.ts +93 -0
  224. package/test/frontier.test.ts +485 -0
  225. package/test/governance.test.ts +147 -0
  226. package/test/graph-retrieval.test.ts +186 -0
  227. package/test/graph.test.ts +160 -0
  228. package/test/helpers/mocks.ts +40 -0
  229. package/test/hybrid-search.test.ts +145 -0
  230. package/test/index-persistence.test.ts +124 -0
  231. package/test/integration.test.ts +265 -0
  232. package/test/leases.test.ts +399 -0
  233. package/test/mcp-prompts.test.ts +218 -0
  234. package/test/mcp-resources.test.ts +286 -0
  235. package/test/mcp-standalone.test.ts +113 -0
  236. package/test/mesh.test.ts +700 -0
  237. package/test/privacy.test.ts +87 -0
  238. package/test/profile.test.ts +161 -0
  239. package/test/query-expansion.test.ts +154 -0
  240. package/test/relations.test.ts +198 -0
  241. package/test/retention.test.ts +245 -0
  242. package/test/routines.test.ts +497 -0
  243. package/test/schema-fingerprint.test.ts +81 -0
  244. package/test/schema.test.ts +42 -0
  245. package/test/search-index.test.ts +128 -0
  246. package/test/sentinels.test.ts +626 -0
  247. package/test/signals.test.ts +410 -0
  248. package/test/sketches.test.ts +549 -0
  249. package/test/sliding-window.test.ts +199 -0
  250. package/test/smart-search.test.ts +169 -0
  251. package/test/snapshot.test.ts +165 -0
  252. package/test/team.test.ts +156 -0
  253. package/test/temporal-graph.test.ts +378 -0
  254. package/test/timeline.test.ts +148 -0
  255. package/test/vector-index.test.ts +79 -0
  256. package/test/verify.test.ts +209 -0
  257. package/test/xml.test.ts +65 -0
  258. package/tsconfig.json +22 -0
  259. package/tsdown.config.ts +62 -0
Binary file
Binary file
Binary file
@@ -0,0 +1,73 @@
1
+ # agentmemory v0.6.0 — Search Quality Evaluation
2
+
3
+ **Date:** 2026-03-18T07:44:43.397Z
4
+ **Dataset:** 240 observations across 30 sessions (realistic coding project)
5
+ **Queries:** 20 labeled queries with ground-truth relevance
6
+ **Metric definitions:** Recall@K (fraction of relevant docs in top K), Precision@K (fraction of top K that are relevant), NDCG@10 (ranking quality), MRR (position of first relevant result)
7
+
8
+ ## Head-to-Head Comparison
9
+
10
+ | System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Latency | Tokens/query |
11
+ |--------|----------|-----------|-------------|---------|-----|---------|--------------|
12
+ | Built-in (CLAUDE.md / grep) | 37.0% | 55.8% | 78.0% | 80.3% | 82.5% | 0.50ms | 22,610 |
13
+ | Built-in (200-line MEMORY.md) | 27.4% | 37.8% | 63.0% | 56.4% | 65.5% | 0.16ms | 7,938 |
14
+ | BM25-only | 43.8% | 55.9% | 95.0% | 82.7% | 95.5% | 0.17ms | 3,142 |
15
+ | Dual-stream (BM25+Vector) | 42.4% | 58.6% | 90.0% | 84.7% | 95.4% | 0.71ms | 3,142 |
16
+ | Triple-stream (BM25+Vector+Graph) | 36.8% | 58.0% | 87.0% | 81.7% | 87.9% | 1.02ms | 3,142 |
17
+
18
+ ## Why This Matters
19
+
20
+ **Recall improvement:** agentmemory triple-stream finds 58.0% of relevant memories at K=10 vs 55.8% for keyword grep (+2.2 percentage points, a ~4% relative gain)
21
+ **Token savings:** agentmemory returns only the top 10 results (3,142 tokens) vs loading everything into context (22,610 tokens) — 86% reduction
22
+ **200-line cap:** Claude Code's MEMORY.md is capped at 200 lines. With 240 observations, it reaches only 37.8% recall at K=10 — memories from later sessions are simply invisible.
23
+
24
+ ## Per-Query Breakdown (Triple-Stream)
25
+
26
+ | Query | Category | Recall@10 | NDCG@10 | MRR | Relevant | Latency |
27
+ |-------|----------|-----------|---------|-----|----------|---------|
28
+ | How did we set up authentication? | semantic | 50.0% | 100.0% | 100.0% | 20 | 1.7ms |
29
+ | JWT token validation middleware | exact | 50.0% | 64.9% | 100.0% | 10 | 1.2ms |
30
+ | PostgreSQL connection issues | semantic | 33.3% | 100.0% | 100.0% | 30 | 1.0ms |
31
+ | Playwright test configuration | exact | 100.0% | 100.0% | 100.0% | 10 | 1.1ms |
32
+ | Why did the production deployment fail? | cross-session | 33.3% | 100.0% | 100.0% | 30 | 0.8ms |
33
+ | rate limiting implementation | exact | 80.0% | 64.1% | 33.3% | 10 | 0.7ms |
34
+ | What security measures did we add? | semantic | 33.3% | 100.0% | 100.0% | 30 | 0.7ms |
35
+ | database performance optimization | semantic | 0.0% | 0.0% | 7.1% | 25 | 0.8ms |
36
+ | Kubernetes pod crash debugging | entity | 100.0% | 96.7% | 100.0% | 5 | 1.2ms |
37
+ | Docker containerization setup | entity | 100.0% | 100.0% | 100.0% | 10 | 0.9ms |
38
+ | How does caching work in the app? | semantic | 25.0% | 64.9% | 100.0% | 20 | 0.8ms |
39
+ | test infrastructure and factories | exact | 50.0% | 64.9% | 100.0% | 10 | 0.7ms |
40
+ | What happened with the OAuth callback error? | cross-session | 100.0% | 54.1% | 16.7% | 5 | 1.1ms |
41
+ | monitoring and observability setup | semantic | 66.7% | 100.0% | 100.0% | 15 | 0.8ms |
42
+ | Prisma ORM configuration | entity | 25.7% | 93.6% | 100.0% | 35 | 1.8ms |
43
+ | CI/CD pipeline configuration | exact | 20.0% | 64.9% | 100.0% | 25 | 1.0ms |
44
+ | memory leak debugging | cross-session | 100.0% | 100.0% | 100.0% | 5 | 0.7ms |
45
+ | API design decisions | semantic | 25.0% | 64.9% | 100.0% | 20 | 1.4ms |
46
+ | zod validation schemas | entity | 66.7% | 100.0% | 100.0% | 15 | 0.7ms |
47
+ | infrastructure as code Terraform | entity | 100.0% | 100.0% | 100.0% | 5 | 1.5ms |
48
+
49
+ ## By Query Category
50
+
51
+ | Category | Avg Recall@10 | Avg NDCG@10 | Avg MRR | Queries |
52
+ |----------|---------------|-------------|---------|---------|
53
+ | exact | 60.0% | 71.8% | 86.7% | 5 |
54
+ | semantic | 33.3% | 75.7% | 86.7% | 7 |
55
+ | cross-session | 77.8% | 84.7% | 72.2% | 3 |
56
+ | entity | 78.5% | 98.1% | 100.0% | 5 |
57
+
58
+ ## Context Window Analysis
59
+
60
+ The fundamental problem with built-in agent memory:
61
+
62
+ | Observations | MEMORY.md tokens | agentmemory tokens (top 10) | Savings | MEMORY.md reachable |
63
+ |-------------|-----------------|---------------------------|---------|-------------------|
64
+ | 240 | 12,000 | 3,142 | 74% | 83% |
65
+ | 500 | 25,000 | 3,142 | 87% | 40% |
66
+ | 1,000 | 50,000 | 3,142 | 94% | 20% |
67
+ | 5,000 | 250,000 | 3,142 | 99% | 4% |
68
+
69
+ At 240 observations (our dataset), MEMORY.md already hits its 200-line cap and loses access to the most recent 40 observations. At 1,000 observations, 80% of memories are invisible. agentmemory always searches the full corpus.
70
+
71
+ ---
72
+
73
+ *100 evaluations across 5 systems. Ground-truth labels assigned by concept matching against observation metadata.*
@@ -0,0 +1,67 @@
1
+ # agentmemory v0.6.0 — Real Embeddings Quality Evaluation
2
+
3
+ **Date:** 2026-03-18T07:38:21.450Z
4
+ **Platform:** darwin arm64, Node v20.20.0
5
+ **Dataset:** 240 observations, 30 sessions, 20 labeled queries
6
+ **Embedding model:** Xenova/all-MiniLM-L6-v2 (384d, local, no API key)
7
+
8
+ ## Head-to-Head: Real Embeddings vs Keyword Search
9
+
10
+ | System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Avg Latency | Tokens/query |
11
+ |--------|----------|-----------|-------------|---------|-----|-------------|--------------|
12
+ | Built-in (grep all) | 37.0% | 55.8% | 78.0% | 80.3% | 82.5% | 0.44ms | 19,462 |
13
+ | BM25-only (stemmed+synonyms) | 43.8% | 55.9% | 95.0% | 82.7% | 95.5% | 0.26ms | 1,571 |
14
+ | Dual-stream (BM25+Xenova) | 43.8% | 64.1% | 98.0% | 94.9% | 100.0% | 2.39ms | 1,571 |
15
+ | Triple-stream (BM25+Xenova+Graph) | 43.8% | 64.1% | 98.0% | 94.9% | 100.0% | 2.07ms | 1,571 |
16
+
17
+ ## Improvement from Real Embeddings
18
+
19
+ Adding real vector embeddings to BM25 improves recall@10 by **8.2 percentage points**.
20
+ Token savings vs loading everything: **92%** (1,571 vs 19,462 tokens).
21
+
22
+ ## Per-Query: Where Real Embeddings Win
23
+
24
+ Queries where dual-stream (real embeddings) outperforms BM25-only:
25
+
26
+ | Query | Category | BM25 Recall@10 | +Vector Recall@10 | Delta |
27
+ |-------|----------|---------------|-------------------|-------|
28
+ | How did we set up authentication? | semantic | 25.0% | 45.0% | +20.0pp ** |
29
+ | Playwright test configuration | exact | 50.0% | 90.0% | +40.0pp ** |
30
+ | database performance optimization | semantic | 0.0% | 40.0% | +40.0pp ** |
31
+ | test infrastructure and factories | exact | 50.0% | 80.0% | +30.0pp ** |
32
+ | Prisma ORM configuration | entity | 14.3% | 28.6% | +14.3pp ** |
33
+ | CI/CD pipeline configuration | exact | 20.0% | 40.0% | +20.0pp ** |
34
+
35
+ ## By Category Comparison
36
+
37
+ | Category | Built-in grep | BM25 (stemmed) | +Real Vectors | +Graph |
38
+ |----------|--------------|----------------|--------------|--------|
39
+ | exact | 48.0% | 54.0% | 72.0% | 72.0% |
40
+ | semantic | 35.5% | 33.3% | 41.9% | 41.9% |
41
+ | cross-session | 77.8% | 77.8% | 77.8% | 77.8% |
42
+ | entity | 79.0% | 76.2% | 79.0% | 79.0% |
43
+
44
+ ## Embedding Performance
45
+
46
+ | System | Embedding Time | Model | Dimensions |
47
+ |--------|---------------|-------|------------|
48
+ | Dual-stream (BM25+Xenova) | 3.1s | Xenova/all-MiniLM-L6-v2 | 384 |
49
+ | Triple-stream (BM25+Xenova+Graph) | 2.9s | Xenova/all-MiniLM-L6-v2 | 384 |
50
+
51
+ Embedding is a one-time cost at ingestion. After indexing, search averages roughly 2 ms per query (see the Avg Latency column in the head-to-head table).
52
+
53
+ ## Key Findings
54
+
55
+ 1. **Semantic queries improve substantially**: 8.6pp recall@10 gain from real embeddings (33.3% → 41.9%)
56
+ 2. **"database performance optimization"** — the hardest query — goes from BM25 0.0% to vector-augmented 40.0%
57
+ 3. **Entity and cross-session queries** are already well-served by BM25+stemming — vectors add marginal value (+2.8pp and +0.0pp recall@10); exact queries, by contrast, gain 18.0pp
58
+ 4. **Local embeddings (Xenova)** run without API keys — zero cost, zero latency concerns
59
+
60
+ ## Recommendation
61
+
62
+ Enable local embeddings by default (`EMBEDDING_PROVIDER=local` or install `@xenova/transformers`).
63
+ This gives agentmemory genuine semantic search that built-in agent memories cannot match —
64
+ understanding that "database performance optimization" relates to "N+1 query fix" and "eager loading".
65
+
66
+ ---
67
+ *All measurements use Xenova/all-MiniLM-L6-v2 local embeddings (384 dimensions, no API calls).*
@@ -0,0 +1,110 @@
1
+ # agentmemory v0.6.0 — Scale & Cross-Session Evaluation
2
+
3
+ **Date:** 2026-03-18T07:45:03.529Z
4
+ **Platform:** darwin arm64, Node v20.20.0
5
+
6
+ ## 1. Scale: agentmemory vs Built-in Memory
7
+
8
+ Every built-in agent memory (CLAUDE.md, .cursorrules, Cline's memory-bank) loads ALL memory into context every session. agentmemory searches and returns only relevant results.
9
+
10
+ | Observations | Sessions | Index Build | BM25 Search | Hybrid Search | Heap | Context Tokens (built-in) | Context Tokens (agentmemory) | Savings | Built-in Unreachable |
11
+ |-------------|----------|------------|-------------|---------------|------|--------------------------|-----------------------------|---------|--------------------|
12
+ | 240 | 30 | 177ms | 0.112ms | 0.63ms | 9MB | 10,504 | 1,924 | 82% | 17% |
13
+ | 1,000 | 125 | 155ms | 0.317ms | 1.709ms | 6MB | 43,834 | 1,969 | 96% | 80% |
14
+ | 5,000 | 625 | 810ms | 1.496ms | 8.58ms | 25MB | 220,335 | 1,972 | 99% | 96% |
15
+ | 10,000 | 1250 | 1657ms | 3.195ms | 17.49ms | 1MB | 440,973 | 1,974 | 100% | 98% |
16
+ | 50,000 | 6250 | 9182ms | 22.827ms | 108.722ms | 316MB | 2,216,173 | 1,981 | 100% | 100% |
17
+
18
+ ### What the numbers mean
19
+
20
+ **Context Tokens (built-in):** How many tokens Claude Code/Cursor/Cline would consume loading ALL memory into the context window. At 5,000 observations, this is ~220K tokens — exceeding most context windows entirely.
21
+
22
+ **Context Tokens (agentmemory):** How many tokens the top-10 search results consume. Stays nearly constant (1,924–1,981 tokens in the table above) regardless of corpus size.
23
+
24
+ **Built-in Unreachable:** Percentage of memories that built-in systems CANNOT access because they exceed the 200-line MEMORY.md cap or context window limits. At 1,000 observations, 80% of your project history is invisible.
25
+
26
+ ### Storage Costs
27
+
28
+ | Observations | BM25 Index | Vector Index (d=384) | Total Storage |
29
+ |-------------|-----------|---------------------|---------------|
30
+ | 240 | 395 KB | 494 KB | 0.9 MB |
31
+ | 1,000 | 1,599 KB | 2,060 KB | 3.6 MB |
32
+ | 5,000 | 8,006 KB | 10,298 KB | 17.9 MB |
33
+ | 10,000 | 16,005 KB | 20,596 KB | 35.7 MB |
34
+ | 50,000 | 80,126 KB | 102,979 KB | 178.8 MB |
35
+
36
+ ## 2. Cross-Session Retrieval
37
+
38
+ Can the system find relevant information from past sessions? This is impossible for built-in memory once observations exceed the line/context cap.
39
+
40
+ | Query | Target Session | Gap | BM25 Found | BM25 Rank | Hybrid Found | Hybrid Rank | Built-in Visible |
41
+ |-------|---------------|-----|-----------|-----------|-------------|-------------|-----------------|
42
+ | How did we set up OAuth providers? | ses_005-009 | 24 | Yes | #1 | Yes | #1 | Yes |
43
+ | What was the N+1 query fix? | ses_010-014 | 18 | Yes | #1 | Yes | #2 | Yes |
44
+ | PostgreSQL full-text search setup | ses_010-014 | 17 | Yes | #1 | Yes | #1 | Yes |
45
+ | bcrypt password hashing configuration | ses_005-009 | 20 | Yes | #1 | Yes | #1 | Yes |
46
+ | Vitest unit testing setup | ses_020-024 | 9 | Yes | #1 | Yes | #1 | Yes |
47
+ | webhook retry exponential backoff | ses_015-019 | 14 | Yes | #1 | Yes | #1 | Yes |
48
+ | ESLint flat config migration | ses_000-004 | 29 | Yes | #1 | Yes | #1 | Yes |
49
+ | Kubernetes HPA autoscaling configuration | ses_025-029 | 4 | Yes | #1 | Yes | #1 | No |
50
+ | Prisma database seed script | ses_010-014 | 16 | Yes | #1 | Yes | #1 | Yes |
51
+ | API cursor-based pagination | ses_015-019 | 14 | Yes | #1 | Yes | #1 | Yes |
52
+ | CSRF protection double-submit cookie | ses_005-009 | 24 | Yes | #1 | Yes | #1 | Yes |
53
+ | blue-green deployment rollback | ses_025-029 | 4 | Yes | #1 | Yes | #1 | No |
54
+
55
+ **Summary:** agentmemory BM25 found 12/12 cross-session queries. Hybrid found 12/12. Built-in memory (200-line cap) could only reach 10/12.
56
+
57
+ ## 3. The Context Window Problem
58
+
59
+ ```
60
+ Agent context window: ~200K tokens
61
+ System prompt + tools: ~20K tokens
62
+ User conversation: ~30K tokens
63
+ Available for memory: ~150K tokens
64
+
65
+ At 50 tokens/observation:
66
+ 200 observations = 10,000 tokens (fits, but 200-line cap hits first)
67
+ 1,000 observations = 50,000 tokens (33% of available budget)
68
+ 5,000 observations = 250,000 tokens (EXCEEDS total context window)
69
+
70
+ agentmemory top-10 results:
71
+ Any corpus size = ~1,924 tokens (1.3% of budget)
72
+ ```
73
+
74
+ ## 4. What Built-in Memory Cannot Do
75
+
76
+ | Capability | Built-in (CLAUDE.md) | agentmemory |
77
+ |-----------|---------------------|-------------|
78
+ | Semantic search | No (keyword grep only) | BM25 + vector + graph |
79
+ | Scale beyond 200 lines | No (hard cap) | Unlimited |
80
+ | Cross-session recall | Only if in 200-line window | Full corpus search |
81
+ | Cross-agent sharing | No (per-agent files) | MCP + REST API |
82
+ | Multi-agent coordination | No | Leases, signals, actions |
83
+ | Temporal queries | No | Point-in-time graph |
84
+ | Memory lifecycle | No (manual pruning) | Ebbinghaus decay + eviction |
85
+ | Knowledge graph | No | Entity extraction + traversal |
86
+ | Query expansion | No | LLM-generated reformulations |
87
+ | Retention scoring | No | Time-frequency decay model |
88
+ | Real-time dashboard | No (read files manually) | Viewer on :3113 |
89
+ | Concurrent access | No (file lock) | Keyed mutex + KV store |
90
+
91
+ ## 5. When to Use What
92
+
93
+ **Use built-in memory (CLAUDE.md) when:**
94
+ - You have < 200 items to remember
95
+ - Single agent, single project
96
+ - Preferences and quick facts only
97
+ - Zero setup is the priority
98
+
99
+ **Use agentmemory when:**
100
+ - Project history exceeds 200 observations
101
+ - You need to recall specific incidents from weeks ago
102
+ - Multiple agents work on the same codebase
103
+ - You want semantic search ("how does auth work?") not just keyword matching
104
+ - You need to track memory quality, decay, and lifecycle
105
+ - You want a shared memory layer across Claude Code, Cursor, Windsurf, etc.
106
+
107
+ Built-in memory is your sticky notes. agentmemory is the searchable database behind them.
108
+
109
+ ---
110
+ *Scale tests: 5 corpus sizes. Cross-session tests: 12 queries targeting specific past sessions.*
@@ -0,0 +1,293 @@
1
+ import type { CompressedObservation } from "../src/types.js";
2
+
3
/**
 * A benchmark query together with its ground-truth answer set.
 */
export interface LabeledQuery {
  /** Search text submitted to the retrieval system under test. */
  query: string;
  /** IDs of the observations that count as relevant hits for this query. */
  relevantObsIds: string[];
  /** Human-readable note on what the query is expected to surface. */
  description: string;
  /** Kind of retrieval the query is meant to exercise. */
  category:
    | "exact"
    | "semantic"
    | "temporal"
    | "cross-session"
    | "entity";
}
9
+
10
+ const SESSION_COUNT = 30;
11
+ const OBS_PER_SESSION = 8;
12
+
13
/**
 * Builds the ISO-8601 timestamp for the instant `daysAgo` days before now.
 *
 * @param daysAgo - Offset in days subtracted from the current time; fractional
 *   values are honoured (the offset is converted to milliseconds).
 * @returns The instant formatted with `Date.prototype.toISOString`.
 */
function ts(daysAgo: number): string {
  const msPerDay = 86400000;
  const instant = new Date(Date.now() - daysAgo * msPerDay);
  return instant.toISOString();
}
16
+
17
/**
 * Hand-written fixture templates for the synthetic benchmark corpus.
 *
 * Each entry describes one group of consecutive sessions:
 * - `sessionRange`: inclusive [first, last] session indices covered by the group.
 * - `daysAgoRange`: [age of the first session, age of the last session] in days;
 *   generateDataset interpolates linearly for the sessions in between.
 * - `project`: project label for the group.
 * - `observations`: observation templates without `id`, `sessionId`, or
 *   `timestamp`; generateDataset fills those in when it expands the group.
 */
+ const RAW_SESSIONS: Array<{
18
+ sessionRange: [number, number];
19
+ daysAgoRange: [number, number];
20
+ project: string;
21
+ observations: Array<Omit<CompressedObservation, "id" | "sessionId" | "timestamp">>;
22
+ }> = [
23
// Sessions 0-4: project scaffolding and developer tooling.
+ {
24
+ sessionRange: [0, 4],
25
+ daysAgoRange: [28, 25],
26
+ project: "webapp",
27
+ observations: [
28
+ { type: "command_run", title: "Initialize Next.js 15 project", subtitle: "create-next-app", facts: ["Created Next.js 15 app with App Router", "TypeScript template selected", "Tailwind CSS v4 configured"], narrative: "Initialized a new Next.js 15 project using create-next-app with TypeScript and Tailwind CSS. Selected the App Router layout.", concepts: ["nextjs", "typescript", "tailwind", "app-router"], files: ["package.json", "tsconfig.json", "tailwind.config.ts"], importance: 6 },
29
+ { type: "file_edit", title: "Configure ESLint with flat config", subtitle: "eslint.config.mjs", facts: ["Migrated to ESLint flat config format", "Added typescript-eslint plugin", "Configured import sorting rules"], narrative: "Set up ESLint using the new flat config format (eslint.config.mjs). Added typescript-eslint for type-aware linting and configured import sorting with eslint-plugin-import.", concepts: ["eslint", "linting", "code-quality", "typescript"], files: ["eslint.config.mjs", "package.json"], importance: 5 },
30
+ { type: "file_edit", title: "Set up Prettier with Tailwind plugin", subtitle: "Formatting", facts: ["Installed prettier and prettier-plugin-tailwindcss", "Added .prettierrc with semi: false, singleQuote: true", "Configured format-on-save in VS Code settings"], narrative: "Configured Prettier for automatic code formatting. Added the Tailwind CSS class sorting plugin. Set up VS Code to format on save.", concepts: ["prettier", "formatting", "tailwind", "developer-experience"], files: [".prettierrc", ".vscode/settings.json"], importance: 4 },
31
+ { type: "file_edit", title: "Create shared UI component library", subtitle: "Components", facts: ["Created Button, Input, Card, Badge components", "Used cva (class-variance-authority) for variant styling", "Added Radix UI primitives for accessibility"], narrative: "Built a shared component library with Button, Input, Card, and Badge components. Used class-variance-authority (cva) for type-safe variant styling and Radix UI primitives for keyboard navigation and screen reader support.", concepts: ["components", "ui-library", "radix-ui", "cva", "accessibility"], files: ["src/components/ui/button.tsx", "src/components/ui/input.tsx", "src/components/ui/card.tsx"], importance: 7 },
32
+ { type: "file_edit", title: "Add global layout with navigation", subtitle: "Layout", facts: ["Created root layout with metadata", "Added responsive navigation bar", "Implemented mobile hamburger menu"], narrative: "Created the root layout component with SEO metadata, Open Graph tags, and a responsive navigation bar that collapses into a hamburger menu on mobile devices.", concepts: ["layout", "navigation", "responsive-design", "seo"], files: ["src/app/layout.tsx", "src/components/nav.tsx"], importance: 6 },
33
+ { type: "file_edit", title: "Configure path aliases and absolute imports", subtitle: "tsconfig", facts: ["Added @ alias pointing to src/", "Configured baseUrl for absolute imports"], narrative: "Set up TypeScript path aliases so imports can use @/components instead of relative paths. Configured baseUrl in tsconfig.json.", concepts: ["typescript", "path-aliases", "developer-experience"], files: ["tsconfig.json"], importance: 3 },
34
+ { type: "command_run", title: "Add Vitest for unit testing", subtitle: "Testing setup", facts: ["Installed vitest and @testing-library/react", "Created vitest.config.ts with jsdom environment", "Added test script to package.json"], narrative: "Set up Vitest as the unit testing framework with React Testing Library for component tests. Configured jsdom environment for DOM testing.", concepts: ["vitest", "testing", "react-testing-library", "configuration"], files: ["vitest.config.ts", "package.json"], importance: 5 },
35
+ { type: "file_edit", title: "Set up Husky pre-commit hooks", subtitle: "Git hooks", facts: ["Installed husky and lint-staged", "Pre-commit runs ESLint and Prettier", "Added commitlint for conventional commits"], narrative: "Configured Husky git hooks with lint-staged to run ESLint and Prettier on staged files before each commit. Added commitlint to enforce conventional commit message format.", concepts: ["husky", "git-hooks", "lint-staged", "commitlint", "ci"], files: [".husky/pre-commit", ".lintstagedrc", "commitlint.config.js"], importance: 4 },
36
+ ],
37
+ },
38
// Sessions 5-9: authentication and security.
+ {
39
+ sessionRange: [5, 9],
40
+ daysAgoRange: [24, 20],
41
+ project: "webapp",
42
+ observations: [
43
+ { type: "file_edit", title: "Implement NextAuth.js v5 authentication", subtitle: "Auth setup", facts: ["Configured NextAuth.js v5 with Auth.js", "Added GitHub and Google OAuth providers", "Set up JWT session strategy with 30-day expiry"], narrative: "Implemented authentication using NextAuth.js v5 (Auth.js). Configured GitHub and Google as OAuth providers. Using JWT-based sessions with 30-day expiry instead of database sessions for simplicity.", concepts: ["nextauth", "authentication", "oauth", "jwt", "github", "google"], files: ["src/auth.ts", "src/app/api/auth/[...nextauth]/route.ts", ".env.local"], importance: 9 },
44
+ { type: "file_edit", title: "Create login and signup pages", subtitle: "Auth UI", facts: ["Built login page with OAuth buttons", "Added email/password form with validation", "Implemented error toast notifications"], narrative: "Created the login page with GitHub and Google OAuth sign-in buttons plus an email/password form. Used react-hook-form with zod validation. Added toast notifications for login errors.", concepts: ["login", "signup", "oauth", "form-validation", "react-hook-form", "zod"], files: ["src/app/login/page.tsx", "src/app/signup/page.tsx"], importance: 7 },
45
+ { type: "file_edit", title: "Add middleware for route protection", subtitle: "Auth middleware", facts: ["Created middleware.ts to protect /dashboard routes", "Redirects unauthenticated users to /login", "Allows public access to /api/webhooks"], narrative: "Added Next.js middleware that checks for valid sessions on protected routes (/dashboard/*). Unauthenticated users are redirected to /login. The /api/webhooks path is excluded from auth checks for third-party integrations.", concepts: ["middleware", "route-protection", "authentication", "security"], files: ["src/middleware.ts"], importance: 8 },
46
+ { type: "file_edit", title: "Implement role-based access control", subtitle: "RBAC", facts: ["Added user roles: admin, editor, viewer", "Created withAuth HOC for role checking", "Stored roles in JWT custom claims"], narrative: "Implemented role-based access control with three roles: admin, editor, and viewer. Created a withAuth higher-order component that checks user roles before rendering protected components. Roles are stored as custom claims in the JWT token.", concepts: ["rbac", "authorization", "roles", "jwt-claims", "security"], files: ["src/lib/auth/rbac.ts", "src/lib/auth/with-auth.tsx"], importance: 8 },
47
+ { type: "file_edit", title: "Add password hashing with bcrypt", subtitle: "Security", facts: ["Using bcrypt with cost factor 12", "Added password strength validation (min 8 chars, mixed case, number)", "Implemented rate limiting on login endpoint (5 attempts per 15 min)"], narrative: "Added bcrypt password hashing with cost factor 12 for the email/password authentication flow. Implemented password strength validation requiring minimum 8 characters with mixed case and numbers. Added rate limiting on the login API endpoint: 5 attempts per 15-minute window per IP.", concepts: ["bcrypt", "password-hashing", "rate-limiting", "security", "validation"], files: ["src/lib/auth/password.ts", "src/app/api/auth/login/route.ts"], importance: 9 },
48
+ { type: "file_edit", title: "Create user profile settings page", subtitle: "User settings", facts: ["Profile page shows avatar, name, email", "Added avatar upload with S3 presigned URLs", "Implemented account deletion flow"], narrative: "Built the user profile settings page showing avatar, name, and email. Added avatar upload using S3 presigned URLs for direct browser-to-S3 uploads. Implemented a full account deletion flow with email confirmation.", concepts: ["user-profile", "settings", "s3", "file-upload", "account-deletion"], files: ["src/app/dashboard/settings/page.tsx", "src/app/api/upload/route.ts"], importance: 6 },
49
+ { type: "command_run", title: "Debug OAuth callback URL mismatch", subtitle: "Auth debugging", facts: ["GitHub OAuth callback failed with redirect_uri_mismatch", "Fixed: NEXTAUTH_URL was set to http:// but app served on https://", "Lesson: always use HTTPS in production OAuth callback URLs"], narrative: "Spent time debugging why GitHub OAuth login failed in production. The error was redirect_uri_mismatch. Root cause: NEXTAUTH_URL environment variable was set to http://localhost:3000 in production instead of the HTTPS production URL. Fixed by updating the environment variable.", concepts: ["oauth-debugging", "github", "callback-url", "environment-variables", "production"], files: [".env.production"], importance: 7 },
50
+ { type: "file_edit", title: "Add CSRF protection to API routes", subtitle: "Security", facts: ["Implemented double-submit cookie pattern", "Added CSRF token generation in layout", "Validated CSRF token on all POST/PUT/DELETE requests"], narrative: "Added CSRF protection using the double-submit cookie pattern. A CSRF token is generated on page load and stored in both a cookie and a hidden form field. All mutating API requests (POST, PUT, DELETE) validate the token.", concepts: ["csrf", "security", "cookies", "api-protection"], files: ["src/lib/csrf.ts", "src/middleware.ts"], importance: 8 },
51
+ ],
52
+ },
53
// Sessions 10-14: database, search, and caching.
+ {
54
+ sessionRange: [10, 14],
55
+ daysAgoRange: [19, 15],
56
+ project: "webapp",
57
+ observations: [
58
+ { type: "file_edit", title: "Set up Prisma ORM with PostgreSQL", subtitle: "Database", facts: ["Initialized Prisma with PostgreSQL provider", "Created User, Post, Comment, Tag models", "Generated migrations with prisma migrate dev"], narrative: "Set up Prisma ORM connecting to a PostgreSQL database. Defined the initial schema with User, Post, Comment, and Tag models including many-to-many relationships between Post and Tag.", concepts: ["prisma", "postgresql", "database", "orm", "schema", "migrations"], files: ["prisma/schema.prisma", "src/lib/db.ts"], importance: 9 },
59
+ { type: "file_edit", title: "Create database seed script", subtitle: "Seeding", facts: ["Created seed.ts with faker-generated data", "Seeds 10 users, 50 posts, 200 comments", "Runs via prisma db seed command"], narrative: "Built a database seed script using faker.js to generate realistic test data. Creates 10 users with posts, comments, and tags. Configured to run automatically on prisma db seed.", concepts: ["database", "seeding", "faker", "test-data", "prisma"], files: ["prisma/seed.ts", "package.json"], importance: 5 },
60
+ { type: "file_edit", title: "Implement server actions for CRUD operations", subtitle: "Data layer", facts: ["Created server actions for post CRUD", "Used Prisma transactions for multi-step operations", "Added revalidatePath after mutations"], narrative: "Implemented Next.js server actions for post create, read, update, and delete operations. Used Prisma transactions for operations that modify multiple tables. Called revalidatePath after mutations to refresh cached data.", concepts: ["server-actions", "crud", "prisma", "transactions", "revalidation", "caching"], files: ["src/app/actions/posts.ts"], importance: 8 },
61
+ { type: "command_run", title: "Fix N+1 query in post listing", subtitle: "Performance", facts: ["Identified N+1 query loading post authors individually", "Fixed with Prisma include for eager loading", "Query count dropped from 52 to 3"], narrative: "Discovered an N+1 query problem on the post listing page — each post was triggering a separate query to load its author. Fixed by using Prisma's include option for eager loading. Total query count dropped from 52 to 3.", concepts: ["n+1", "performance", "prisma", "eager-loading", "query-optimization"], files: ["src/app/actions/posts.ts"], importance: 8 },
62
+ { type: "file_edit", title: "Add full-text search with PostgreSQL tsvector", subtitle: "Search", facts: ["Created tsvector column on posts table", "Built GIN index for fast text search", "Implemented search API with ts_rank scoring"], narrative: "Added full-text search using PostgreSQL's built-in tsvector functionality. Created a generated tsvector column combining title and body, with a GIN index. The search API uses ts_rank for relevance scoring and supports phrase matching.", concepts: ["full-text-search", "postgresql", "tsvector", "gin-index", "search"], files: ["prisma/migrations/20260301_add_search.sql", "src/app/api/search/route.ts"], importance: 7 },
63
+ { type: "file_edit", title: "Set up connection pooling with PgBouncer", subtitle: "Database infra", facts: ["Deployed PgBouncer in transaction pooling mode", "Configured max 25 client connections, 10 server connections", "Added DATABASE_URL_DIRECT for migrations (bypasses pooler)"], narrative: "Deployed PgBouncer as a connection pooler for PostgreSQL. Using transaction pooling mode to maximize connection reuse. Configured separate DATABASE_URL for application use (through pooler) and DATABASE_URL_DIRECT for migrations.", concepts: ["pgbouncer", "connection-pooling", "postgresql", "infrastructure"], files: ["docker-compose.yml", ".env"], importance: 7 },
64
+ { type: "command_run", title: "Debug Prisma migration drift", subtitle: "Database debugging", facts: ["prisma migrate deploy failed with drift detected", "Cause: manual SQL ALTER was run directly on production", "Resolution: ran prisma migrate resolve to mark migration as applied"], narrative: "Production deployment failed because Prisma detected schema drift — someone had run a manual ALTER TABLE directly on the production database. Resolved by using prisma migrate resolve to mark the conflicting migration as already applied.", concepts: ["prisma", "migration-drift", "database", "production", "debugging"], files: ["prisma/schema.prisma"], importance: 7 },
65
+ { type: "file_edit", title: "Add Redis caching layer for expensive queries", subtitle: "Caching", facts: ["Used ioredis with 60-second TTL for post listings", "Implemented cache-aside pattern", "Added cache invalidation on post mutations"], narrative: "Added a Redis caching layer for expensive database queries. Post listings are cached for 60 seconds using a cache-aside pattern. Cache entries are invalidated when posts are created, updated, or deleted.", concepts: ["redis", "caching", "cache-aside", "ioredis", "performance"], files: ["src/lib/cache.ts", "src/app/actions/posts.ts"], importance: 7 },
66
+ ],
67
+ },
68
// Sessions 15-19: REST API design and observability.
+ {
69
+ sessionRange: [15, 19],
70
+ daysAgoRange: [14, 10],
71
+ project: "webapp",
72
+ observations: [
73
+ { type: "file_edit", title: "Build REST API with input validation", subtitle: "API", facts: ["Created /api/v1/posts, /api/v1/users endpoints", "Used zod for request body validation", "Added consistent error response format with error codes"], narrative: "Built a versioned REST API under /api/v1/ with endpoints for posts and users. All request bodies are validated with zod schemas. Errors follow a consistent format with error codes, messages, and field-level details.", concepts: ["rest-api", "zod", "validation", "error-handling", "api-design"], files: ["src/app/api/v1/posts/route.ts", "src/app/api/v1/users/route.ts", "src/lib/api/errors.ts"], importance: 8 },
74
+ { type: "file_edit", title: "Implement cursor-based pagination", subtitle: "API pagination", facts: ["Replaced offset pagination with cursor-based approach", "Uses Prisma cursor with opaque base64-encoded cursors", "Returns hasNextPage and endCursor in response"], narrative: "Switched from offset-based to cursor-based pagination for the post listing API. Cursors are base64-encoded Prisma record IDs. Response includes hasNextPage boolean and endCursor for the client to request the next page.", concepts: ["pagination", "cursor-based", "prisma", "api-design", "performance"], files: ["src/app/api/v1/posts/route.ts", "src/lib/api/pagination.ts"], importance: 7 },
75
+ { type: "file_edit", title: "Add API rate limiting with Upstash Redis", subtitle: "Rate limiting", facts: ["Used @upstash/ratelimit with sliding window algorithm", "10 requests per 10 seconds per API key", "Returns X-RateLimit-Remaining header"], narrative: "Implemented API rate limiting using Upstash Redis with a sliding window algorithm. Each API key is limited to 10 requests per 10-second window. Rate limit status is communicated via standard X-RateLimit-* headers.", concepts: ["rate-limiting", "upstash", "redis", "api-security", "sliding-window"], files: ["src/middleware.ts", "src/lib/rate-limit.ts"], importance: 8 },
76
+ { type: "file_edit", title: "Create webhook system for external integrations", subtitle: "Webhooks", facts: ["Built webhook registration and delivery system", "Events: post.created, post.updated, user.signup", "Implemented retry with exponential backoff (max 3 retries)"], narrative: "Created a webhook system allowing external services to subscribe to events. Supports post.created, post.updated, and user.signup events. Webhook deliveries use exponential backoff with up to 3 retries on failure.", concepts: ["webhooks", "events", "integrations", "retry", "exponential-backoff"], files: ["src/lib/webhooks.ts", "src/app/api/v1/webhooks/route.ts"], importance: 7 },
77
+ { type: "file_edit", title: "Add OpenAPI specification with Swagger UI", subtitle: "API docs", facts: ["Generated OpenAPI 3.1 spec from zod schemas", "Added Swagger UI at /api/docs", "Included request/response examples"], narrative: "Generated an OpenAPI 3.1 specification from the existing zod validation schemas. Added Swagger UI accessible at /api/docs for interactive API documentation with request/response examples.", concepts: ["openapi", "swagger", "api-documentation", "zod"], files: ["src/app/api/docs/route.ts", "src/lib/openapi.ts"], importance: 5 },
78
+ { type: "command_run", title: "Debug 504 gateway timeout on large queries", subtitle: "Performance debugging", facts: ["Large post queries timing out after 30 seconds on Vercel", "Root cause: missing database index on posts.authorId", "Added composite index (authorId, createdAt DESC), query dropped to 50ms"], narrative: "Investigated 504 Gateway Timeout errors on the post listing endpoint in production (Vercel). Found that large queries filtering by author were doing a full table scan. Added a composite index on (authorId, createdAt DESC) which reduced query time from 30+ seconds to 50ms.", concepts: ["performance", "timeout", "database-index", "postgresql", "vercel", "debugging"], files: ["prisma/migrations/20260310_add_author_index.sql"], importance: 9 },
79
+ { type: "file_edit", title: "Implement API versioning strategy", subtitle: "API design", facts: ["URL-based versioning: /api/v1/, /api/v2/", "v1 deprecated with Sunset header", "Migration guide in API docs"], narrative: "Established an API versioning strategy using URL-based versioning (/api/v1/, /api/v2/). The v1 API returns a Sunset header indicating its deprecation date. Added a migration guide to the API documentation.", concepts: ["api-versioning", "deprecation", "sunset-header", "backward-compatibility"], files: ["src/app/api/v2/posts/route.ts", "src/lib/api/versioning.ts"], importance: 6 },
80
+ { type: "file_edit", title: "Add request logging with structured JSON", subtitle: "Observability", facts: ["Used pino for structured JSON logging", "Logs request method, path, status, duration, user ID", "Configured log levels per environment"], narrative: "Added structured JSON request logging using pino. Each request logs method, path, response status, duration in milliseconds, and authenticated user ID. Log levels are configured per environment (debug in dev, info in production).", concepts: ["logging", "pino", "observability", "structured-logging", "monitoring"], files: ["src/lib/logger.ts", "src/middleware.ts"], importance: 6 },
81
+ ],
82
+ },
83
// Sessions 20-24: testing.
+ {
84
+ sessionRange: [20, 24],
85
+ daysAgoRange: [9, 5],
86
+ project: "webapp",
87
+ observations: [
88
+ { type: "file_edit", title: "Write unit tests for auth module", subtitle: "Testing", facts: ["25 test cases covering login, signup, role checking", "Mocked Prisma client with vitest", "Achieved 92% coverage on auth module"], narrative: "Wrote comprehensive unit tests for the authentication module. 25 test cases covering login flow, signup validation, role-based access checks, and password hashing. Mocked the Prisma client using vitest's vi.mock. Achieved 92% code coverage.", concepts: ["unit-testing", "vitest", "mocking", "authentication", "coverage"], files: ["tests/unit/auth.test.ts", "tests/unit/rbac.test.ts"], importance: 7 },
89
+ { type: "file_edit", title: "Add E2E tests with Playwright", subtitle: "E2E testing", facts: ["Configured Playwright with Chrome and Firefox", "Tests: login flow, post CRUD, search, pagination", "Set up test database with Docker for isolation"], narrative: "Set up Playwright for end-to-end testing with Chrome and Firefox browsers. Created E2E tests for the complete login flow, post CRUD operations, search functionality, and pagination. Each test run gets a fresh database via Docker containers.", concepts: ["playwright", "e2e-testing", "docker", "test-isolation", "browser-testing"], files: ["playwright.config.ts", "tests/e2e/auth.spec.ts", "tests/e2e/posts.spec.ts", "docker-compose.test.yml"], importance: 8 },
90
+ { type: "command_run", title: "Fix flaky Playwright test on CI", subtitle: "CI debugging", facts: ["Test passed locally but failed in GitHub Actions", "Root cause: missing waitForNavigation after form submit", "Fixed by using page.waitForURL instead of waitForNavigation"], narrative: "Debugged a flaky Playwright test that passed locally but failed intermittently in GitHub Actions CI. The issue was a race condition after form submission — the test was checking the URL before navigation completed. Fixed by replacing the deprecated waitForNavigation with page.waitForURL.", concepts: ["playwright", "flaky-test", "ci", "github-actions", "debugging", "race-condition"], files: ["tests/e2e/auth.spec.ts"], importance: 6 },
91
+ { type: "file_edit", title: "Add API integration tests with supertest", subtitle: "API testing", facts: ["30 test cases for REST API endpoints", "Tests validation, auth, error responses, pagination", "Uses test database with transaction rollback"], narrative: "Created API integration tests using supertest. 30 test cases covering request validation, authentication requirements, error response formats, and cursor-based pagination. Each test runs in a database transaction that rolls back after completion.", concepts: ["integration-testing", "supertest", "api-testing", "transactions", "test-isolation"], files: ["tests/integration/api.test.ts"], importance: 7 },
92
+ { type: "file_edit", title: "Set up test coverage reporting with codecov", subtitle: "Coverage", facts: ["Configured vitest coverage with v8 provider", "Minimum coverage thresholds: 80% branches, 85% lines", "Upload to Codecov in CI pipeline"], narrative: "Configured vitest code coverage using the v8 provider. Set minimum coverage thresholds at 80% for branches and 85% for lines. Coverage reports are uploaded to Codecov as part of the GitHub Actions CI pipeline.", concepts: ["code-coverage", "codecov", "vitest", "ci", "quality-gates"], files: ["vitest.config.ts", ".github/workflows/ci.yml"], importance: 5 },
93
+ { type: "file_edit", title: "Create test fixtures and factories", subtitle: "Test infrastructure", facts: ["Built factory functions for User, Post, Comment, Tag", "Uses faker for realistic data generation", "Supports partial overrides for specific test scenarios"], narrative: "Created test factory functions for all main models (User, Post, Comment, Tag). Factories use faker.js for realistic data and support partial overrides so individual tests can customize specific fields.", concepts: ["test-factories", "faker", "testing-infrastructure", "fixtures"], files: ["tests/fixtures/factories.ts"], importance: 5 },
94
+ { type: "command_run", title: "Debug memory leak in test suite", subtitle: "Test debugging", facts: ["Tests consuming 2GB+ RAM after 100+ test files", "Root cause: Prisma client not disconnected in afterAll", "Fixed by adding global teardown that calls prisma.$disconnect()"], narrative: "Investigated why the test suite was consuming over 2GB of RAM. The Prisma client was creating new connections in each test file but never disconnecting. Fixed by adding a global teardown hook that calls prisma.$disconnect().", concepts: ["memory-leak", "testing", "prisma", "debugging", "resource-management"], files: ["vitest.config.ts", "tests/setup.ts"], importance: 7 },
95
+ { type: "file_edit", title: "Add snapshot testing for API responses", subtitle: "Snapshot tests", facts: ["Added toMatchSnapshot for API response shapes", "Snapshot updates require --update flag", "Catches unintended breaking changes in API responses"], narrative: "Added snapshot testing for API response shapes to catch unintended breaking changes. Response bodies are compared against stored snapshots. Snapshots must be explicitly updated with the --update flag when intentional changes are made.", concepts: ["snapshot-testing", "api-testing", "regression-testing", "vitest"], files: ["tests/integration/api.test.ts", "tests/integration/__snapshots__/"], importance: 4 },
96
+ ],
97
+ },
98
// Sessions 25-29: containerization, deployment, infrastructure, and monitoring.
+ {
99
+ sessionRange: [25, 29],
100
+ daysAgoRange: [4, 0],
101
+ project: "webapp",
102
+ observations: [
103
+ { type: "file_edit", title: "Create multi-stage Dockerfile", subtitle: "Docker", facts: ["Multi-stage build: deps → build → production", "Final image size 180MB (down from 1.2GB)", "Runs as non-root user with UID 1001"], narrative: "Created a multi-stage Dockerfile for the Next.js application. Stage 1 installs dependencies, stage 2 builds the app, stage 3 copies only production artifacts. Final image is 180MB (down from 1.2GB). Application runs as a non-root user for security.", concepts: ["docker", "multi-stage-build", "containerization", "security", "image-optimization"], files: ["Dockerfile", ".dockerignore"], importance: 7 },
104
+ { type: "file_edit", title: "Set up GitHub Actions CI/CD pipeline", subtitle: "CI/CD", facts: ["Matrix build: Node 18 and 20", "Jobs: lint, test, build, deploy", "Auto-deploy to Vercel on main branch push"], narrative: "Created a comprehensive GitHub Actions CI/CD pipeline with matrix builds for Node 18 and 20. Pipeline runs lint, test (with coverage), build, and deploy jobs. Merges to main automatically trigger Vercel deployment.", concepts: ["github-actions", "ci-cd", "deployment", "vercel", "automation"], files: [".github/workflows/ci.yml", ".github/workflows/deploy.yml"], importance: 8 },
105
+ { type: "file_edit", title: "Configure Kubernetes deployment manifests", subtitle: "K8s", facts: ["Created Deployment, Service, Ingress, HPA resources", "HPA: min 2, max 10 replicas, CPU target 70%", "Health checks: liveness on /healthz, readiness on /readyz"], narrative: "Created Kubernetes deployment manifests including Deployment, Service, Ingress, and HorizontalPodAutoscaler. HPA scales between 2 and 10 replicas targeting 70% CPU utilization. Added liveness and readiness probes for health monitoring.", concepts: ["kubernetes", "deployment", "hpa", "autoscaling", "health-checks", "ingress"], files: ["k8s/deployment.yaml", "k8s/service.yaml", "k8s/ingress.yaml", "k8s/hpa.yaml"], importance: 8 },
106
+ { type: "file_edit", title: "Add Terraform for AWS infrastructure", subtitle: "IaC", facts: ["VPC with public/private subnets across 3 AZs", "RDS PostgreSQL with Multi-AZ failover", "ElastiCache Redis cluster with 2 replicas"], narrative: "Created Terraform modules for AWS infrastructure. VPC spans 3 availability zones with public and private subnets. RDS PostgreSQL instance with Multi-AZ failover for high availability. ElastiCache Redis cluster with 2 read replicas.", concepts: ["terraform", "aws", "infrastructure-as-code", "vpc", "rds", "elasticache"], files: ["terraform/main.tf", "terraform/vpc.tf", "terraform/rds.tf", "terraform/redis.tf"], importance: 8 },
107
+ { type: "command_run", title: "Debug Kubernetes pod crash loop", subtitle: "K8s debugging", facts: ["Pods in CrashLoopBackOff status", "Root cause: DATABASE_URL secret not mounted correctly", "Fixed: Secret key name was 'database-url' but env var expected 'DATABASE_URL'"], narrative: "Debugged pods stuck in CrashLoopBackOff. The application was failing to start because the DATABASE_URL environment variable was empty. Root cause: the Kubernetes secret had the key 'database-url' (kebab-case) but the secretKeyRef expected 'DATABASE_URL' (uppercase).", concepts: ["kubernetes", "debugging", "crashloopbackoff", "secrets", "environment-variables"], files: ["k8s/deployment.yaml", "k8s/secrets.yaml"], importance: 8 },
108
+ { type: "file_edit", title: "Set up Datadog monitoring and alerting", subtitle: "Monitoring", facts: ["Deployed Datadog agent as DaemonSet", "Custom metrics: request latency, error rate, DB query time", "Alerts: p99 latency > 500ms, error rate > 1%"], narrative: "Deployed the Datadog monitoring agent as a Kubernetes DaemonSet. Created custom metrics for request latency, error rate, and database query time. Set up alerts that trigger when p99 latency exceeds 500ms or error rate exceeds 1%.", concepts: ["datadog", "monitoring", "alerting", "observability", "kubernetes"], files: ["k8s/datadog-agent.yaml", "src/lib/metrics.ts"], importance: 7 },
109
+ { type: "file_edit", title: "Implement blue-green deployment strategy", subtitle: "Deployment", facts: ["Two identical environments: blue and green", "Health check must pass before traffic switch", "Instant rollback by switching back to previous color"], narrative: "Implemented blue-green deployment strategy. Two identical environments run simultaneously — deploy to the inactive one, run health checks, then switch traffic via Kubernetes service selector update. Rollback is instant by pointing traffic back to the previous color.", concepts: ["blue-green", "deployment-strategy", "zero-downtime", "rollback", "kubernetes"], files: ["k8s/blue-deployment.yaml", "k8s/green-deployment.yaml", "scripts/deploy.sh"], importance: 7 },
110
+ { type: "file_edit", title: "Add Prometheus metrics and Grafana dashboards", subtitle: "Observability", facts: ["Exported custom metrics via /metrics endpoint", "Metrics: http_request_duration, db_query_duration, cache_hit_ratio", "Created Grafana dashboard with request rate, latency, error panels"], narrative: "Added Prometheus metrics export on a /metrics endpoint. Custom metrics include HTTP request duration histogram, database query duration, and cache hit ratio. Created a Grafana dashboard with panels for request rate, latency percentiles, error rate, and cache performance.", concepts: ["prometheus", "grafana", "metrics", "observability", "dashboards"], files: ["src/lib/metrics.ts", "grafana/dashboard.json"], importance: 6 },
111
+ ],
112
+ },
113
+ ];
114
+
115
/**
 * Expands RAW_SESSIONS into a flat observation list, a per-session index of
 * observation ids, and a fixed set of labeled queries.
 *
 * For every session number in a group's `sessionRange` (inclusive) this emits up
 * to OBS_PER_SESSION observations, cycling through the group's raw observations.
 * The `daysAgo` value passed to `ts` is linearly interpolated between the two
 * ends of the group's `daysAgoRange`, and each successive observation within a
 * session is shifted by a further half hour.
 *
 * The relevance labels of each query are derived from the generated
 * observations by matching on their `concepts` tags.
 *
 * @returns the generated observations, the labeled queries, and a map from
 *   session id to the ids of the observations generated for that session.
 */
export function generateDataset(): {
  observations: CompressedObservation[];
  queries: LabeledQuery[];
  sessions: Map<string, string[]>;
} {
  const observations: CompressedObservation[] = [];
  const sessions = new Map<string, string[]>();

  for (const group of RAW_SESSIONS) {
    const [firstSession, lastSession] = group.sessionRange;
    const [daysAgoFrom, daysAgoTo] = group.daysAgoRange;

    // Both values are the same for every session of the group.
    const sessionSpan = Math.max(1, lastSession - firstSession); // never divide by zero
    const perSession = Math.min(group.observations.length, OBS_PER_SESSION);

    for (let s = firstSession; s <= lastSession; s++) {
      const sessionId = `ses_${s.toString().padStart(3, "0")}`;
      const progress = (s - firstSession) / sessionSpan;
      const daysAgo = daysAgoFrom - progress * (daysAgoFrom - daysAgoTo);
      const sessionObsIds: string[] = [];

      for (let slot = 0; slot < perSession; slot++) {
        // Continue through the raw list where the previous session stopped, wrapping around.
        const rawIndex = ((s - firstSession) * perSession + slot) % group.observations.length;
        const template = group.observations[rawIndex];
        const obsId = `obs_${sessionId}_${slot.toString().padStart(2, "0")}`;
        const hoursIntoSession = slot * 0.5;

        observations.push({
          id: obsId,
          sessionId,
          timestamp: ts(daysAgo - hoursIntoSession / 24),
          ...template,
        });
        sessionObsIds.push(obsId);
      }

      sessions.set(sessionId, sessionObsIds);
    }
  }

  // Ids of all generated observations accepted by `matches`, in generation order.
  const idsWhere = (matches: (obs: CompressedObservation) => boolean): string[] =>
    observations.filter(matches).map((obs) => obs.id);

  // Ids of observations tagged with at least one of the given concepts.
  const withAnyConcept = (...wanted: string[]): string[] =>
    idsWhere((obs) => obs.concepts.some((concept) => wanted.includes(concept)));

  // Ids of observations tagged with exactly this concept.
  const withConcept = (concept: string): string[] =>
    idsWhere((obs) => obs.concepts.includes(concept));

  const queries: LabeledQuery[] = [
    {
      query: "How did we set up authentication?",
      relevantObsIds: withAnyConcept("nextauth", "authentication", "oauth", "jwt", "login", "signup"),
      description: "Should find all auth-related observations across sessions 5-9",
      category: "semantic",
    },
    {
      query: "JWT token validation middleware",
      relevantObsIds: idsWhere((obs) => {
        if (obs.concepts.includes("jwt")) return true;
        return obs.concepts.includes("middleware") && obs.concepts.includes("authentication");
      }),
      description: "Exact match on JWT middleware setup",
      category: "exact",
    },
    {
      query: "PostgreSQL connection issues",
      relevantObsIds: withAnyConcept("postgresql", "pgbouncer", "connection-pooling", "database"),
      description: "Should find database connection and pooling observations",
      category: "semantic",
    },
    {
      query: "Playwright test configuration",
      relevantObsIds: withAnyConcept("playwright", "e2e-testing"),
      description: "E2E testing setup with Playwright",
      category: "exact",
    },
    {
      query: "Why did the production deployment fail?",
      relevantObsIds: withAnyConcept("debugging", "production", "crashloopbackoff", "timeout", "migration-drift"),
      description: "Cross-session: find all production debugging incidents",
      category: "cross-session",
    },
    {
      query: "rate limiting implementation",
      relevantObsIds: withConcept("rate-limiting"),
      description: "Rate limiting across auth and API modules",
      category: "exact",
    },
    {
      query: "What security measures did we add?",
      relevantObsIds: withAnyConcept("security", "csrf", "bcrypt", "rate-limiting", "rbac", "password-hashing"),
      description: "Broad semantic: all security-related work",
      category: "semantic",
    },
    {
      query: "database performance optimization",
      relevantObsIds: withAnyConcept("n+1", "query-optimization", "database-index", "performance", "eager-loading", "caching"),
      description: "Performance optimizations across database and caching",
      category: "semantic",
    },
    {
      query: "Kubernetes pod crash debugging",
      relevantObsIds: idsWhere(
        (obs) =>
          obs.concepts.some((concept) => ["crashloopbackoff", "kubernetes"].includes(concept)) &&
          obs.concepts.includes("debugging"),
      ),
      description: "Specific K8s debugging incident",
      category: "entity",
    },
    {
      query: "Docker containerization setup",
      relevantObsIds: withAnyConcept("docker", "multi-stage-build", "containerization", "dockerfile"),
      description: "Docker-related observations",
      category: "entity",
    },
    {
      query: "How does caching work in the app?",
      relevantObsIds: withAnyConcept("redis", "caching", "cache-aside", "ioredis", "elasticache"),
      description: "All caching-related observations",
      category: "semantic",
    },
    {
      query: "test infrastructure and factories",
      relevantObsIds: withAnyConcept("test-factories", "testing-infrastructure", "fixtures", "mocking"),
      description: "Test setup infrastructure",
      category: "exact",
    },
    {
      query: "What happened with the OAuth callback error?",
      relevantObsIds: withAnyConcept("oauth-debugging", "callback-url"),
      description: "Specific debugging incident recall",
      category: "cross-session",
    },
    {
      query: "monitoring and observability setup",
      relevantObsIds: withAnyConcept("datadog", "prometheus", "grafana", "monitoring", "observability", "alerting", "metrics", "logging", "pino"),
      description: "All monitoring/observability observations",
      category: "semantic",
    },
    {
      query: "Prisma ORM configuration",
      relevantObsIds: withConcept("prisma"),
      description: "All Prisma-related observations",
      category: "entity",
    },
    {
      query: "CI/CD pipeline configuration",
      relevantObsIds: withAnyConcept("ci-cd", "github-actions", "deployment", "ci"),
      description: "CI/CD related observations",
      category: "exact",
    },
    {
      query: "memory leak debugging",
      relevantObsIds: withConcept("memory-leak"),
      description: "Memory leak incidents (WebSocket handler, test suite)",
      category: "cross-session",
    },
    {
      query: "API design decisions",
      relevantObsIds: withAnyConcept("rest-api", "api-design", "api-versioning", "pagination", "openapi", "error-handling"),
      description: "API design and architecture decisions",
      category: "semantic",
    },
    {
      query: "zod validation schemas",
      relevantObsIds: withConcept("zod"),
      description: "Where zod is used for validation",
      category: "entity",
    },
    {
      query: "infrastructure as code Terraform",
      relevantObsIds: withAnyConcept("terraform", "infrastructure-as-code", "aws", "vpc", "rds", "elasticache"),
      description: "Terraform/IaC observations",
      category: "entity",
    },
  ];

  return { observations, queries, sessions };
}
276
+
277
/**
 * Produces `count` observations for scale testing by cycling through the
 * observations returned by `generateDataset()`.
 *
 * Each copy keeps the fields of its source observation but gets a fresh
 * sequential id, a session id shared by every run of 8 consecutive copies, a
 * timestamp from `ts` with a random argument in [0, 90), and a title and
 * narrative suffixed with the copy's index.
 *
 * @param count number of observations to generate.
 * @returns the generated observations, in index order.
 */
export function generateScaleDataset(count: number): CompressedObservation[] {
  const templates = generateDataset().observations;
  const scaled: CompressedObservation[] = [];

  for (let n = 0; n < count; n += 1) {
    const template = templates[n % templates.length];
    const sessionGroup = Math.floor(n / 8);

    const id = `obs_scale_${n.toString().padStart(6, "0")}`;
    const sessionId = `ses_${sessionGroup.toString().padStart(4, "0")}`;
    const timestamp = ts(Math.random() * 90);
    const title = `${template.title} (iteration ${n})`;
    const narrative = `${template.narrative} [Scale test variant ${n}, session group ${sessionGroup}]`;

    // Spread first so the generated fields replace the template's own values.
    scaled.push({ ...template, id, sessionId, timestamp, title, narrative });
  }

  return scaled;
}