@agentmemory/agentmemory 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. package/.claude-plugin/marketplace.json +14 -0
  2. package/.github/workflows/ci.yml +22 -0
  3. package/.github/workflows/publish.yml +28 -0
  4. package/AGENTS.md +113 -0
  5. package/LICENSE +190 -0
  6. package/README.md +828 -0
  7. package/assets/banner.png +0 -0
  8. package/assets/demo.gif +0 -0
  9. package/assets/demo.mp4 +0 -0
  10. package/benchmark/QUALITY.md +73 -0
  11. package/benchmark/REAL-EMBEDDINGS.md +67 -0
  12. package/benchmark/SCALE.md +110 -0
  13. package/benchmark/dataset.ts +293 -0
  14. package/benchmark/quality-eval.ts +643 -0
  15. package/benchmark/real-embeddings-eval.ts +405 -0
  16. package/benchmark/scale-eval.ts +398 -0
  17. package/dist/cli.d.mts +1 -0
  18. package/dist/cli.mjs +137 -0
  19. package/dist/cli.mjs.map +1 -0
  20. package/dist/docker-compose.yml +14 -0
  21. package/dist/hooks/notification.d.mts +1 -0
  22. package/dist/hooks/notification.mjs +45 -0
  23. package/dist/hooks/notification.mjs.map +1 -0
  24. package/dist/hooks/post-tool-failure.d.mts +1 -0
  25. package/dist/hooks/post-tool-failure.mjs +45 -0
  26. package/dist/hooks/post-tool-failure.mjs.map +1 -0
  27. package/dist/hooks/post-tool-use.d.mts +1 -0
  28. package/dist/hooks/post-tool-use.mjs +53 -0
  29. package/dist/hooks/post-tool-use.mjs.map +1 -0
  30. package/dist/hooks/pre-compact.d.mts +1 -0
  31. package/dist/hooks/pre-compact.mjs +50 -0
  32. package/dist/hooks/pre-compact.mjs.map +1 -0
  33. package/dist/hooks/pre-tool-use.d.mts +1 -0
  34. package/dist/hooks/pre-tool-use.mjs +69 -0
  35. package/dist/hooks/pre-tool-use.mjs.map +1 -0
  36. package/dist/hooks/prompt-submit.d.mts +1 -0
  37. package/dist/hooks/prompt-submit.mjs +40 -0
  38. package/dist/hooks/prompt-submit.mjs.map +1 -0
  39. package/dist/hooks/session-end.d.mts +1 -0
  40. package/dist/hooks/session-end.mjs +61 -0
  41. package/dist/hooks/session-end.mjs.map +1 -0
  42. package/dist/hooks/session-start.d.mts +1 -0
  43. package/dist/hooks/session-start.mjs +42 -0
  44. package/dist/hooks/session-start.mjs.map +1 -0
  45. package/dist/hooks/stop.d.mts +1 -0
  46. package/dist/hooks/stop.mjs +33 -0
  47. package/dist/hooks/stop.mjs.map +1 -0
  48. package/dist/hooks/subagent-start.d.mts +1 -0
  49. package/dist/hooks/subagent-start.mjs +43 -0
  50. package/dist/hooks/subagent-start.mjs.map +1 -0
  51. package/dist/hooks/subagent-stop.d.mts +1 -0
  52. package/dist/hooks/subagent-stop.mjs +45 -0
  53. package/dist/hooks/subagent-stop.mjs.map +1 -0
  54. package/dist/hooks/task-completed.d.mts +1 -0
  55. package/dist/hooks/task-completed.mjs +46 -0
  56. package/dist/hooks/task-completed.mjs.map +1 -0
  57. package/dist/iii-config.yaml +51 -0
  58. package/dist/index.d.mts +2 -0
  59. package/dist/index.mjs +13776 -0
  60. package/dist/index.mjs.map +1 -0
  61. package/dist/src-QxitMPfJ.mjs +13775 -0
  62. package/dist/src-QxitMPfJ.mjs.map +1 -0
  63. package/dist/standalone.d.mts +1 -0
  64. package/dist/standalone.mjs +1155 -0
  65. package/dist/standalone.mjs.map +1 -0
  66. package/dist/transformers-BX_tgxdO.mjs +38684 -0
  67. package/dist/transformers-BX_tgxdO.mjs.map +1 -0
  68. package/dist/transformers-KMm1i9no.mjs +38683 -0
  69. package/dist/transformers-KMm1i9no.mjs.map +1 -0
  70. package/docker-compose.yml +14 -0
  71. package/iii-config.yaml +51 -0
  72. package/package.json +59 -0
  73. package/plugin/.claude-plugin/plugin.json +10 -0
  74. package/plugin/hooks/hooks.json +77 -0
  75. package/plugin/scripts/diagnostics.mjs +551 -0
  76. package/plugin/scripts/notification.mjs +45 -0
  77. package/plugin/scripts/post-tool-failure.mjs +45 -0
  78. package/plugin/scripts/post-tool-use.mjs +53 -0
  79. package/plugin/scripts/pre-compact.mjs +50 -0
  80. package/plugin/scripts/pre-tool-use.mjs +69 -0
  81. package/plugin/scripts/prompt-submit.mjs +40 -0
  82. package/plugin/scripts/session-end.mjs +61 -0
  83. package/plugin/scripts/session-start.mjs +42 -0
  84. package/plugin/scripts/stop.mjs +33 -0
  85. package/plugin/scripts/subagent-start.mjs +43 -0
  86. package/plugin/scripts/subagent-stop.mjs +45 -0
  87. package/plugin/scripts/task-completed.mjs +46 -0
  88. package/plugin/skills/forget/SKILL.md +32 -0
  89. package/plugin/skills/recall/SKILL.md +18 -0
  90. package/plugin/skills/remember/SKILL.md +25 -0
  91. package/plugin/skills/session-history/SKILL.md +17 -0
  92. package/src/auth.ts +12 -0
  93. package/src/cli.ts +159 -0
  94. package/src/config.ts +221 -0
  95. package/src/eval/metrics-store.ts +65 -0
  96. package/src/eval/quality.ts +51 -0
  97. package/src/eval/schemas.ts +124 -0
  98. package/src/eval/self-correct.ts +28 -0
  99. package/src/eval/validator.ts +31 -0
  100. package/src/functions/actions.ts +288 -0
  101. package/src/functions/audit.ts +61 -0
  102. package/src/functions/auto-forget.ts +169 -0
  103. package/src/functions/branch-aware.ts +169 -0
  104. package/src/functions/cascade.ts +80 -0
  105. package/src/functions/checkpoints.ts +209 -0
  106. package/src/functions/claude-bridge.ts +161 -0
  107. package/src/functions/compress.ts +194 -0
  108. package/src/functions/consolidate.ts +212 -0
  109. package/src/functions/consolidation-pipeline.ts +258 -0
  110. package/src/functions/context.ts +169 -0
  111. package/src/functions/crystallize.ts +293 -0
  112. package/src/functions/dedup.ts +57 -0
  113. package/src/functions/diagnostics.ts +785 -0
  114. package/src/functions/enrich.ts +132 -0
  115. package/src/functions/evict.ts +163 -0
  116. package/src/functions/export-import.ts +508 -0
  117. package/src/functions/facets.ts +248 -0
  118. package/src/functions/file-index.ts +106 -0
  119. package/src/functions/flow-compress.ts +214 -0
  120. package/src/functions/frontier.ts +196 -0
  121. package/src/functions/governance.ts +131 -0
  122. package/src/functions/graph-retrieval.ts +277 -0
  123. package/src/functions/graph.ts +275 -0
  124. package/src/functions/leases.ts +216 -0
  125. package/src/functions/lessons.ts +253 -0
  126. package/src/functions/mesh.ts +434 -0
  127. package/src/functions/migrate.ts +165 -0
  128. package/src/functions/observe.ts +144 -0
  129. package/src/functions/obsidian-export.ts +310 -0
  130. package/src/functions/patterns.ts +138 -0
  131. package/src/functions/privacy.ts +39 -0
  132. package/src/functions/profile.ts +155 -0
  133. package/src/functions/query-expansion.ts +186 -0
  134. package/src/functions/relations.ts +237 -0
  135. package/src/functions/remember.ts +162 -0
  136. package/src/functions/retention.ts +235 -0
  137. package/src/functions/routines.ts +289 -0
  138. package/src/functions/search.ts +80 -0
  139. package/src/functions/sentinels.ts +417 -0
  140. package/src/functions/signals.ts +186 -0
  141. package/src/functions/sketches.ts +274 -0
  142. package/src/functions/sliding-window.ts +257 -0
  143. package/src/functions/smart-search.ts +115 -0
  144. package/src/functions/snapshot.ts +219 -0
  145. package/src/functions/summarize.ts +155 -0
  146. package/src/functions/team.ts +147 -0
  147. package/src/functions/temporal-graph.ts +476 -0
  148. package/src/functions/timeline.ts +138 -0
  149. package/src/functions/verify.ts +117 -0
  150. package/src/health/monitor.ts +110 -0
  151. package/src/health/thresholds.ts +73 -0
  152. package/src/hooks/notification.ts +52 -0
  153. package/src/hooks/post-tool-failure.ts +58 -0
  154. package/src/hooks/post-tool-use.ts +62 -0
  155. package/src/hooks/pre-compact.ts +60 -0
  156. package/src/hooks/pre-tool-use.ts +72 -0
  157. package/src/hooks/prompt-submit.ts +46 -0
  158. package/src/hooks/session-end.ts +71 -0
  159. package/src/hooks/session-start.ts +48 -0
  160. package/src/hooks/stop.ts +39 -0
  161. package/src/hooks/subagent-start.ts +49 -0
  162. package/src/hooks/subagent-stop.ts +54 -0
  163. package/src/hooks/task-completed.ts +54 -0
  164. package/src/index.ts +342 -0
  165. package/src/mcp/in-memory-kv.ts +61 -0
  166. package/src/mcp/server.ts +1455 -0
  167. package/src/mcp/standalone.ts +177 -0
  168. package/src/mcp/tools-registry.ts +769 -0
  169. package/src/mcp/transport.ts +91 -0
  170. package/src/prompts/compression.ts +67 -0
  171. package/src/prompts/consolidation.ts +48 -0
  172. package/src/prompts/graph-extraction.ts +35 -0
  173. package/src/prompts/summary.ts +38 -0
  174. package/src/prompts/xml.ts +26 -0
  175. package/src/providers/agent-sdk.ts +34 -0
  176. package/src/providers/anthropic.ts +35 -0
  177. package/src/providers/circuit-breaker.ts +82 -0
  178. package/src/providers/embedding/cohere.ts +46 -0
  179. package/src/providers/embedding/gemini.ts +54 -0
  180. package/src/providers/embedding/index.ts +39 -0
  181. package/src/providers/embedding/local.ts +52 -0
  182. package/src/providers/embedding/openai.ts +45 -0
  183. package/src/providers/embedding/openrouter.ts +51 -0
  184. package/src/providers/embedding/voyage.ts +46 -0
  185. package/src/providers/fallback-chain.ts +31 -0
  186. package/src/providers/index.ts +84 -0
  187. package/src/providers/openrouter.ts +71 -0
  188. package/src/providers/resilient.ts +37 -0
  189. package/src/state/hybrid-search.ts +295 -0
  190. package/src/state/index-persistence.ts +63 -0
  191. package/src/state/keyed-mutex.ts +18 -0
  192. package/src/state/kv.ts +33 -0
  193. package/src/state/schema.ts +71 -0
  194. package/src/state/search-index.ts +245 -0
  195. package/src/state/stemmer.ts +104 -0
  196. package/src/state/synonyms.ts +63 -0
  197. package/src/state/vector-index.ts +130 -0
  198. package/src/telemetry/setup.ts +116 -0
  199. package/src/triggers/api.ts +1904 -0
  200. package/src/triggers/events.ts +71 -0
  201. package/src/types.ts +769 -0
  202. package/src/version.ts +1 -0
  203. package/src/viewer/index.html +2497 -0
  204. package/src/viewer/server.ts +207 -0
  205. package/src/xenova.d.ts +3 -0
  206. package/test/actions.test.ts +490 -0
  207. package/test/audit.test.ts +108 -0
  208. package/test/auto-forget.test.ts +188 -0
  209. package/test/cascade.test.ts +277 -0
  210. package/test/checkpoints.test.ts +493 -0
  211. package/test/circuit-breaker.test.ts +107 -0
  212. package/test/claude-bridge.test.ts +178 -0
  213. package/test/confidence.test.ts +247 -0
  214. package/test/consistency.test.ts +61 -0
  215. package/test/consolidation-pipeline.test.ts +251 -0
  216. package/test/crystallize.test.ts +521 -0
  217. package/test/diagnostics.test.ts +638 -0
  218. package/test/embedding-provider.test.ts +49 -0
  219. package/test/enrich.test.ts +209 -0
  220. package/test/eval.test.ts +300 -0
  221. package/test/export-import.test.ts +251 -0
  222. package/test/facets.test.ts +448 -0
  223. package/test/fallback-chain.test.ts +93 -0
  224. package/test/frontier.test.ts +485 -0
  225. package/test/governance.test.ts +147 -0
  226. package/test/graph-retrieval.test.ts +186 -0
  227. package/test/graph.test.ts +160 -0
  228. package/test/helpers/mocks.ts +40 -0
  229. package/test/hybrid-search.test.ts +145 -0
  230. package/test/index-persistence.test.ts +124 -0
  231. package/test/integration.test.ts +265 -0
  232. package/test/leases.test.ts +399 -0
  233. package/test/mcp-prompts.test.ts +218 -0
  234. package/test/mcp-resources.test.ts +286 -0
  235. package/test/mcp-standalone.test.ts +113 -0
  236. package/test/mesh.test.ts +700 -0
  237. package/test/privacy.test.ts +87 -0
  238. package/test/profile.test.ts +161 -0
  239. package/test/query-expansion.test.ts +154 -0
  240. package/test/relations.test.ts +198 -0
  241. package/test/retention.test.ts +245 -0
  242. package/test/routines.test.ts +497 -0
  243. package/test/schema-fingerprint.test.ts +81 -0
  244. package/test/schema.test.ts +42 -0
  245. package/test/search-index.test.ts +128 -0
  246. package/test/sentinels.test.ts +626 -0
  247. package/test/signals.test.ts +410 -0
  248. package/test/sketches.test.ts +549 -0
  249. package/test/sliding-window.test.ts +199 -0
  250. package/test/smart-search.test.ts +169 -0
  251. package/test/snapshot.test.ts +165 -0
  252. package/test/team.test.ts +156 -0
  253. package/test/temporal-graph.test.ts +378 -0
  254. package/test/timeline.test.ts +148 -0
  255. package/test/vector-index.test.ts +79 -0
  256. package/test/verify.test.ts +209 -0
  257. package/test/xml.test.ts +65 -0
  258. package/tsconfig.json +22 -0
  259. package/tsdown.config.ts +62 -0
@@ -0,0 +1,405 @@
1
+ import { SearchIndex } from "../src/state/search-index.js";
2
+ import { VectorIndex } from "../src/state/vector-index.js";
3
+ import { HybridSearch } from "../src/state/hybrid-search.js";
4
+ import { LocalEmbeddingProvider } from "../src/providers/embedding/local.js";
5
+ import type { CompressedObservation, EmbeddingProvider } from "../src/types.js";
6
+ import { generateDataset, type LabeledQuery } from "./dataset.js";
7
+ import { writeFileSync } from "node:fs";
8
+
9
/**
 * Builds an in-memory stand-in for the key/value store used by HybridSearch.
 *
 * Values are grouped by scope; each scope is an independent key -> value map.
 * Every method is async to mirror a real store, but resolves immediately.
 */
function mockKV() {
  const scopes = new Map<string, Map<string, unknown>>();

  return {
    /** Resolves to the stored value, or null when the scope/key is absent (or nullish). */
    async get<T>(scope: string, key: string): Promise<T | null> {
      const value = scopes.get(scope)?.get(key) as T;
      return value ?? null;
    },

    /** Stores `data` under scope/key (creating the scope on demand) and echoes it back. */
    async set<T>(scope: string, key: string, data: T): Promise<T> {
      let bucket = scopes.get(scope);
      if (bucket === undefined) {
        bucket = new Map();
        scopes.set(scope, bucket);
      }
      bucket.set(key, data);
      return data;
    },

    /** Removes a key; unknown scopes and keys are silently ignored. */
    async delete(scope: string, key: string): Promise<void> {
      scopes.get(scope)?.delete(key);
    },

    /** Resolves to every value in the scope, in insertion order; empty for unknown scopes. */
    async list<T>(scope: string): Promise<T[]> {
      const bucket = scopes.get(scope);
      if (!bucket) return [];
      return Array.from(bucket.values()) as T[];
    },
  };
}
28
+
29
/**
 * Rough token estimate for a piece of text: one token per four characters,
 * rounded up.
 */
function estimateTokens(text: string): number {
  const charsPerToken = 4;
  const tokens = text.length / charsPerToken;
  return Math.ceil(tokens);
}
32
+
33
/**
 * Flattens an observation into the single string that gets embedded:
 * title, subtitle (empty string when missing), narrative, then every fact
 * and concept, all separated by single spaces.
 */
function obsToText(obs: CompressedObservation): string {
  const { title, subtitle, narrative, facts, concepts } = obs;
  const parts = [title, subtitle || "", narrative];
  parts.push(...facts, ...concepts);
  return parts.join(" ");
}
36
+
37
/**
 * Recall@k: the fraction of relevant ids that appear among the first `k`
 * retrieved ids. A query with no relevant ids counts as perfect recall (1).
 */
function recall(retrieved: string[], relevant: Set<string>, k: number): number {
  if (relevant.size === 0) return 1;

  const window = new Set(retrieved.slice(0, k));
  const found = Array.from(relevant).filter((id) => window.has(id)).length;
  return found / relevant.size;
}
44
+
45
/**
 * Precision@k: the fraction of the first `k` retrieved ids that are relevant.
 * The denominator is the number of ids actually present in the top-k slice
 * (not `k` itself); an empty slice yields 0.
 */
function precision(retrieved: string[], relevant: Set<string>, k: number): number {
  const head = retrieved.slice(0, k);
  if (head.length === 0) return 0;

  const matches = head.reduce((count, id) => (relevant.has(id) ? count + 1 : count), 0);
  return matches / head.length;
}
52
+
53
/**
 * Discounted cumulative gain over the first `k` positions with binary gains:
 * a relevant hit at 0-based rank i contributes 1 / log2(i + 2).
 */
function dcg(relevances: boolean[], k: number): number {
  const cutoff = Math.min(k, relevances.length);
  let total = 0;
  for (let rank = 0; rank < cutoff; rank++) {
    if (!relevances[rank]) continue; // non-relevant positions contribute nothing
    total += 1 / Math.log2(rank + 2);
  }
  return total;
}
59
+
60
/**
 * Normalised DCG@k with binary relevance.
 *
 * The ideal ranking places min(k, |relevant|) relevant items at the top.
 * Returns 0 when that ideal DCG is 0 (no relevant items, or k of 0).
 */
function ndcg(retrieved: string[], relevant: Set<string>, k: number): number {
  const idealHits = Array.from({ length: Math.min(k, relevant.size) }, () => true);
  const best = dcg(idealHits, k);
  if (best === 0) return 0;

  const observedHits = retrieved.slice(0, k).map((id) => relevant.has(id));
  return dcg(observedHits, k) / best;
}
66
+
67
/**
 * Reciprocal rank of the first relevant id in `retrieved`
 * (1 for the first position, 1/2 for the second, ...), or 0 when none is found.
 */
function mrr(retrieved: string[], relevant: Set<string>): number {
  const firstHit = retrieved.findIndex((id) => relevant.has(id));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
72
+
73
/** Arithmetic mean of `nums`; an empty list averages to 0. */
function avg(nums: number[]): number {
  if (nums.length === 0) return 0;

  let total = 0;
  for (const value of nums) total += value;
  return total / nums.length;
}
76
+
77
/** Formats a 0..1 ratio as a percentage string with one decimal, e.g. 0.5 -> "50.0%". */
function pct(n: number): string {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
80
+
81
/** Retrieval metrics recorded for a single labeled query against one system. */
interface QueryResult {
  // --- identity -----------------------------------------------------------
  /** The query text that was searched for. */
  query: string;
  /** Category label carried over from the labeled query. */
  category: string;
  /** Number of observation ids labeled relevant for this query. */
  relevant_count: number;

  // --- ranking quality ----------------------------------------------------
  /** Recall over the top 5 retrieved ids. */
  recall_5: number;
  /** Recall over the top 10 retrieved ids. */
  recall_10: number;
  /** Precision over the top 5 retrieved ids. */
  precision_5: number;
  /** Normalised DCG over the top 10 retrieved ids. */
  ndcg_10: number;
  /** Reciprocal rank of the first relevant id (0 when none was retrieved). */
  mrr_val: number;

  // --- cost ---------------------------------------------------------------
  /** Elapsed time of the search, measured with performance.now(), in milliseconds. */
  latency_ms: number;
}
92
+
93
/** Aggregate outcome of evaluating one retrieval system over the whole query set. */
interface SystemResult {
  /** Display name of the system; the report also uses it to look systems up. */
  name: string;
  /** One entry per evaluated query, in the order the queries were run. */
  results: QueryResult[];
  /** Time spent in the indexing/embedding phase, in milliseconds (0 for the grep baseline). */
  embed_time_ms: number;
  /** Estimated tokens a caller would have to load per query when using this system. */
  tokens_per_query: number;
}
99
+
100
/**
 * Indexes `observations` into a fresh BM25 index (and a vector index when an
 * embedding provider is supplied), then runs every labeled query through
 * HybridSearch and records ranking metrics and latency.
 *
 * @param name         Display name stored on the returned result.
 * @param observations Corpus to index.
 * @param queries      Labeled queries to evaluate.
 * @param provider     Embedding provider, or null to skip the vector index.
 * @param weights      Stream weights passed to HybridSearch.
 * @returns Per-query metrics, the indexing time, and the average estimated
 *          token count of the top-10 results (0 when there are no queries).
 */
async function evalSystem(
  name: string,
  observations: CompressedObservation[],
  queries: LabeledQuery[],
  provider: EmbeddingProvider | null,
  weights: { bm25: number; vector: number; graph: number },
): Promise<SystemResult> {
  const kv = mockKV();
  const bm25 = new SearchIndex();
  const vector = provider ? new VectorIndex() : null;

  console.log(` Indexing ${observations.length} observations...`);
  // NOTE: this timer covers BM25 indexing and KV writes as well as embedding,
  // so `embed_time_ms` is non-zero even when no provider is configured.
  const embedStart = performance.now();

  for (const obs of observations) {
    bm25.add(obs);
    await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs);
  }

  if (provider && vector) {
    const batchSize = 32;
    for (let i = 0; i < observations.length; i += batchSize) {
      const batch = observations.slice(i, i + batchSize);
      const texts = batch.map(o => obsToText(o));
      const embeddings = await provider.embedBatch(texts);
      for (let j = 0; j < batch.length; j++) {
        vector.add(batch[j].id, batch[j].sessionId, embeddings[j]);
      }
      // Progress line: rewritten in place via "\r"; always printed for the final batch.
      if ((i + batchSize) % 100 === 0 || i + batchSize >= observations.length) {
        process.stdout.write(`\r Embedded ${Math.min(i + batchSize, observations.length)}/${observations.length}`);
      }
    }
    console.log("");
  }

  const embedTime = performance.now() - embedStart;

  const hybrid = new HybridSearch(
    bm25,
    vector,
    provider,
    kv as never,
    weights.bm25,
    weights.vector,
    weights.graph,
  );

  console.log(` Running ${queries.length} queries...`);
  const results: QueryResult[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);
    const start = performance.now();
    const searchResults = await hybrid.search(q.query, 20);
    const latency = performance.now() - start;

    const retrieved = searchResults.map(r => r.observation.id);
    results.push({
      query: q.query,
      category: q.category,
      recall_5: recall(retrieved, relevant, 5),
      recall_10: recall(retrieved, relevant, 10),
      precision_5: precision(retrieved, relevant, 5),
      ndcg_10: ndcg(retrieved, relevant, 10),
      mrr_val: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      latency_ms: latency,
    });
  }

  // Second pass: estimate how many tokens the top-10 results cost per query.
  let totalReturnedTokens = 0;
  for (const q of queries) {
    const searchResults = await hybrid.search(q.query, 10);
    totalReturnedTokens += searchResults.reduce(
      (sum, r) => sum + estimateTokens(JSON.stringify(r.observation)),
      0,
    );
  }
  // Guard the division: an empty query set would otherwise produce NaN,
  // which then surfaces as "NaN" in the generated report.
  const avgReturnedTokens =
    queries.length > 0 ? Math.round(totalReturnedTokens / queries.length) : 0;

  return {
    name,
    results,
    embed_time_ms: embedTime,
    tokens_per_query: avgReturnedTokens,
  };
}
187
+
188
/**
 * Baseline that mimics "load everything and grep": each observation is scored
 * by how many query terms (longer than two characters) occur as substrings of
 * its lower-cased title, narrative, concepts and facts. The 20 best-scoring
 * ids are then evaluated with the same metrics as the other systems.
 *
 * `tokens_per_query` is the estimated size of ALL observations rendered as
 * text, since this baseline assumes the whole corpus is loaded every time.
 */
async function evalBuiltinGrep(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemResult> {
  const results: QueryResult[] = [];

  for (const { query, category, relevantObsIds } of queries) {
    const relevant = new Set(relevantObsIds);
    const terms = query.toLowerCase().split(/\W+/).filter((term) => term.length > 2);

    // Only the scan + sort is timed; term extraction above is excluded.
    const startedAt = performance.now();
    const ranked = observations
      .map((obs) => {
        const haystack = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts]
          .join(" ")
          .toLowerCase();
        const score = terms.filter((term) => haystack.includes(term)).length;
        return { id: obs.id, score };
      })
      .filter((entry) => entry.score > 0)
      .sort((a, b) => b.score - a.score);
    const latency = performance.now() - startedAt;

    const retrieved = ranked.slice(0, 20).map((entry) => entry.id);
    results.push({
      query,
      category,
      recall_5: recall(retrieved, relevant, 5),
      recall_10: recall(retrieved, relevant, 10),
      precision_5: precision(retrieved, relevant, 5),
      ndcg_10: ndcg(retrieved, relevant, 10),
      mrr_val: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      latency_ms: latency,
    });
  }

  const corpusAsText = observations
    .map((o) => `## ${o.title}\n${o.narrative}\nConcepts: ${o.concepts.join(", ")}`)
    .join("\n\n");
  const allTokens = estimateTokens(corpusAsText);

  return { name: "Built-in (grep all)", results, embed_time_ms: 0, tokens_per_query: allTokens };
}
229
+
230
/**
 * Renders the benchmark results as a markdown report.
 *
 * Systems are looked up by name ("BM25-only (stemmed+synonyms)", a name
 * containing "Dual-stream", a name containing "grep"); sections that need a
 * system that is missing are skipped. The per-category table prints one
 * column per entry of `systems`, in the order given, under a fixed
 * four-column header.
 *
 * @param systems  Evaluated systems, in display order.
 * @param obsCount Number of observations in the dataset.
 * @returns The full report as a single newline-joined string.
 */
function generateReport(systems: SystemResult[], obsCount: number): string {
  const lines: string[] = [];
  const w = (s: string) => lines.push(s);
  // A literal "|" inside a cell would split the markdown table row; escape it.
  const cell = (s: string) => s.replace(/\|/g, "\\|");
  // All systems are run over the same query set, so the first one gives the count.
  const queryCount = systems[0]?.results.length ?? 0;

  w("# agentmemory v0.6.0 — Real Embeddings Quality Evaluation");
  w("");
  w(`**Date:** ${new Date().toISOString()}`);
  w(`**Platform:** ${process.platform} ${process.arch}, Node ${process.version}`);
  w(`**Dataset:** ${obsCount} observations, 30 sessions, ${queryCount} labeled queries`);
  w(`**Embedding model:** Xenova/all-MiniLM-L6-v2 (384d, local, no API key)`);
  w("");

  w("## Head-to-Head: Real Embeddings vs Keyword Search");
  w("");
  w("| System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Avg Latency | Tokens/query |");
  w("|--------|----------|-----------|-------------|---------|-----|-------------|--------------|");

  for (const s of systems) {
    const r = s.results;
    w(`| ${cell(s.name)} | ${pct(avg(r.map(q => q.recall_5)))} | ${pct(avg(r.map(q => q.recall_10)))} | ${pct(avg(r.map(q => q.precision_5)))} | ${pct(avg(r.map(q => q.ndcg_10)))} | ${pct(avg(r.map(q => q.mrr_val)))} | ${avg(r.map(q => q.latency_ms)).toFixed(2)}ms | ${s.tokens_per_query.toLocaleString()} |`);
  }

  w("");
  w("## Improvement from Real Embeddings");
  w("");

  const bm25Only = systems.find(s => s.name === "BM25-only (stemmed+synonyms)");
  const dual = systems.find(s => s.name.includes("Dual-stream"));
  const builtin = systems.find(s => s.name.includes("grep"));

  if (bm25Only && dual) {
    const recallDelta = avg(dual.results.map(q => q.recall_10)) - avg(bm25Only.results.map(q => q.recall_10));
    w(`Adding real vector embeddings to BM25 improves recall@10 by **${(recallDelta * 100).toFixed(1)} percentage points**.`);
  }
  // Skip the savings line when the baseline is empty: the ratio would be NaN/-Infinity.
  if (builtin && dual && builtin.tokens_per_query > 0) {
    const tokenSaving = (1 - dual.tokens_per_query / builtin.tokens_per_query) * 100;
    w(`Token savings vs loading everything: **${tokenSaving.toFixed(0)}%** (${dual.tokens_per_query.toLocaleString()} vs ${builtin.tokens_per_query.toLocaleString()} tokens).`);
  }

  w("");
  w("## Per-Query: Where Real Embeddings Win");
  w("");

  if (bm25Only && dual) {
    w("Queries where dual-stream (real embeddings) outperforms BM25-only:");
    w("");
    w("| Query | Category | BM25 Recall@10 | +Vector Recall@10 | Delta |");
    w("|-------|----------|---------------|-------------------|-------|");

    // Results are paired by position: both systems ran the same queries in the same order.
    for (let i = 0; i < bm25Only.results.length; i++) {
      const bq = bm25Only.results[i];
      const dq = dual.results[i];
      if (!dq) continue; // no counterpart to compare against
      const delta = dq.recall_10 - bq.recall_10;
      const marker = delta > 0 ? " **" : delta < 0 ? " *" : "";
      // Only rows where the two systems actually differ are listed.
      if (Math.abs(delta) > 0.001) {
        const snippet = `${bq.query.slice(0, 45)}${bq.query.length > 45 ? "..." : ""}`;
        w(`| ${cell(snippet)} | ${cell(bq.category)} | ${pct(bq.recall_10)} | ${pct(dq.recall_10)} | ${delta > 0 ? "+" : ""}${(delta * 100).toFixed(1)}pp${marker} |`);
      }
    }
  }

  w("");
  w("## By Category Comparison");
  w("");
  const categories = ["exact", "semantic", "cross-session", "entity"];

  w("| Category | Built-in grep | BM25 (stemmed) | +Real Vectors | +Graph |");
  w("|----------|--------------|----------------|--------------|--------|");

  for (const cat of categories) {
    const vals = systems.map(s => {
      const qs = s.results.filter(q => q.category === cat);
      return qs.length ? pct(avg(qs.map(q => q.recall_10))) : "-";
    });
    w(`| ${cat} | ${vals.join(" | ")} |`);
  }

  w("");
  w("## Embedding Performance");
  w("");
  w("| System | Embedding Time | Model | Dimensions |");
  w("|--------|---------------|-------|------------|");
  for (const s of systems) {
    // Systems with an indexing time of 100ms or less are left out of this table.
    if (s.embed_time_ms > 100) {
      w(`| ${cell(s.name)} | ${(s.embed_time_ms / 1000).toFixed(1)}s | Xenova/all-MiniLM-L6-v2 | 384 |`);
    }
  }
  w("");
  w("Embedding is a one-time cost at ingestion. Search is sub-millisecond after indexing.");

  w("");
  w("## Key Findings");
  w("");

  if (bm25Only && dual) {
    const semBm25 = bm25Only.results.filter(q => q.category === "semantic");
    const semDual = dual.results.filter(q => q.category === "semantic");
    const semImprove = avg(semDual.map(q => q.recall_10)) - avg(semBm25.map(q => q.recall_10));

    w(`1. **Semantic queries improve most**: ${(semImprove * 100).toFixed(1)}pp recall@10 gain from real embeddings`);
    w(`2. **"database performance optimization"** — the hardest query — goes from BM25 ${pct(bm25Only.results.find(q => q.query.includes("database perf"))?.recall_10 ?? 0)} to vector-augmented ${pct(dual.results.find(q => q.query.includes("database perf"))?.recall_10 ?? 0)}`);
    w(`3. **Entity/exact queries** are already well-served by BM25+stemming — vectors add marginal value`);
    w(`4. **Local embeddings (Xenova)** run without API keys — zero cost, zero latency concerns`);
  }

  w("");
  w("## Recommendation");
  w("");
  w("Enable local embeddings by default (`EMBEDDING_PROVIDER=local` or install `@xenova/transformers`).");
  w("This gives agentmemory genuine semantic search that built-in agent memories cannot match —");
  w("understanding that \"database performance optimization\" relates to \"N+1 query fix\" and \"eager loading\".");
  w("");

  w("---");
  w(`*All measurements use Xenova/all-MiniLM-L6-v2 local embeddings (384 dimensions, no API calls).*`);

  return lines.join("\n");
}
348
+
349
/**
 * Benchmark entry point.
 *
 * Loads the local embedding model (exiting with status 1 if that fails),
 * generates the dataset, evaluates the grep baseline followed by the three
 * HybridSearch configurations, then writes the markdown report to
 * benchmark/REAL-EMBEDDINGS.md and echoes it to stdout.
 */
async function main() {
  console.log("=== agentmemory Real Embeddings Benchmark ===\n");

  console.log("Loading Xenova/all-MiniLM-L6-v2 model (first run downloads ~80MB)...");
  let provider: EmbeddingProvider;
  try {
    provider = new LocalEmbeddingProvider();
    // A throwaway embedding forces the model to load now and reveals its dimensionality.
    const probe = await provider.embed("test");
    console.log(`Model loaded. Dimensions: ${probe.length}\n`);
  } catch (err) {
    console.error("Failed to load Xenova model:", err);
    console.error("Install with: npm install @xenova/transformers");
    process.exit(1);
  }

  const { observations, queries } = generateDataset();
  console.log(`Dataset: ${observations.length} observations, ${queries.length} queries\n`);

  // Prints the mean recall@10 of a finished system, followed by a blank line.
  const logRecall = (system: SystemResult): void => {
    const meanRecall = avg(system.results.map((q) => q.recall_10));
    console.log(` Recall@10: ${pct(meanRecall)}\n`);
  };

  console.log("1. Built-in (grep all)...");
  const builtinResult = await evalBuiltinGrep(observations, queries);
  logRecall(builtinResult);

  // HybridSearch configurations, evaluated strictly in this order.
  const stages: Array<{
    banner: string;
    name: string;
    provider: EmbeddingProvider | null;
    weights: { bm25: number; vector: number; graph: number };
  }> = [
    {
      banner: "2. BM25-only (stemmed+synonyms)...",
      name: "BM25-only (stemmed+synonyms)",
      provider: null,
      weights: { bm25: 1.0, vector: 0, graph: 0 },
    },
    {
      banner: "3. Dual-stream (BM25 + real Xenova vectors)...",
      name: "Dual-stream (BM25+Xenova)",
      provider,
      weights: { bm25: 0.4, vector: 0.6, graph: 0 },
    },
    {
      banner: "4. Triple-stream (BM25 + Xenova + Graph)...",
      name: "Triple-stream (BM25+Xenova+Graph)",
      provider,
      weights: { bm25: 0.4, vector: 0.6, graph: 0.3 },
    },
  ];

  const hybridResults: SystemResult[] = [];
  for (const stage of stages) {
    console.log(stage.banner);
    const outcome = await evalSystem(stage.name, observations, queries, stage.provider, stage.weights);
    logRecall(outcome);
    hybridResults.push(outcome);
  }

  const report = generateReport([builtinResult, ...hybridResults], observations.length);

  writeFileSync("benchmark/REAL-EMBEDDINGS.md", report);
  console.log(report);
  console.log(`\nReport written to benchmark/REAL-EMBEDDINGS.md`);
}
404
+
405
// Run the benchmark. An unhandled failure is logged AND reflected in the exit
// status, so scripts and CI do not mistake a crashed run for a successful one.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});