@agentmemory/agentmemory 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. package/AGENTS.md +2 -2
  2. package/README.md +76 -82
  3. package/dist/cli.mjs +99 -32
  4. package/dist/cli.mjs.map +1 -1
  5. package/dist/index.mjs +4 -2
  6. package/dist/index.mjs.map +1 -1
  7. package/dist/{src-QxitMPfJ.mjs → src-sYZDDbiA.mjs} +5 -3
  8. package/dist/src-sYZDDbiA.mjs.map +1 -0
  9. package/dist/standalone.mjs +1 -1
  10. package/dist/standalone.mjs.map +1 -1
  11. package/package.json +11 -1
  12. package/plugin/.claude-plugin/plugin.json +1 -1
  13. package/plugin/scripts/notification.d.mts +1 -0
  14. package/plugin/scripts/notification.mjs.map +1 -0
  15. package/plugin/scripts/post-tool-failure.d.mts +1 -0
  16. package/plugin/scripts/post-tool-failure.mjs.map +1 -0
  17. package/plugin/scripts/post-tool-use.d.mts +1 -0
  18. package/plugin/scripts/post-tool-use.mjs.map +1 -0
  19. package/plugin/scripts/pre-compact.d.mts +1 -0
  20. package/plugin/scripts/pre-compact.mjs.map +1 -0
  21. package/plugin/scripts/pre-tool-use.d.mts +1 -0
  22. package/plugin/scripts/pre-tool-use.mjs.map +1 -0
  23. package/plugin/scripts/prompt-submit.d.mts +1 -0
  24. package/plugin/scripts/prompt-submit.mjs.map +1 -0
  25. package/plugin/scripts/session-end.d.mts +1 -0
  26. package/plugin/scripts/session-end.mjs.map +1 -0
  27. package/plugin/scripts/session-start.d.mts +1 -0
  28. package/plugin/scripts/session-start.mjs.map +1 -0
  29. package/plugin/scripts/stop.d.mts +1 -0
  30. package/plugin/scripts/stop.mjs.map +1 -0
  31. package/plugin/scripts/subagent-start.d.mts +1 -0
  32. package/plugin/scripts/subagent-start.mjs.map +1 -0
  33. package/plugin/scripts/subagent-stop.d.mts +1 -0
  34. package/plugin/scripts/subagent-stop.mjs.map +1 -0
  35. package/plugin/scripts/task-completed.d.mts +1 -0
  36. package/plugin/scripts/task-completed.mjs.map +1 -0
  37. package/.claude-plugin/marketplace.json +0 -14
  38. package/.github/workflows/ci.yml +0 -22
  39. package/.github/workflows/publish.yml +0 -28
  40. package/assets/banner.png +0 -0
  41. package/assets/demo.gif +0 -0
  42. package/assets/demo.mp4 +0 -0
  43. package/benchmark/QUALITY.md +0 -73
  44. package/benchmark/REAL-EMBEDDINGS.md +0 -67
  45. package/benchmark/SCALE.md +0 -110
  46. package/benchmark/dataset.ts +0 -293
  47. package/benchmark/quality-eval.ts +0 -643
  48. package/benchmark/real-embeddings-eval.ts +0 -405
  49. package/benchmark/scale-eval.ts +0 -398
  50. package/dist/src-QxitMPfJ.mjs.map +0 -1
  51. package/src/auth.ts +0 -12
  52. package/src/cli.ts +0 -159
  53. package/src/config.ts +0 -221
  54. package/src/eval/metrics-store.ts +0 -65
  55. package/src/eval/quality.ts +0 -51
  56. package/src/eval/schemas.ts +0 -124
  57. package/src/eval/self-correct.ts +0 -28
  58. package/src/eval/validator.ts +0 -31
  59. package/src/functions/actions.ts +0 -288
  60. package/src/functions/audit.ts +0 -61
  61. package/src/functions/auto-forget.ts +0 -169
  62. package/src/functions/branch-aware.ts +0 -169
  63. package/src/functions/cascade.ts +0 -80
  64. package/src/functions/checkpoints.ts +0 -209
  65. package/src/functions/claude-bridge.ts +0 -161
  66. package/src/functions/compress.ts +0 -194
  67. package/src/functions/consolidate.ts +0 -212
  68. package/src/functions/consolidation-pipeline.ts +0 -258
  69. package/src/functions/context.ts +0 -169
  70. package/src/functions/crystallize.ts +0 -293
  71. package/src/functions/dedup.ts +0 -57
  72. package/src/functions/diagnostics.ts +0 -785
  73. package/src/functions/enrich.ts +0 -132
  74. package/src/functions/evict.ts +0 -163
  75. package/src/functions/export-import.ts +0 -508
  76. package/src/functions/facets.ts +0 -248
  77. package/src/functions/file-index.ts +0 -106
  78. package/src/functions/flow-compress.ts +0 -214
  79. package/src/functions/frontier.ts +0 -196
  80. package/src/functions/governance.ts +0 -131
  81. package/src/functions/graph-retrieval.ts +0 -277
  82. package/src/functions/graph.ts +0 -275
  83. package/src/functions/leases.ts +0 -216
  84. package/src/functions/lessons.ts +0 -253
  85. package/src/functions/mesh.ts +0 -434
  86. package/src/functions/migrate.ts +0 -165
  87. package/src/functions/observe.ts +0 -144
  88. package/src/functions/obsidian-export.ts +0 -310
  89. package/src/functions/patterns.ts +0 -138
  90. package/src/functions/privacy.ts +0 -39
  91. package/src/functions/profile.ts +0 -155
  92. package/src/functions/query-expansion.ts +0 -186
  93. package/src/functions/relations.ts +0 -237
  94. package/src/functions/remember.ts +0 -162
  95. package/src/functions/retention.ts +0 -235
  96. package/src/functions/routines.ts +0 -289
  97. package/src/functions/search.ts +0 -80
  98. package/src/functions/sentinels.ts +0 -417
  99. package/src/functions/signals.ts +0 -186
  100. package/src/functions/sketches.ts +0 -274
  101. package/src/functions/sliding-window.ts +0 -257
  102. package/src/functions/smart-search.ts +0 -115
  103. package/src/functions/snapshot.ts +0 -219
  104. package/src/functions/summarize.ts +0 -155
  105. package/src/functions/team.ts +0 -147
  106. package/src/functions/temporal-graph.ts +0 -476
  107. package/src/functions/timeline.ts +0 -138
  108. package/src/functions/verify.ts +0 -117
  109. package/src/health/monitor.ts +0 -110
  110. package/src/health/thresholds.ts +0 -73
  111. package/src/hooks/notification.ts +0 -52
  112. package/src/hooks/post-tool-failure.ts +0 -58
  113. package/src/hooks/post-tool-use.ts +0 -62
  114. package/src/hooks/pre-compact.ts +0 -60
  115. package/src/hooks/pre-tool-use.ts +0 -72
  116. package/src/hooks/prompt-submit.ts +0 -46
  117. package/src/hooks/session-end.ts +0 -71
  118. package/src/hooks/session-start.ts +0 -48
  119. package/src/hooks/stop.ts +0 -39
  120. package/src/hooks/subagent-start.ts +0 -49
  121. package/src/hooks/subagent-stop.ts +0 -54
  122. package/src/hooks/task-completed.ts +0 -54
  123. package/src/index.ts +0 -342
  124. package/src/mcp/in-memory-kv.ts +0 -61
  125. package/src/mcp/server.ts +0 -1455
  126. package/src/mcp/standalone.ts +0 -177
  127. package/src/mcp/tools-registry.ts +0 -769
  128. package/src/mcp/transport.ts +0 -91
  129. package/src/prompts/compression.ts +0 -67
  130. package/src/prompts/consolidation.ts +0 -48
  131. package/src/prompts/graph-extraction.ts +0 -35
  132. package/src/prompts/summary.ts +0 -38
  133. package/src/prompts/xml.ts +0 -26
  134. package/src/providers/agent-sdk.ts +0 -34
  135. package/src/providers/anthropic.ts +0 -35
  136. package/src/providers/circuit-breaker.ts +0 -82
  137. package/src/providers/embedding/cohere.ts +0 -46
  138. package/src/providers/embedding/gemini.ts +0 -54
  139. package/src/providers/embedding/index.ts +0 -39
  140. package/src/providers/embedding/local.ts +0 -52
  141. package/src/providers/embedding/openai.ts +0 -45
  142. package/src/providers/embedding/openrouter.ts +0 -51
  143. package/src/providers/embedding/voyage.ts +0 -46
  144. package/src/providers/fallback-chain.ts +0 -31
  145. package/src/providers/index.ts +0 -84
  146. package/src/providers/openrouter.ts +0 -71
  147. package/src/providers/resilient.ts +0 -37
  148. package/src/state/hybrid-search.ts +0 -295
  149. package/src/state/index-persistence.ts +0 -63
  150. package/src/state/keyed-mutex.ts +0 -18
  151. package/src/state/kv.ts +0 -33
  152. package/src/state/schema.ts +0 -71
  153. package/src/state/search-index.ts +0 -245
  154. package/src/state/stemmer.ts +0 -104
  155. package/src/state/synonyms.ts +0 -63
  156. package/src/state/vector-index.ts +0 -130
  157. package/src/telemetry/setup.ts +0 -116
  158. package/src/triggers/api.ts +0 -1904
  159. package/src/triggers/events.ts +0 -71
  160. package/src/types.ts +0 -769
  161. package/src/version.ts +0 -1
  162. package/src/viewer/index.html +0 -2497
  163. package/src/viewer/server.ts +0 -207
  164. package/src/xenova.d.ts +0 -3
  165. package/test/actions.test.ts +0 -490
  166. package/test/audit.test.ts +0 -108
  167. package/test/auto-forget.test.ts +0 -188
  168. package/test/cascade.test.ts +0 -277
  169. package/test/checkpoints.test.ts +0 -493
  170. package/test/circuit-breaker.test.ts +0 -107
  171. package/test/claude-bridge.test.ts +0 -178
  172. package/test/confidence.test.ts +0 -247
  173. package/test/consistency.test.ts +0 -61
  174. package/test/consolidation-pipeline.test.ts +0 -251
  175. package/test/crystallize.test.ts +0 -521
  176. package/test/diagnostics.test.ts +0 -638
  177. package/test/embedding-provider.test.ts +0 -49
  178. package/test/enrich.test.ts +0 -209
  179. package/test/eval.test.ts +0 -300
  180. package/test/export-import.test.ts +0 -251
  181. package/test/facets.test.ts +0 -448
  182. package/test/fallback-chain.test.ts +0 -93
  183. package/test/frontier.test.ts +0 -485
  184. package/test/governance.test.ts +0 -147
  185. package/test/graph-retrieval.test.ts +0 -186
  186. package/test/graph.test.ts +0 -160
  187. package/test/helpers/mocks.ts +0 -40
  188. package/test/hybrid-search.test.ts +0 -145
  189. package/test/index-persistence.test.ts +0 -124
  190. package/test/integration.test.ts +0 -265
  191. package/test/leases.test.ts +0 -399
  192. package/test/mcp-prompts.test.ts +0 -218
  193. package/test/mcp-resources.test.ts +0 -286
  194. package/test/mcp-standalone.test.ts +0 -113
  195. package/test/mesh.test.ts +0 -700
  196. package/test/privacy.test.ts +0 -87
  197. package/test/profile.test.ts +0 -161
  198. package/test/query-expansion.test.ts +0 -154
  199. package/test/relations.test.ts +0 -198
  200. package/test/retention.test.ts +0 -245
  201. package/test/routines.test.ts +0 -497
  202. package/test/schema-fingerprint.test.ts +0 -81
  203. package/test/schema.test.ts +0 -42
  204. package/test/search-index.test.ts +0 -128
  205. package/test/sentinels.test.ts +0 -626
  206. package/test/signals.test.ts +0 -410
  207. package/test/sketches.test.ts +0 -549
  208. package/test/sliding-window.test.ts +0 -199
  209. package/test/smart-search.test.ts +0 -169
  210. package/test/snapshot.test.ts +0 -165
  211. package/test/team.test.ts +0 -156
  212. package/test/temporal-graph.test.ts +0 -378
  213. package/test/timeline.test.ts +0 -148
  214. package/test/vector-index.test.ts +0 -79
  215. package/test/verify.test.ts +0 -209
  216. package/test/xml.test.ts +0 -65
  217. package/tsconfig.json +0 -22
  218. package/tsdown.config.ts +0 -62
@@ -1,643 +0,0 @@
1
- import { SearchIndex } from "../src/state/search-index.js";
2
- import { VectorIndex } from "../src/state/vector-index.js";
3
- import { HybridSearch } from "../src/state/hybrid-search.js";
4
- import { GraphRetrieval } from "../src/functions/graph-retrieval.js";
5
- import { extractEntitiesFromQuery } from "../src/functions/query-expansion.js";
6
- import type { CompressedObservation, GraphNode, GraphEdge, GraphEdgeType } from "../src/types.js";
7
- import { generateDataset, type LabeledQuery } from "./dataset.js";
8
- import { writeFileSync } from "node:fs";
9
-
10
- interface QualityMetrics {
11
- query: string;
12
- category: string;
13
- recall_at_5: number;
14
- recall_at_10: number;
15
- recall_at_20: number;
16
- precision_at_5: number;
17
- precision_at_10: number;
18
- ndcg_at_10: number;
19
- mrr: number;
20
- relevant_count: number;
21
- retrieved_count: number;
22
- latency_ms: number;
23
- }
24
-
25
- interface SystemMetrics {
26
- system: string;
27
- avg_recall_at_5: number;
28
- avg_recall_at_10: number;
29
- avg_recall_at_20: number;
30
- avg_precision_at_5: number;
31
- avg_precision_at_10: number;
32
- avg_ndcg_at_10: number;
33
- avg_mrr: number;
34
- avg_latency_ms: number;
35
- total_tokens_per_query: number;
36
- per_query: QualityMetrics[];
37
- }
38
-
39
- function dcg(relevances: boolean[], k: number): number {
40
- let sum = 0;
41
- for (let i = 0; i < Math.min(k, relevances.length); i++) {
42
- sum += (relevances[i] ? 1 : 0) / Math.log2(i + 2);
43
- }
44
- return sum;
45
- }
46
-
47
- function ndcg(retrieved: string[], relevant: Set<string>, k: number): number {
48
- const actualRelevances = retrieved.slice(0, k).map(id => relevant.has(id));
49
- const idealRelevances = Array.from({ length: Math.min(k, relevant.size) }, () => true);
50
- const idealDCG = dcg(idealRelevances, k);
51
- if (idealDCG === 0) return 0;
52
- return dcg(actualRelevances, k) / idealDCG;
53
- }
54
-
55
- function recall(retrieved: string[], relevant: Set<string>, k: number): number {
56
- if (relevant.size === 0) return 1;
57
- const topK = new Set(retrieved.slice(0, k));
58
- let hits = 0;
59
- for (const id of relevant) {
60
- if (topK.has(id)) hits++;
61
- }
62
- return hits / relevant.size;
63
- }
64
-
65
- function precision(retrieved: string[], relevant: Set<string>, k: number): number {
66
- const topK = retrieved.slice(0, k);
67
- if (topK.length === 0) return 0;
68
- let hits = 0;
69
- for (const id of topK) {
70
- if (relevant.has(id)) hits++;
71
- }
72
- return hits / topK.length;
73
- }
74
-
75
- function mrr(retrieved: string[], relevant: Set<string>): number {
76
- for (let i = 0; i < retrieved.length; i++) {
77
- if (relevant.has(retrieved[i])) return 1 / (i + 1);
78
- }
79
- return 0;
80
- }
81
-
82
- function estimateTokens(text: string): number {
83
- return Math.ceil(text.length / 4);
84
- }
85
-
86
- function mockKV() {
87
- const store = new Map<string, Map<string, unknown>>();
88
- return {
89
- get: async <T>(scope: string, key: string): Promise<T | null> => {
90
- return (store.get(scope)?.get(key) as T) ?? null;
91
- },
92
- set: async <T>(scope: string, key: string, data: T): Promise<T> => {
93
- if (!store.has(scope)) store.set(scope, new Map());
94
- store.get(scope)!.set(key, data);
95
- return data;
96
- },
97
- delete: async (scope: string, key: string): Promise<void> => {
98
- store.get(scope)?.delete(key);
99
- },
100
- list: async <T>(scope: string): Promise<T[]> => {
101
- const entries = store.get(scope);
102
- return entries ? (Array.from(entries.values()) as T[]) : [];
103
- },
104
- };
105
- }
106
-
107
- function deterministicEmbedding(text: string, dims = 384): Float32Array {
108
- const arr = new Float32Array(dims);
109
- const words = text.toLowerCase().split(/\W+/).filter(w => w.length > 2);
110
- for (const word of words) {
111
- for (let i = 0; i < word.length; i++) {
112
- const idx = (word.charCodeAt(i) * 31 + i * 17) % dims;
113
- arr[idx] += 1;
114
- const idx2 = (word.charCodeAt(i) * 37 + i * 13 + word.length * 7) % dims;
115
- arr[idx2] += 0.5;
116
- }
117
- }
118
- const norm = Math.sqrt(arr.reduce((s, v) => s + v * v, 0));
119
- if (norm > 0) for (let i = 0; i < dims; i++) arr[i] /= norm;
120
- return arr;
121
- }
122
-
123
- async function evalBm25Only(
124
- observations: CompressedObservation[],
125
- queries: LabeledQuery[],
126
- ): Promise<SystemMetrics> {
127
- const index = new SearchIndex();
128
- for (const obs of observations) index.add(obs);
129
-
130
- const perQuery: QualityMetrics[] = [];
131
-
132
- for (const q of queries) {
133
- const relevant = new Set(q.relevantObsIds);
134
- const start = performance.now();
135
- const results = index.search(q.query, 20);
136
- const latency = performance.now() - start;
137
-
138
- const retrieved = results.map(r => r.obsId);
139
- perQuery.push({
140
- query: q.query,
141
- category: q.category,
142
- recall_at_5: recall(retrieved, relevant, 5),
143
- recall_at_10: recall(retrieved, relevant, 10),
144
- recall_at_20: recall(retrieved, relevant, 20),
145
- precision_at_5: precision(retrieved, relevant, 5),
146
- precision_at_10: precision(retrieved, relevant, 10),
147
- ndcg_at_10: ndcg(retrieved, relevant, 10),
148
- mrr: mrr(retrieved, relevant),
149
- relevant_count: relevant.size,
150
- retrieved_count: results.length,
151
- latency_ms: latency,
152
- });
153
- }
154
-
155
- const avgTokens = perQuery.reduce((sum, q) => sum + q.retrieved_count, 0) / perQuery.length;
156
- const avgObsTokens = observations.slice(0, 50).reduce((s, o) => s + estimateTokens(JSON.stringify(o)), 0) / 50;
157
-
158
- return {
159
- system: "BM25-only",
160
- avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
161
- avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
162
- avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
163
- avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
164
- avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
165
- avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
166
- avg_mrr: avg(perQuery.map(q => q.mrr)),
167
- avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
168
- total_tokens_per_query: Math.round(avgObsTokens * avgTokens),
169
- per_query: perQuery,
170
- };
171
- }
172
-
173
- async function evalDualStream(
174
- observations: CompressedObservation[],
175
- queries: LabeledQuery[],
176
- ): Promise<SystemMetrics> {
177
- const kv = mockKV();
178
- const bm25 = new SearchIndex();
179
- const vector = new VectorIndex();
180
- const dims = 384;
181
-
182
- for (const obs of observations) {
183
- bm25.add(obs);
184
- const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ");
185
- vector.add(obs.id, obs.sessionId, deterministicEmbedding(text, dims));
186
- await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs);
187
- }
188
-
189
- const mockEmbed: any = {
190
- name: "deterministic",
191
- dimensions: dims,
192
- embed: async (text: string) => deterministicEmbedding(text, dims),
193
- embedBatch: async (texts: string[]) => texts.map(t => deterministicEmbedding(t, dims)),
194
- };
195
-
196
- const hybrid = new HybridSearch(bm25, vector, mockEmbed, kv as never, 0.4, 0.6, 0);
197
- const perQuery: QualityMetrics[] = [];
198
-
199
- for (const q of queries) {
200
- const relevant = new Set(q.relevantObsIds);
201
- const start = performance.now();
202
- const results = await hybrid.search(q.query, 20);
203
- const latency = performance.now() - start;
204
-
205
- const retrieved = results.map(r => r.observation.id);
206
- perQuery.push({
207
- query: q.query,
208
- category: q.category,
209
- recall_at_5: recall(retrieved, relevant, 5),
210
- recall_at_10: recall(retrieved, relevant, 10),
211
- recall_at_20: recall(retrieved, relevant, 20),
212
- precision_at_5: precision(retrieved, relevant, 5),
213
- precision_at_10: precision(retrieved, relevant, 10),
214
- ndcg_at_10: ndcg(retrieved, relevant, 10),
215
- mrr: mrr(retrieved, relevant),
216
- relevant_count: relevant.size,
217
- retrieved_count: results.length,
218
- latency_ms: latency,
219
- });
220
- }
221
-
222
- const avgResultTokens = perQuery.reduce((sum, q) => {
223
- return sum + q.retrieved_count;
224
- }, 0) / perQuery.length;
225
- const avgObsTokens2 = observations.slice(0, 50).reduce((s, o) => s + estimateTokens(JSON.stringify(o)), 0) / 50;
226
-
227
- return {
228
- system: "Dual-stream (BM25+Vector)",
229
- avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
230
- avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
231
- avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
232
- avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
233
- avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
234
- avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
235
- avg_mrr: avg(perQuery.map(q => q.mrr)),
236
- avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
237
- total_tokens_per_query: Math.round(avgObsTokens2 * avgResultTokens),
238
- per_query: perQuery,
239
- };
240
- }
241
-
242
- async function evalTripleStream(
243
- observations: CompressedObservation[],
244
- queries: LabeledQuery[],
245
- ): Promise<SystemMetrics> {
246
- const kv = mockKV();
247
- const bm25 = new SearchIndex();
248
- const vector = new VectorIndex();
249
- const dims = 384;
250
-
251
- for (const obs of observations) {
252
- bm25.add(obs);
253
- const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ");
254
- vector.add(obs.id, obs.sessionId, deterministicEmbedding(text, dims));
255
- await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs);
256
- }
257
-
258
- const conceptToNodes = new Map<string, string>();
259
- const nodeTypes: GraphNode["type"][] = ["concept", "library", "file", "pattern"];
260
- const edgeTypes: GraphEdgeType[] = ["uses", "related_to", "depends_on", "modifies"];
261
- const now = new Date().toISOString();
262
- let nodeId = 0;
263
-
264
- for (const obs of observations) {
265
- for (const concept of obs.concepts) {
266
- if (!conceptToNodes.has(concept)) {
267
- const nid = `gn_${nodeId++}`;
268
- conceptToNodes.set(concept, nid);
269
- await kv.set("mem:graph:nodes", nid, {
270
- id: nid,
271
- type: nodeTypes[nodeId % nodeTypes.length],
272
- name: concept,
273
- properties: {},
274
- sourceObservationIds: [],
275
- createdAt: now,
276
- } as GraphNode);
277
- }
278
- const nid = conceptToNodes.get(concept)!;
279
- const existing = await kv.get<GraphNode>("mem:graph:nodes", nid);
280
- if (existing && !existing.sourceObservationIds.includes(obs.id)) {
281
- existing.sourceObservationIds.push(obs.id);
282
- await kv.set("mem:graph:nodes", nid, existing);
283
- }
284
- }
285
-
286
- const capped = obs.concepts.slice(0, 10);
287
- for (let i = 0; i < capped.length; i++) {
288
- for (let j = i + 1; j < capped.length; j++) {
289
- const srcNid = conceptToNodes.get(capped[i])!;
290
- const tgtNid = conceptToNodes.get(capped[j])!;
291
- if (srcNid && tgtNid && srcNid !== tgtNid) {
292
- const eid = `ge_${srcNid}_${tgtNid}`;
293
- const existing = await kv.get<GraphEdge>("mem:graph:edges", eid);
294
- const weight = existing ? Math.min(1.0, existing.weight + 0.1) : 0.5;
295
- await kv.set("mem:graph:edges", eid, {
296
- id: eid,
297
- type: edgeTypes[(i + j) % edgeTypes.length],
298
- sourceNodeId: srcNid,
299
- targetNodeId: tgtNid,
300
- weight,
301
- sourceObservationIds: existing
302
- ? [...new Set([...existing.sourceObservationIds, obs.id])]
303
- : [obs.id],
304
- createdAt: now,
305
- tcommit: now,
306
- version: 1,
307
- isLatest: true,
308
- } as GraphEdge);
309
- }
310
- }
311
- }
312
- }
313
-
314
- const mockEmbed: any = {
315
- name: "deterministic",
316
- dimensions: dims,
317
- embed: async (text: string) => deterministicEmbedding(text, dims),
318
- embedBatch: async (texts: string[]) => texts.map(t => deterministicEmbedding(t, dims)),
319
- };
320
-
321
- const hybrid = new HybridSearch(bm25, vector, mockEmbed, kv as never, 0.4, 0.6, 0.3);
322
- const perQuery: QualityMetrics[] = [];
323
-
324
- for (const q of queries) {
325
- const relevant = new Set(q.relevantObsIds);
326
- const start = performance.now();
327
- const results = await hybrid.search(q.query, 20);
328
- const latency = performance.now() - start;
329
-
330
- const retrieved = results.map(r => r.observation.id);
331
- perQuery.push({
332
- query: q.query,
333
- category: q.category,
334
- recall_at_5: recall(retrieved, relevant, 5),
335
- recall_at_10: recall(retrieved, relevant, 10),
336
- recall_at_20: recall(retrieved, relevant, 20),
337
- precision_at_5: precision(retrieved, relevant, 5),
338
- precision_at_10: precision(retrieved, relevant, 10),
339
- ndcg_at_10: ndcg(retrieved, relevant, 10),
340
- mrr: mrr(retrieved, relevant),
341
- relevant_count: relevant.size,
342
- retrieved_count: results.length,
343
- latency_ms: latency,
344
- });
345
- }
346
-
347
- const avgResultTokens3 = perQuery.reduce((sum, q) => {
348
- return sum + q.retrieved_count;
349
- }, 0) / perQuery.length;
350
- const avgObsTokens3 = observations.slice(0, 50).reduce((s, o) => s + estimateTokens(JSON.stringify(o)), 0) / 50;
351
-
352
- return {
353
- system: "Triple-stream (BM25+Vector+Graph)",
354
- avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
355
- avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
356
- avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
357
- avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
358
- avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
359
- avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
360
- avg_mrr: avg(perQuery.map(q => q.mrr)),
361
- avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
362
- total_tokens_per_query: Math.round(avgObsTokens3 * avgResultTokens3),
363
- per_query: perQuery,
364
- };
365
- }
366
-
367
- async function evalBuiltinMemory(
368
- observations: CompressedObservation[],
369
- queries: LabeledQuery[],
370
- ): Promise<SystemMetrics> {
371
- const allText = observations.map(o =>
372
- `## ${o.title}\n${o.narrative}\nConcepts: ${o.concepts.join(", ")}\nFiles: ${o.files.join(", ")}`
373
- ).join("\n\n");
374
-
375
- const totalTokens = estimateTokens(allText);
376
-
377
- const perQuery: QualityMetrics[] = [];
378
-
379
- for (const q of queries) {
380
- const relevant = new Set(q.relevantObsIds);
381
- const start = performance.now();
382
-
383
- const queryTerms = q.query.toLowerCase().split(/\W+/).filter(w => w.length > 2);
384
- const scored: Array<{ id: string; score: number }> = [];
385
-
386
- for (const obs of observations) {
387
- const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ").toLowerCase();
388
- let score = 0;
389
- for (const term of queryTerms) {
390
- if (text.includes(term)) score++;
391
- }
392
- if (score > 0) scored.push({ id: obs.id, score });
393
- }
394
-
395
- scored.sort((a, b) => b.score - a.score);
396
- const latency = performance.now() - start;
397
-
398
- const retrieved = scored.map(s => s.id).slice(0, 20);
399
- perQuery.push({
400
- query: q.query,
401
- category: q.category,
402
- recall_at_5: recall(retrieved, relevant, 5),
403
- recall_at_10: recall(retrieved, relevant, 10),
404
- recall_at_20: recall(retrieved, relevant, 20),
405
- precision_at_5: precision(retrieved, relevant, 5),
406
- precision_at_10: precision(retrieved, relevant, 10),
407
- ndcg_at_10: ndcg(retrieved, relevant, 10),
408
- mrr: mrr(retrieved, relevant),
409
- relevant_count: relevant.size,
410
- retrieved_count: Math.min(scored.length, 20),
411
- latency_ms: latency,
412
- });
413
- }
414
-
415
- return {
416
- system: "Built-in (CLAUDE.md / grep)",
417
- avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
418
- avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
419
- avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
420
- avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
421
- avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
422
- avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
423
- avg_mrr: avg(perQuery.map(q => q.mrr)),
424
- avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
425
- total_tokens_per_query: totalTokens,
426
- per_query: perQuery,
427
- };
428
- }
429
-
430
- async function evalBuiltinMemoryTruncated(
431
- observations: CompressedObservation[],
432
- queries: LabeledQuery[],
433
- ): Promise<SystemMetrics> {
434
- const MAX_LINES = 200;
435
- const lines = observations.map(o =>
436
- `- ${o.title}: ${o.narrative.slice(0, 80)}... [${o.concepts.slice(0, 3).join(", ")}]`
437
- );
438
- const truncated = lines.slice(0, MAX_LINES);
439
- const truncatedIds = new Set(observations.slice(0, MAX_LINES).map(o => o.id));
440
- const totalTokens = estimateTokens(truncated.join("\n"));
441
-
442
- const perQuery: QualityMetrics[] = [];
443
-
444
- for (const q of queries) {
445
- const relevant = new Set(q.relevantObsIds);
446
- const start = performance.now();
447
-
448
- const queryTerms = q.query.toLowerCase().split(/\W+/).filter(w => w.length > 2);
449
- const scored: Array<{ id: string; score: number }> = [];
450
-
451
- for (let i = 0; i < Math.min(MAX_LINES, observations.length); i++) {
452
- const obs = observations[i];
453
- const line = truncated[i];
454
- let score = 0;
455
- for (const term of queryTerms) {
456
- if (line.toLowerCase().includes(term)) score++;
457
- }
458
- if (score > 0) scored.push({ id: obs.id, score });
459
- }
460
-
461
- scored.sort((a, b) => b.score - a.score);
462
- const latency = performance.now() - start;
463
-
464
- const retrieved = scored.map(s => s.id).slice(0, 20);
465
-
466
- const reachableRelevant = new Set(
467
- [...relevant].filter(id => truncatedIds.has(id))
468
- );
469
-
470
- perQuery.push({
471
- query: q.query,
472
- category: q.category,
473
- recall_at_5: recall(retrieved, relevant, 5),
474
- recall_at_10: recall(retrieved, relevant, 10),
475
- recall_at_20: recall(retrieved, relevant, 20),
476
- precision_at_5: precision(retrieved, relevant, 5),
477
- precision_at_10: precision(retrieved, relevant, 10),
478
- ndcg_at_10: ndcg(retrieved, relevant, 10),
479
- mrr: mrr(retrieved, relevant),
480
- relevant_count: relevant.size,
481
- retrieved_count: Math.min(scored.length, 20),
482
- latency_ms: latency,
483
- });
484
- }
485
-
486
- return {
487
- system: "Built-in (200-line MEMORY.md)",
488
- avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
489
- avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
490
- avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
491
- avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
492
- avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
493
- avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
494
- avg_mrr: avg(perQuery.map(q => q.mrr)),
495
- avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
496
- total_tokens_per_query: totalTokens,
497
- per_query: perQuery,
498
- };
499
- }
500
-
501
- function avg(nums: number[]): number {
502
- return nums.length ? nums.reduce((a, b) => a + b, 0) / nums.length : 0;
503
- }
504
-
505
- function pct(n: number): string {
506
- return (n * 100).toFixed(1) + "%";
507
- }
508
-
509
- function generateReport(systems: SystemMetrics[], obsCount: number, queryCount: number): string {
510
- const lines: string[] = [];
511
- const w = (s: string) => lines.push(s);
512
-
513
- w("# agentmemory v0.6.0 — Search Quality Evaluation");
514
- w("");
515
- w(`**Date:** ${new Date().toISOString()}`);
516
- w(`**Dataset:** ${obsCount} observations across 30 sessions (realistic coding project)`);
517
- w(`**Queries:** ${queryCount} labeled queries with ground-truth relevance`);
518
- w(`**Metric definitions:** Recall@K (fraction of relevant docs in top K), Precision@K (fraction of top K that are relevant), NDCG@10 (ranking quality), MRR (position of first relevant result)`);
519
- w("");
520
-
521
- w("## Head-to-Head Comparison");
522
- w("");
523
- w("| System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Latency | Tokens/query |");
524
- w("|--------|----------|-----------|-------------|---------|-----|---------|--------------|");
525
- for (const s of systems) {
526
- w(`| ${s.system} | ${pct(s.avg_recall_at_5)} | ${pct(s.avg_recall_at_10)} | ${pct(s.avg_precision_at_5)} | ${pct(s.avg_ndcg_at_10)} | ${pct(s.avg_mrr)} | ${s.avg_latency_ms.toFixed(2)}ms | ${s.total_tokens_per_query.toLocaleString()} |`);
527
- }
528
-
529
- w("");
530
- w("## Why This Matters");
531
- w("");
532
-
533
- const builtin = systems.find(s => s.system.includes("CLAUDE.md / grep"));
534
- const truncated = systems.find(s => s.system.includes("200-line"));
535
- const triple = systems.find(s => s.system.includes("Triple"));
536
- const bm25 = systems.find(s => s.system === "BM25-only");
537
-
538
- if (builtin && triple) {
539
- const recallLift = ((triple.avg_recall_at_10 - builtin.avg_recall_at_10) / Math.max(0.001, builtin.avg_recall_at_10) * 100);
540
- const tokenSaving = ((1 - triple.total_tokens_per_query / builtin.total_tokens_per_query) * 100);
541
- w(`**Recall improvement:** agentmemory triple-stream finds ${pct(triple.avg_recall_at_10)} of relevant memories at K=10 vs ${pct(builtin.avg_recall_at_10)} for keyword grep (${recallLift > 0 ? "+" : ""}${recallLift.toFixed(0)}%)`);
542
- w(`**Token savings:** agentmemory returns only the top 10 results (${triple.total_tokens_per_query.toLocaleString()} tokens) vs loading everything into context (${builtin.total_tokens_per_query.toLocaleString()} tokens) — ${tokenSaving.toFixed(0)}% reduction`);
543
- }
544
-
545
- if (truncated && triple) {
546
- w(`**200-line cap:** Claude Code's MEMORY.md is capped at 200 lines. With ${obsCount} observations, ${pct(truncated.avg_recall_at_10)} recall at K=10 — memories from later sessions are simply invisible.`);
547
- }
548
-
549
- w("");
550
- w("## Per-Query Breakdown (Triple-Stream)");
551
- w("");
552
-
553
- if (triple) {
554
- w("| Query | Category | Recall@10 | NDCG@10 | MRR | Relevant | Latency |");
555
- w("|-------|----------|-----------|---------|-----|----------|---------|");
556
- for (const q of triple.per_query) {
557
- w(`| ${q.query.slice(0, 45)}${q.query.length > 45 ? "..." : ""} | ${q.category} | ${pct(q.recall_at_10)} | ${pct(q.ndcg_at_10)} | ${pct(q.mrr)} | ${q.relevant_count} | ${q.latency_ms.toFixed(1)}ms |`);
558
- }
559
- }
560
-
561
- w("");
562
- w("## By Query Category");
563
- w("");
564
-
565
- const categories = ["exact", "semantic", "cross-session", "entity"];
566
- if (triple) {
567
- w("| Category | Avg Recall@10 | Avg NDCG@10 | Avg MRR | Queries |");
568
- w("|----------|---------------|-------------|---------|---------|");
569
- for (const cat of categories) {
570
- const qs = triple.per_query.filter(q => q.category === cat);
571
- if (qs.length === 0) continue;
572
- w(`| ${cat} | ${pct(avg(qs.map(q => q.recall_at_10)))} | ${pct(avg(qs.map(q => q.ndcg_at_10)))} | ${pct(avg(qs.map(q => q.mrr)))} | ${qs.length} |`);
573
- }
574
- }
575
-
576
- w("");
577
- w("## Context Window Analysis");
578
- w("");
579
- w("The fundamental problem with built-in agent memory:");
580
- w("");
581
- w("| Observations | MEMORY.md tokens | agentmemory tokens (top 10) | Savings | MEMORY.md reachable |");
582
- w("|-------------|-----------------|---------------------------|---------|-------------------|");
583
-
584
- for (const count of [240, 500, 1000, 5000]) {
585
- const memTokens = Math.round(count * 50);
586
- const amTokens = triple ? triple.total_tokens_per_query : 500;
587
- const saving = ((1 - amTokens / memTokens) * 100);
588
- const reachable = count <= 200 ? "100%" : `${((200 / count) * 100).toFixed(0)}%`;
589
- w(`| ${count.toLocaleString()} | ${memTokens.toLocaleString()} | ${amTokens.toLocaleString()} | ${saving.toFixed(0)}% | ${reachable} |`);
590
- }
591
-
592
- w("");
593
- w("At 240 observations (our dataset), MEMORY.md already hits its 200-line cap and loses access to the most recent 40 observations. At 1,000 observations, 80% of memories are invisible. agentmemory always searches the full corpus.");
594
-
595
- w("");
596
- w("---");
597
- w("");
598
- w(`*${systems.reduce((s, sys) => s + sys.per_query.length, 0)} evaluations across ${systems.length} systems. Ground-truth labels assigned by concept matching against observation metadata.*`);
599
-
600
- return lines.join("\n");
601
- }
602
-
603
/**
 * Benchmark entry point.
 *
 * Builds the labeled dataset, runs each retrieval system against it one after
 * another, prints a short Recall@10 / NDCG@10 summary per system, then renders
 * the combined report, writes it to benchmark/QUALITY.md and echoes it to
 * stdout.
 */
async function main() {
  console.log("Generating labeled dataset...");
  const { observations, queries, sessions } = generateDataset();

  // Total number of ground-truth relevant observations over all queries.
  let relevantTotal = 0;
  for (const query of queries) {
    relevantTotal += query.relevantObsIds.length;
  }
  const avgRelevant = relevantTotal / queries.length;

  console.log(`Dataset: ${observations.length} observations, ${sessions.size} sessions, ${queries.length} queries`);
  console.log(`Avg relevant docs per query: ${avgRelevant.toFixed(1)}`);
  console.log("");

  // Systems under test, in the order they are run and listed in the report.
  const stages = [
    { banner: "Evaluating: Built-in (CLAUDE.md / grep)...", run: evalBuiltinMemory },
    { banner: "Evaluating: Built-in (200-line MEMORY.md)...", run: evalBuiltinMemoryTruncated },
    { banner: "Evaluating: BM25-only...", run: evalBm25Only },
    { banner: "Evaluating: Dual-stream (BM25+Vector)...", run: evalDualStream },
    { banner: "Evaluating: Triple-stream (BM25+Vector+Graph)...", run: evalTripleStream },
  ];

  // Each system is awaited before the next one starts (no concurrent runs).
  const systemResults = [];
  for (const stage of stages) {
    console.log(stage.banner);
    const outcome = await stage.run(observations, queries);
    console.log(` Recall@10: ${pct(outcome.avg_recall_at_10)}, NDCG@10: ${pct(outcome.avg_ndcg_at_10)}`);
    systemResults.push(outcome);
  }

  console.log("");

  const report = generateReport(systemResults, observations.length, queries.length);

  writeFileSync("benchmark/QUALITY.md", report);
  console.log(report);
  console.log(`\nReport written to benchmark/QUALITY.md`);
}
642
-
643
// Kick off the benchmark. A rejection is logged as before, and additionally
// marks the process as failed so shells and CI do not see exit status 0 for a
// run that did not complete.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});