@agentmemory/agentmemory 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. package/AGENTS.md +2 -2
  2. package/README.md +41 -68
  3. package/dist/cli.mjs +3 -3
  4. package/dist/index.mjs +3 -2
  5. package/dist/index.mjs.map +1 -1
  6. package/dist/{src-1fTKFEtN.mjs → src-sYZDDbiA.mjs} +4 -3
  7. package/dist/src-sYZDDbiA.mjs.map +1 -0
  8. package/dist/standalone.mjs +1 -1
  9. package/dist/standalone.mjs.map +1 -1
  10. package/package.json +10 -1
  11. package/plugin/.claude-plugin/plugin.json +1 -1
  12. package/plugin/scripts/notification.d.mts +1 -0
  13. package/plugin/scripts/notification.mjs.map +1 -0
  14. package/plugin/scripts/post-tool-failure.d.mts +1 -0
  15. package/plugin/scripts/post-tool-failure.mjs.map +1 -0
  16. package/plugin/scripts/post-tool-use.d.mts +1 -0
  17. package/plugin/scripts/post-tool-use.mjs.map +1 -0
  18. package/plugin/scripts/pre-compact.d.mts +1 -0
  19. package/plugin/scripts/pre-compact.mjs.map +1 -0
  20. package/plugin/scripts/pre-tool-use.d.mts +1 -0
  21. package/plugin/scripts/pre-tool-use.mjs.map +1 -0
  22. package/plugin/scripts/prompt-submit.d.mts +1 -0
  23. package/plugin/scripts/prompt-submit.mjs.map +1 -0
  24. package/plugin/scripts/session-end.d.mts +1 -0
  25. package/plugin/scripts/session-end.mjs.map +1 -0
  26. package/plugin/scripts/session-start.d.mts +1 -0
  27. package/plugin/scripts/session-start.mjs.map +1 -0
  28. package/plugin/scripts/stop.d.mts +1 -0
  29. package/plugin/scripts/stop.mjs.map +1 -0
  30. package/plugin/scripts/subagent-start.d.mts +1 -0
  31. package/plugin/scripts/subagent-start.mjs.map +1 -0
  32. package/plugin/scripts/subagent-stop.d.mts +1 -0
  33. package/plugin/scripts/subagent-stop.mjs.map +1 -0
  34. package/plugin/scripts/task-completed.d.mts +1 -0
  35. package/plugin/scripts/task-completed.mjs.map +1 -0
  36. package/.claude-plugin/marketplace.json +0 -14
  37. package/.github/workflows/ci.yml +0 -22
  38. package/.github/workflows/publish.yml +0 -28
  39. package/assets/banner.png +0 -0
  40. package/assets/demo.gif +0 -0
  41. package/assets/demo.mp4 +0 -0
  42. package/benchmark/QUALITY.md +0 -73
  43. package/benchmark/REAL-EMBEDDINGS.md +0 -67
  44. package/benchmark/SCALE.md +0 -110
  45. package/benchmark/dataset.ts +0 -293
  46. package/benchmark/quality-eval.ts +0 -643
  47. package/benchmark/real-embeddings-eval.ts +0 -405
  48. package/benchmark/scale-eval.ts +0 -398
  49. package/dist/src-1fTKFEtN.mjs.map +0 -1
  50. package/src/auth.ts +0 -12
  51. package/src/cli.ts +0 -251
  52. package/src/config.ts +0 -221
  53. package/src/eval/metrics-store.ts +0 -65
  54. package/src/eval/quality.ts +0 -51
  55. package/src/eval/schemas.ts +0 -124
  56. package/src/eval/self-correct.ts +0 -28
  57. package/src/eval/validator.ts +0 -31
  58. package/src/functions/actions.ts +0 -288
  59. package/src/functions/audit.ts +0 -61
  60. package/src/functions/auto-forget.ts +0 -169
  61. package/src/functions/branch-aware.ts +0 -169
  62. package/src/functions/cascade.ts +0 -80
  63. package/src/functions/checkpoints.ts +0 -209
  64. package/src/functions/claude-bridge.ts +0 -161
  65. package/src/functions/compress.ts +0 -194
  66. package/src/functions/consolidate.ts +0 -212
  67. package/src/functions/consolidation-pipeline.ts +0 -258
  68. package/src/functions/context.ts +0 -169
  69. package/src/functions/crystallize.ts +0 -293
  70. package/src/functions/dedup.ts +0 -57
  71. package/src/functions/diagnostics.ts +0 -785
  72. package/src/functions/enrich.ts +0 -132
  73. package/src/functions/evict.ts +0 -163
  74. package/src/functions/export-import.ts +0 -508
  75. package/src/functions/facets.ts +0 -248
  76. package/src/functions/file-index.ts +0 -106
  77. package/src/functions/flow-compress.ts +0 -214
  78. package/src/functions/frontier.ts +0 -196
  79. package/src/functions/governance.ts +0 -131
  80. package/src/functions/graph-retrieval.ts +0 -277
  81. package/src/functions/graph.ts +0 -275
  82. package/src/functions/leases.ts +0 -216
  83. package/src/functions/lessons.ts +0 -253
  84. package/src/functions/mesh.ts +0 -434
  85. package/src/functions/migrate.ts +0 -165
  86. package/src/functions/observe.ts +0 -144
  87. package/src/functions/obsidian-export.ts +0 -310
  88. package/src/functions/patterns.ts +0 -138
  89. package/src/functions/privacy.ts +0 -39
  90. package/src/functions/profile.ts +0 -155
  91. package/src/functions/query-expansion.ts +0 -186
  92. package/src/functions/relations.ts +0 -237
  93. package/src/functions/remember.ts +0 -162
  94. package/src/functions/retention.ts +0 -235
  95. package/src/functions/routines.ts +0 -289
  96. package/src/functions/search.ts +0 -80
  97. package/src/functions/sentinels.ts +0 -417
  98. package/src/functions/signals.ts +0 -186
  99. package/src/functions/sketches.ts +0 -274
  100. package/src/functions/sliding-window.ts +0 -257
  101. package/src/functions/smart-search.ts +0 -115
  102. package/src/functions/snapshot.ts +0 -219
  103. package/src/functions/summarize.ts +0 -155
  104. package/src/functions/team.ts +0 -147
  105. package/src/functions/temporal-graph.ts +0 -476
  106. package/src/functions/timeline.ts +0 -138
  107. package/src/functions/verify.ts +0 -117
  108. package/src/health/monitor.ts +0 -110
  109. package/src/health/thresholds.ts +0 -73
  110. package/src/hooks/notification.ts +0 -52
  111. package/src/hooks/post-tool-failure.ts +0 -58
  112. package/src/hooks/post-tool-use.ts +0 -62
  113. package/src/hooks/pre-compact.ts +0 -60
  114. package/src/hooks/pre-tool-use.ts +0 -72
  115. package/src/hooks/prompt-submit.ts +0 -46
  116. package/src/hooks/session-end.ts +0 -71
  117. package/src/hooks/session-start.ts +0 -48
  118. package/src/hooks/stop.ts +0 -39
  119. package/src/hooks/subagent-start.ts +0 -49
  120. package/src/hooks/subagent-stop.ts +0 -54
  121. package/src/hooks/task-completed.ts +0 -54
  122. package/src/index.ts +0 -342
  123. package/src/mcp/in-memory-kv.ts +0 -61
  124. package/src/mcp/server.ts +0 -1455
  125. package/src/mcp/standalone.ts +0 -177
  126. package/src/mcp/tools-registry.ts +0 -769
  127. package/src/mcp/transport.ts +0 -91
  128. package/src/prompts/compression.ts +0 -67
  129. package/src/prompts/consolidation.ts +0 -48
  130. package/src/prompts/graph-extraction.ts +0 -35
  131. package/src/prompts/summary.ts +0 -38
  132. package/src/prompts/xml.ts +0 -26
  133. package/src/providers/agent-sdk.ts +0 -34
  134. package/src/providers/anthropic.ts +0 -35
  135. package/src/providers/circuit-breaker.ts +0 -82
  136. package/src/providers/embedding/cohere.ts +0 -46
  137. package/src/providers/embedding/gemini.ts +0 -54
  138. package/src/providers/embedding/index.ts +0 -39
  139. package/src/providers/embedding/local.ts +0 -52
  140. package/src/providers/embedding/openai.ts +0 -45
  141. package/src/providers/embedding/openrouter.ts +0 -51
  142. package/src/providers/embedding/voyage.ts +0 -46
  143. package/src/providers/fallback-chain.ts +0 -31
  144. package/src/providers/index.ts +0 -84
  145. package/src/providers/openrouter.ts +0 -71
  146. package/src/providers/resilient.ts +0 -37
  147. package/src/state/hybrid-search.ts +0 -295
  148. package/src/state/index-persistence.ts +0 -63
  149. package/src/state/keyed-mutex.ts +0 -18
  150. package/src/state/kv.ts +0 -33
  151. package/src/state/schema.ts +0 -71
  152. package/src/state/search-index.ts +0 -245
  153. package/src/state/stemmer.ts +0 -104
  154. package/src/state/synonyms.ts +0 -63
  155. package/src/state/vector-index.ts +0 -130
  156. package/src/telemetry/setup.ts +0 -116
  157. package/src/triggers/api.ts +0 -1904
  158. package/src/triggers/events.ts +0 -71
  159. package/src/types.ts +0 -769
  160. package/src/version.ts +0 -1
  161. package/src/viewer/index.html +0 -2556
  162. package/src/viewer/server.ts +0 -207
  163. package/src/xenova.d.ts +0 -3
  164. package/test/actions.test.ts +0 -490
  165. package/test/audit.test.ts +0 -108
  166. package/test/auto-forget.test.ts +0 -188
  167. package/test/cascade.test.ts +0 -277
  168. package/test/checkpoints.test.ts +0 -493
  169. package/test/circuit-breaker.test.ts +0 -107
  170. package/test/claude-bridge.test.ts +0 -178
  171. package/test/confidence.test.ts +0 -247
  172. package/test/consistency.test.ts +0 -61
  173. package/test/consolidation-pipeline.test.ts +0 -251
  174. package/test/crystallize.test.ts +0 -521
  175. package/test/diagnostics.test.ts +0 -638
  176. package/test/embedding-provider.test.ts +0 -49
  177. package/test/enrich.test.ts +0 -209
  178. package/test/eval.test.ts +0 -300
  179. package/test/export-import.test.ts +0 -251
  180. package/test/facets.test.ts +0 -448
  181. package/test/fallback-chain.test.ts +0 -93
  182. package/test/frontier.test.ts +0 -485
  183. package/test/governance.test.ts +0 -147
  184. package/test/graph-retrieval.test.ts +0 -186
  185. package/test/graph.test.ts +0 -160
  186. package/test/helpers/mocks.ts +0 -40
  187. package/test/hybrid-search.test.ts +0 -145
  188. package/test/index-persistence.test.ts +0 -124
  189. package/test/integration.test.ts +0 -265
  190. package/test/leases.test.ts +0 -399
  191. package/test/mcp-prompts.test.ts +0 -218
  192. package/test/mcp-resources.test.ts +0 -286
  193. package/test/mcp-standalone.test.ts +0 -113
  194. package/test/mesh.test.ts +0 -700
  195. package/test/privacy.test.ts +0 -87
  196. package/test/profile.test.ts +0 -161
  197. package/test/query-expansion.test.ts +0 -154
  198. package/test/relations.test.ts +0 -198
  199. package/test/retention.test.ts +0 -245
  200. package/test/routines.test.ts +0 -497
  201. package/test/schema-fingerprint.test.ts +0 -81
  202. package/test/schema.test.ts +0 -42
  203. package/test/search-index.test.ts +0 -128
  204. package/test/sentinels.test.ts +0 -626
  205. package/test/signals.test.ts +0 -410
  206. package/test/sketches.test.ts +0 -549
  207. package/test/sliding-window.test.ts +0 -199
  208. package/test/smart-search.test.ts +0 -169
  209. package/test/snapshot.test.ts +0 -165
  210. package/test/team.test.ts +0 -156
  211. package/test/temporal-graph.test.ts +0 -378
  212. package/test/timeline.test.ts +0 -148
  213. package/test/vector-index.test.ts +0 -79
  214. package/test/verify.test.ts +0 -209
  215. package/test/xml.test.ts +0 -65
  216. package/tsconfig.json +0 -22
  217. package/tsdown.config.ts +0 -62
@@ -1,643 +0,0 @@
1
- import { SearchIndex } from "../src/state/search-index.js";
2
- import { VectorIndex } from "../src/state/vector-index.js";
3
- import { HybridSearch } from "../src/state/hybrid-search.js";
4
- import { GraphRetrieval } from "../src/functions/graph-retrieval.js";
5
- import { extractEntitiesFromQuery } from "../src/functions/query-expansion.js";
6
- import type { CompressedObservation, GraphNode, GraphEdge, GraphEdgeType } from "../src/types.js";
7
- import { generateDataset, type LabeledQuery } from "./dataset.js";
8
- import { writeFileSync } from "node:fs";
9
-
10
/**
 * Retrieval-quality measurements recorded for a single labeled query against
 * a single search system.
 */
interface QualityMetrics {
  /** Text of the query that was evaluated. */
  query: string;
  /** Category label carried over from the labeled query. */
  category: string;

  /** Recall over the first 5 retrieved ids. */
  recall_at_5: number;
  /** Recall over the first 10 retrieved ids. */
  recall_at_10: number;
  /** Recall over the first 20 retrieved ids. */
  recall_at_20: number;

  /** Precision over the first 5 retrieved ids. */
  precision_at_5: number;
  /** Precision over the first 10 retrieved ids. */
  precision_at_10: number;

  /** Normalized discounted cumulative gain at a cutoff of 10. */
  ndcg_at_10: number;
  /** Reciprocal rank of the first relevant id (0 when none was retrieved). */
  mrr: number;

  /** Number of ground-truth relevant ids for the query. */
  relevant_count: number;
  /** Number of results the system returned for the query. */
  retrieved_count: number;
  /** Wall-clock time spent answering the query, in milliseconds. */
  latency_ms: number;
}
24
-
25
/**
 * Aggregate results for one search system: the mean of every per-query metric
 * together with the raw per-query rows they were derived from.
 */
interface SystemMetrics {
  /** Human-readable label of the evaluated system. */
  system: string;

  /** Mean of `recall_at_5` across all queries. */
  avg_recall_at_5: number;
  /** Mean of `recall_at_10` across all queries. */
  avg_recall_at_10: number;
  /** Mean of `recall_at_20` across all queries. */
  avg_recall_at_20: number;

  /** Mean of `precision_at_5` across all queries. */
  avg_precision_at_5: number;
  /** Mean of `precision_at_10` across all queries. */
  avg_precision_at_10: number;

  /** Mean of `ndcg_at_10` across all queries. */
  avg_ndcg_at_10: number;
  /** Mean of `mrr` across all queries. */
  avg_mrr: number;
  /** Mean of `latency_ms` across all queries. */
  avg_latency_ms: number;

  /** Estimated token cost per query; each evaluator estimates it in its own way. */
  total_tokens_per_query: number;
  /** The individual per-query measurements behind the averages. */
  per_query: QualityMetrics[];
}
38
-
39
/**
 * Discounted cumulative gain for a ranked list of binary relevance flags.
 *
 * Each relevant position `i` (0-based) contributes `1 / log2(i + 2)`;
 * non-relevant positions contribute nothing.
 *
 * @param relevances - Relevance flag per ranked position, best rank first.
 * @param k - Cutoff; only the first `k` positions are considered.
 * @returns The accumulated gain (0 for an empty list).
 */
function dcg(relevances: boolean[], k: number): number {
  const cutoff = Math.min(k, relevances.length);
  let gain = 0;
  for (let rank = 0; rank < cutoff; rank += 1) {
    if (relevances[rank]) {
      gain += 1 / Math.log2(rank + 2);
    }
  }
  return gain;
}
46
-
47
/**
 * Normalized DCG at cutoff `k` for binary relevance.
 *
 * The ideal ranking places `min(k, relevant.size)` relevant items at the top;
 * the score is the actual DCG divided by that ideal DCG.
 *
 * @param retrieved - Ranked ids returned by the system, best first.
 * @param relevant - Ground-truth relevant ids.
 * @param k - Rank cutoff.
 * @returns The normalized gain, or 0 when the ideal DCG is 0.
 */
function ndcg(retrieved: string[], relevant: Set<string>, k: number): number {
  const bestPossible = dcg(
    Array.from({ length: Math.min(k, relevant.size) }, () => true),
    k,
  );
  if (bestPossible === 0) {
    return 0;
  }

  const hitFlags = retrieved.slice(0, k).map((id) => relevant.has(id));
  return dcg(hitFlags, k) / bestPossible;
}
54
-
55
/**
 * Recall@k: the fraction of relevant ids that appear among the first `k`
 * retrieved ids.
 *
 * @param retrieved - Ranked ids returned by the system, best first.
 * @param relevant - Ground-truth relevant ids.
 * @param k - Rank cutoff.
 * @returns A value in [0, 1]; 1 when there are no relevant ids at all.
 */
function recall(retrieved: string[], relevant: Set<string>, k: number): number {
  if (relevant.size === 0) {
    return 1;
  }

  const head = new Set(retrieved.slice(0, k));
  const found = [...relevant].filter((id) => head.has(id)).length;
  return found / relevant.size;
}
64
-
65
/**
 * Precision@k: the fraction of the first `k` retrieved ids that are relevant.
 *
 * The denominator is the number of ids actually present in the top-k slice,
 * so a result list shorter than `k` is not penalized.
 *
 * @param retrieved - Ranked ids returned by the system, best first.
 * @param relevant - Ground-truth relevant ids.
 * @param k - Rank cutoff.
 * @returns A value in [0, 1]; 0 when the top-k slice is empty.
 */
function precision(retrieved: string[], relevant: Set<string>, k: number): number {
  const head = retrieved.slice(0, k);
  if (head.length === 0) {
    return 0;
  }

  const relevantInHead = head.filter((id) => relevant.has(id)).length;
  return relevantInHead / head.length;
}
74
-
75
/**
 * Reciprocal rank of the first relevant id in the ranked list.
 *
 * @param retrieved - Ranked ids returned by the system, best first.
 * @param relevant - Ground-truth relevant ids.
 * @returns `1 / rank` (1-based) of the first relevant id, or 0 if none is present.
 */
function mrr(retrieved: string[], relevant: Set<string>): number {
  const firstHit = retrieved.findIndex((id) => relevant.has(id));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
81
-
82
/**
 * Rough token estimate for a piece of text: one token per four characters,
 * rounded up.
 *
 * @param text - Text whose size is being estimated.
 * @returns The estimated token count (0 for an empty string).
 */
function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
85
-
86
/**
 * Builds an in-memory, scope-partitioned key/value store with an async API
 * (`get`, `set`, `delete`, `list`) for use in the benchmark.
 *
 * Each call returns an independent store; nothing is persisted.
 */
function mockKV() {
  const scopes = new Map<string, Map<string, unknown>>();

  /** Resolves to the stored value, or null when the scope/key is absent or nullish. */
  async function get<T>(scope: string, key: string): Promise<T | null> {
    const value = scopes.get(scope)?.get(key);
    return value === undefined || value === null ? null : (value as T);
  }

  /** Stores `data` under `scope`/`key`, creating the scope on first use, and resolves to `data`. */
  async function set<T>(scope: string, key: string, data: T): Promise<T> {
    let bucket = scopes.get(scope);
    if (bucket === undefined) {
      bucket = new Map();
      scopes.set(scope, bucket);
    }
    bucket.set(key, data);
    return data;
  }

  /** Removes `key` from `scope`; a missing scope or key is a no-op. */
  async function remove(scope: string, key: string): Promise<void> {
    scopes.get(scope)?.delete(key);
  }

  /** Resolves to every value in `scope` in insertion order, or an empty array. */
  async function list<T>(scope: string): Promise<T[]> {
    const bucket = scopes.get(scope);
    if (!bucket) {
      return [];
    }
    return [...bucket.values()] as T[];
  }

  return { get, set, delete: remove, list };
}
106
-
107
/**
 * Produces a reproducible pseudo-embedding for `text` without any model.
 *
 * The text is lower-cased and split on non-word characters; words of three or
 * more characters have each character hashed into two buckets of the vector
 * (weights 1 and 0.5). The result is L2-normalized unless it is all zeros.
 *
 * @param text - Input text.
 * @param dims - Vector length (defaults to 384).
 * @returns A `Float32Array` of length `dims`.
 */
function deterministicEmbedding(text: string, dims = 384): Float32Array {
  const vector = new Float32Array(dims);

  for (const token of text.toLowerCase().split(/\W+/)) {
    // Very short tokens are ignored.
    if (token.length <= 2) {
      continue;
    }
    for (let pos = 0; pos < token.length; pos += 1) {
      const code = token.charCodeAt(pos);
      const primary = (code * 31 + pos * 17) % dims;
      vector[primary] += 1;
      const secondary = (code * 37 + pos * 13 + token.length * 7) % dims;
      vector[secondary] += 0.5;
    }
  }

  let sumOfSquares = 0;
  for (let i = 0; i < vector.length; i += 1) {
    sumOfSquares += vector[i] * vector[i];
  }
  const magnitude = Math.sqrt(sumOfSquares);

  if (magnitude > 0) {
    for (let i = 0; i < dims; i += 1) {
      vector[i] /= magnitude;
    }
  }
  return vector;
}
122
-
123
/**
 * Evaluates `SearchIndex` on its own (reported as "BM25-only") against the
 * labeled queries.
 *
 * Every observation is added to a new index, each query is searched for up to
 * 20 hits, and the returned observation ids are scored against the query's
 * ground-truth ids.
 *
 * @param observations - Corpus to index.
 * @param queries - Labeled queries (`query`, `category`, `relevantObsIds`).
 * @returns Averaged metrics plus the per-query rows.
 */
async function evalBm25Only(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  const TOKEN_SAMPLE_SIZE = 50;

  const index = new SearchIndex();
  for (const obs of observations) index.add(obs);

  const perQuery: QualityMetrics[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);
    // Only the search call itself is timed.
    const start = performance.now();
    const results = index.search(q.query, 20);
    const latency = performance.now() - start;

    const retrieved = results.map(r => r.obsId);
    perQuery.push({
      query: q.query,
      category: q.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: results.length,
      latency_ms: latency,
    });
  }

  // avg() returns 0 for an empty list, so an empty query set no longer yields NaN.
  const avgRetrieved = avg(perQuery.map(q => q.retrieved_count));
  // Mean tokens per observation over a sample of at most TOKEN_SAMPLE_SIZE.
  // Dividing by the actual sample size keeps the estimate correct when the
  // corpus holds fewer observations than the sample cap.
  const avgObsTokens = avg(
    observations.slice(0, TOKEN_SAMPLE_SIZE).map(o => estimateTokens(JSON.stringify(o))),
  );

  return {
    system: "BM25-only",
    avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
    avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
    avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
    avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
    avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
    avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
    avg_mrr: avg(perQuery.map(q => q.mrr)),
    avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
    // Estimated context cost: average observation size times average result count.
    total_tokens_per_query: Math.round(avgObsTokens * avgRetrieved),
    per_query: perQuery,
  };
}
172
-
173
/**
 * Evaluates `HybridSearch` over a `SearchIndex` plus a `VectorIndex`
 * (reported as "Dual-stream (BM25+Vector)") against the labeled queries.
 *
 * Observations are added to both indexes and to an in-memory KV store;
 * vectors come from `deterministicEmbedding`, so no embedding model is used.
 *
 * @param observations - Corpus to index.
 * @param queries - Labeled queries (`query`, `category`, `relevantObsIds`).
 * @returns Averaged metrics plus the per-query rows.
 */
async function evalDualStream(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  const TOKEN_SAMPLE_SIZE = 50;

  const kv = mockKV();
  const bm25 = new SearchIndex();
  const vector = new VectorIndex();
  const dims = 384;

  for (const obs of observations) {
    bm25.add(obs);
    const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ");
    vector.add(obs.id, obs.sessionId, deterministicEmbedding(text, dims));
    await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs);
  }

  // Stand-in embedding provider backed by the deterministic hash embedding.
  const mockEmbed: any = {
    name: "deterministic",
    dimensions: dims,
    embed: async (text: string) => deterministicEmbedding(text, dims),
    embedBatch: async (texts: string[]) => texts.map(t => deterministicEmbedding(t, dims)),
  };

  // NOTE(review): 0.4 / 0.6 / 0 look like the BM25, vector and graph weights
  // (graph disabled here) — confirm against HybridSearch's constructor.
  const hybrid = new HybridSearch(bm25, vector, mockEmbed, kv as never, 0.4, 0.6, 0);
  const perQuery: QualityMetrics[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);
    // Only the search call itself is timed.
    const start = performance.now();
    const results = await hybrid.search(q.query, 20);
    const latency = performance.now() - start;

    const retrieved = results.map(r => r.observation.id);
    perQuery.push({
      query: q.query,
      category: q.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: results.length,
      latency_ms: latency,
    });
  }

  // avg() returns 0 for an empty list, so an empty query set no longer yields NaN.
  const avgRetrieved = avg(perQuery.map(q => q.retrieved_count));
  // Mean tokens per observation over a sample of at most TOKEN_SAMPLE_SIZE.
  // Dividing by the actual sample size keeps the estimate correct when the
  // corpus holds fewer observations than the sample cap.
  const avgObsTokens = avg(
    observations.slice(0, TOKEN_SAMPLE_SIZE).map(o => estimateTokens(JSON.stringify(o))),
  );

  return {
    system: "Dual-stream (BM25+Vector)",
    avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
    avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
    avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
    avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
    avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
    avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
    avg_mrr: avg(perQuery.map(q => q.mrr)),
    avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
    // Estimated context cost: average observation size times average result count.
    total_tokens_per_query: Math.round(avgObsTokens * avgRetrieved),
    per_query: perQuery,
  };
}
241
-
242
/**
 * Evaluates `HybridSearch` with a synthetic concept graph added to the KV
 * store (reported as "Triple-stream (BM25+Vector+Graph)").
 *
 * Besides indexing observations as in the dual-stream run, this builds:
 *  - one graph node per distinct concept, tracking the observations that mention it;
 *  - one edge per ordered concept pair co-occurring in an observation
 *    (first 10 concepts only), whose weight grows by 0.1 per repeat, capped at 1.0.
 *
 * @param observations - Corpus to index.
 * @param queries - Labeled queries (`query`, `category`, `relevantObsIds`).
 * @returns Averaged metrics plus the per-query rows.
 */
async function evalTripleStream(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  const TOKEN_SAMPLE_SIZE = 50;

  const kv = mockKV();
  const bm25 = new SearchIndex();
  const vector = new VectorIndex();
  const dims = 384;

  for (const obs of observations) {
    bm25.add(obs);
    const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ");
    vector.add(obs.id, obs.sessionId, deterministicEmbedding(text, dims));
    await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs);
  }

  // Maps a concept name to the id of its graph node.
  const conceptToNodes = new Map<string, string>();
  // Node and edge types are assigned round-robin; they carry no real meaning here.
  const nodeTypes: GraphNode["type"][] = ["concept", "library", "file", "pattern"];
  const edgeTypes: GraphEdgeType[] = ["uses", "related_to", "depends_on", "modifies"];
  const now = new Date().toISOString();
  let nodeId = 0;

  for (const obs of observations) {
    for (const concept of obs.concepts) {
      // Create the node the first time a concept is seen.
      if (!conceptToNodes.has(concept)) {
        const nid = `gn_${nodeId++}`;
        conceptToNodes.set(concept, nid);
        await kv.set("mem:graph:nodes", nid, {
          id: nid,
          type: nodeTypes[nodeId % nodeTypes.length],
          name: concept,
          properties: {},
          sourceObservationIds: [],
          createdAt: now,
        } as GraphNode);
      }
      // Record this observation on the concept's node (once per observation).
      const nid = conceptToNodes.get(concept)!;
      const existing = await kv.get<GraphNode>("mem:graph:nodes", nid);
      if (existing && !existing.sourceObservationIds.includes(obs.id)) {
        existing.sourceObservationIds.push(obs.id);
        await kv.set("mem:graph:nodes", nid, existing);
      }
    }

    // Limit pairwise edge creation to the first 10 concepts of the observation.
    const capped = obs.concepts.slice(0, 10);
    for (let i = 0; i < capped.length; i++) {
      for (let j = i + 1; j < capped.length; j++) {
        const srcNid = conceptToNodes.get(capped[i])!;
        const tgtNid = conceptToNodes.get(capped[j])!;
        if (srcNid && tgtNid && srcNid !== tgtNid) {
          const eid = `ge_${srcNid}_${tgtNid}`;
          const existing = await kv.get<GraphEdge>("mem:graph:edges", eid);
          // New edges start at 0.5; repeats add 0.1 up to a ceiling of 1.0.
          const weight = existing ? Math.min(1.0, existing.weight + 0.1) : 0.5;
          await kv.set("mem:graph:edges", eid, {
            id: eid,
            type: edgeTypes[(i + j) % edgeTypes.length],
            sourceNodeId: srcNid,
            targetNodeId: tgtNid,
            weight,
            sourceObservationIds: existing
              ? [...new Set([...existing.sourceObservationIds, obs.id])]
              : [obs.id],
            createdAt: now,
            tcommit: now,
            version: 1,
            isLatest: true,
          } as GraphEdge);
        }
      }
    }
  }

  // Stand-in embedding provider backed by the deterministic hash embedding.
  const mockEmbed: any = {
    name: "deterministic",
    dimensions: dims,
    embed: async (text: string) => deterministicEmbedding(text, dims),
    embedBatch: async (texts: string[]) => texts.map(t => deterministicEmbedding(t, dims)),
  };

  // NOTE(review): 0.4 / 0.6 / 0.3 look like the BM25, vector and graph
  // weights — confirm against HybridSearch's constructor.
  const hybrid = new HybridSearch(bm25, vector, mockEmbed, kv as never, 0.4, 0.6, 0.3);
  const perQuery: QualityMetrics[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);
    // Only the search call itself is timed.
    const start = performance.now();
    const results = await hybrid.search(q.query, 20);
    const latency = performance.now() - start;

    const retrieved = results.map(r => r.observation.id);
    perQuery.push({
      query: q.query,
      category: q.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: results.length,
      latency_ms: latency,
    });
  }

  // avg() returns 0 for an empty list, so an empty query set no longer yields NaN.
  const avgRetrieved = avg(perQuery.map(q => q.retrieved_count));
  // Mean tokens per observation over a sample of at most TOKEN_SAMPLE_SIZE.
  // Dividing by the actual sample size keeps the estimate correct when the
  // corpus holds fewer observations than the sample cap.
  const avgObsTokens = avg(
    observations.slice(0, TOKEN_SAMPLE_SIZE).map(o => estimateTokens(JSON.stringify(o))),
  );

  return {
    system: "Triple-stream (BM25+Vector+Graph)",
    avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
    avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
    avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
    avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
    avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
    avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
    avg_mrr: avg(perQuery.map(q => q.mrr)),
    avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
    // Estimated context cost: average observation size times average result count.
    total_tokens_per_query: Math.round(avgObsTokens * avgRetrieved),
    per_query: perQuery,
  };
}
366
-
367
/**
 * Baseline that models loading every observation into context and matching
 * by keyword (reported as "Built-in (CLAUDE.md / grep)").
 *
 * An observation scores one point per query term (length > 2) that occurs as
 * a substring of its lower-cased title, narrative, concepts and facts. The
 * top 20 scoring observations are evaluated. The token cost per query is the
 * estimated size of the whole rendered corpus.
 *
 * @param observations - Corpus to scan.
 * @param queries - Labeled queries (`query`, `category`, `relevantObsIds`).
 * @returns Averaged metrics plus the per-query rows.
 */
async function evalBuiltinMemory(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  // Render the full corpus once; its size is the per-query token cost.
  const corpus = observations
    .map(o =>
      `## ${o.title}\n${o.narrative}\nConcepts: ${o.concepts.join(", ")}\nFiles: ${o.files.join(", ")}`
    )
    .join("\n\n");
  const totalTokens = estimateTokens(corpus);

  const rows: QualityMetrics[] = [];

  for (const labeled of queries) {
    const relevant = new Set(labeled.relevantObsIds);
    const startedAt = performance.now();

    const terms = labeled.query.toLowerCase().split(/\W+/).filter(w => w.length > 2);
    const matches: Array<{ id: string; score: number }> = [];

    for (const obs of observations) {
      const haystack = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts]
        .join(" ")
        .toLowerCase();
      const hits = terms.filter(term => haystack.includes(term)).length;
      if (hits > 0) {
        matches.push({ id: obs.id, score: hits });
      }
    }

    matches.sort((a, b) => b.score - a.score);
    // Timing covers both scoring and sorting.
    const elapsed = performance.now() - startedAt;

    const retrieved = matches.map(m => m.id).slice(0, 20);
    rows.push({
      query: labeled.query,
      category: labeled.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: retrieved.length,
      latency_ms: elapsed,
    });
  }

  return {
    system: "Built-in (CLAUDE.md / grep)",
    avg_recall_at_5: avg(rows.map(r => r.recall_at_5)),
    avg_recall_at_10: avg(rows.map(r => r.recall_at_10)),
    avg_recall_at_20: avg(rows.map(r => r.recall_at_20)),
    avg_precision_at_5: avg(rows.map(r => r.precision_at_5)),
    avg_precision_at_10: avg(rows.map(r => r.precision_at_10)),
    avg_ndcg_at_10: avg(rows.map(r => r.ndcg_at_10)),
    avg_mrr: avg(rows.map(r => r.mrr)),
    avg_latency_ms: avg(rows.map(r => r.latency_ms)),
    total_tokens_per_query: totalTokens,
    per_query: rows,
  };
}
429
-
430
/**
 * Baseline that models a memory file capped at `MAX_LINES` one-line summaries
 * (reported as "Built-in (200-line MEMORY.md)").
 *
 * Each of the first `MAX_LINES` observations is rendered as a single summary
 * line; an observation scores one point per query term (length > 2) found in
 * its lower-cased line. Metrics are still computed against the query's full
 * relevant set, so relevant observations beyond the cap count as misses.
 *
 * @param observations - Corpus; only the first `MAX_LINES` entries are searchable.
 * @param queries - Labeled queries (`query`, `category`, `relevantObsIds`).
 * @returns Averaged metrics plus the per-query rows.
 */
async function evalBuiltinMemoryTruncated(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  const MAX_LINES = 200;
  const lines = observations.map(o =>
    `- ${o.title}: ${o.narrative.slice(0, 80)}... [${o.concepts.slice(0, 3).join(", ")}]`
  );
  const truncated = lines.slice(0, MAX_LINES);
  // Token cost per query is the size of the truncated file.
  const totalTokens = estimateTokens(truncated.join("\n"));

  const perQuery: QualityMetrics[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);
    const start = performance.now();

    const queryTerms = q.query.toLowerCase().split(/\W+/).filter(w => w.length > 2);
    const scored: Array<{ id: string; score: number }> = [];

    // truncated[i] is the summary line of observations[i].
    for (let i = 0; i < truncated.length; i++) {
      const obs = observations[i];
      // Lower-case once per line rather than once per term.
      const line = truncated[i].toLowerCase();
      let score = 0;
      for (const term of queryTerms) {
        if (line.includes(term)) score++;
      }
      if (score > 0) scored.push({ id: obs.id, score });
    }

    scored.sort((a, b) => b.score - a.score);
    // Timing covers both scoring and sorting.
    const latency = performance.now() - start;

    const retrieved = scored.map(s => s.id).slice(0, 20);

    perQuery.push({
      query: q.query,
      category: q.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: Math.min(scored.length, 20),
      latency_ms: latency,
    });
  }

  return {
    system: "Built-in (200-line MEMORY.md)",
    avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
    avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
    avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
    avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
    avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
    avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
    avg_mrr: avg(perQuery.map(q => q.mrr)),
    avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
    total_tokens_per_query: totalTokens,
    per_query: perQuery,
  };
}
500
-
501
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param nums - Values to average.
 * @returns The mean, or 0 for an empty list.
 */
function avg(nums: number[]): number {
  if (nums.length === 0) {
    return 0;
  }

  let total = 0;
  nums.forEach((value) => {
    total += value;
  });
  return total / nums.length;
}
504
-
505
/**
 * Formats a fraction as a percentage string with one decimal place.
 *
 * @param n - Fraction to format (e.g. 0.5 for 50%).
 * @returns The percentage followed by `%`, e.g. "50.0%".
 */
function pct(n: number): string {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
508
-
509
- function generateReport(systems: SystemMetrics[], obsCount: number, queryCount: number): string {
510
- const lines: string[] = [];
511
- const w = (s: string) => lines.push(s);
512
-
513
- w("# agentmemory v0.6.0 — Search Quality Evaluation");
514
- w("");
515
- w(`**Date:** ${new Date().toISOString()}`);
516
- w(`**Dataset:** ${obsCount} observations across 30 sessions (realistic coding project)`);
517
- w(`**Queries:** ${queryCount} labeled queries with ground-truth relevance`);
518
- w(`**Metric definitions:** Recall@K (fraction of relevant docs in top K), Precision@K (fraction of top K that are relevant), NDCG@10 (ranking quality), MRR (position of first relevant result)`);
519
- w("");
520
-
521
- w("## Head-to-Head Comparison");
522
- w("");
523
- w("| System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Latency | Tokens/query |");
524
- w("|--------|----------|-----------|-------------|---------|-----|---------|--------------|");
525
- for (const s of systems) {
526
- w(`| ${s.system} | ${pct(s.avg_recall_at_5)} | ${pct(s.avg_recall_at_10)} | ${pct(s.avg_precision_at_5)} | ${pct(s.avg_ndcg_at_10)} | ${pct(s.avg_mrr)} | ${s.avg_latency_ms.toFixed(2)}ms | ${s.total_tokens_per_query.toLocaleString()} |`);
527
- }
528
-
529
- w("");
530
- w("## Why This Matters");
531
- w("");
532
-
533
- const builtin = systems.find(s => s.system.includes("CLAUDE.md / grep"));
534
- const truncated = systems.find(s => s.system.includes("200-line"));
535
- const triple = systems.find(s => s.system.includes("Triple"));
536
- const bm25 = systems.find(s => s.system === "BM25-only");
537
-
538
- if (builtin && triple) {
539
- const recallLift = ((triple.avg_recall_at_10 - builtin.avg_recall_at_10) / Math.max(0.001, builtin.avg_recall_at_10) * 100);
540
- const tokenSaving = ((1 - triple.total_tokens_per_query / builtin.total_tokens_per_query) * 100);
541
- w(`**Recall improvement:** agentmemory triple-stream finds ${pct(triple.avg_recall_at_10)} of relevant memories at K=10 vs ${pct(builtin.avg_recall_at_10)} for keyword grep (${recallLift > 0 ? "+" : ""}${recallLift.toFixed(0)}%)`);
542
- w(`**Token savings:** agentmemory returns only the top 10 results (${triple.total_tokens_per_query.toLocaleString()} tokens) vs loading everything into context (${builtin.total_tokens_per_query.toLocaleString()} tokens) — ${tokenSaving.toFixed(0)}% reduction`);
543
- }
544
-
545
- if (truncated && triple) {
546
- w(`**200-line cap:** Claude Code's MEMORY.md is capped at 200 lines. With ${obsCount} observations, ${pct(truncated.avg_recall_at_10)} recall at K=10 — memories from later sessions are simply invisible.`);
547
- }
548
-
549
- w("");
550
- w("## Per-Query Breakdown (Triple-Stream)");
551
- w("");
552
-
553
- if (triple) {
554
- w("| Query | Category | Recall@10 | NDCG@10 | MRR | Relevant | Latency |");
555
- w("|-------|----------|-----------|---------|-----|----------|---------|");
556
- for (const q of triple.per_query) {
557
- w(`| ${q.query.slice(0, 45)}${q.query.length > 45 ? "..." : ""} | ${q.category} | ${pct(q.recall_at_10)} | ${pct(q.ndcg_at_10)} | ${pct(q.mrr)} | ${q.relevant_count} | ${q.latency_ms.toFixed(1)}ms |`);
558
- }
559
- }
560
-
561
- w("");
562
- w("## By Query Category");
563
- w("");
564
-
565
- const categories = ["exact", "semantic", "cross-session", "entity"];
566
- if (triple) {
567
- w("| Category | Avg Recall@10 | Avg NDCG@10 | Avg MRR | Queries |");
568
- w("|----------|---------------|-------------|---------|---------|");
569
- for (const cat of categories) {
570
- const qs = triple.per_query.filter(q => q.category === cat);
571
- if (qs.length === 0) continue;
572
- w(`| ${cat} | ${pct(avg(qs.map(q => q.recall_at_10)))} | ${pct(avg(qs.map(q => q.ndcg_at_10)))} | ${pct(avg(qs.map(q => q.mrr)))} | ${qs.length} |`);
573
- }
574
- }
575
-
576
- w("");
577
- w("## Context Window Analysis");
578
- w("");
579
- w("The fundamental problem with built-in agent memory:");
580
- w("");
581
- w("| Observations | MEMORY.md tokens | agentmemory tokens (top 10) | Savings | MEMORY.md reachable |");
582
- w("|-------------|-----------------|---------------------------|---------|-------------------|");
583
-
584
- for (const count of [240, 500, 1000, 5000]) {
585
- const memTokens = Math.round(count * 50);
586
- const amTokens = triple ? triple.total_tokens_per_query : 500;
587
- const saving = ((1 - amTokens / memTokens) * 100);
588
- const reachable = count <= 200 ? "100%" : `${((200 / count) * 100).toFixed(0)}%`;
589
- w(`| ${count.toLocaleString()} | ${memTokens.toLocaleString()} | ${amTokens.toLocaleString()} | ${saving.toFixed(0)}% | ${reachable} |`);
590
- }
591
-
592
- w("");
593
- w("At 240 observations (our dataset), MEMORY.md already hits its 200-line cap and loses access to the most recent 40 observations. At 1,000 observations, 80% of memories are invisible. agentmemory always searches the full corpus.");
594
-
595
- w("");
596
- w("---");
597
- w("");
598
- w(`*${systems.reduce((s, sys) => s + sys.per_query.length, 0)} evaluations across ${systems.length} systems. Ground-truth labels assigned by concept matching against observation metadata.*`);
599
-
600
- return lines.join("\n");
601
- }
602
-
603
/**
 * Benchmark driver: builds the labeled dataset, runs every retrieval system
 * over it one after another, then renders the comparison report, writes it to
 * benchmark/QUALITY.md and echoes it to stdout.
 *
 * Systems are evaluated sequentially (not in parallel) so that the progress
 * log stays ordered and the evaluations do not run concurrently.
 */
async function main() {
  console.log("Generating labeled dataset...");
  const { observations, queries, sessions } = generateDataset();
  console.log(`Dataset: ${observations.length} observations, ${sessions.size} sessions, ${queries.length} queries`);

  // Mean number of ground-truth relevant observations per query.
  let relevantTotal = 0;
  for (const query of queries) {
    relevantTotal += query.relevantObsIds.length;
  }
  const avgRelevant = relevantTotal / queries.length;
  console.log(`Avg relevant docs per query: ${avgRelevant.toFixed(1)}`);
  console.log("");

  // [progress message, evaluator] pairs, in the order they appear in the report.
  const stages = [
    ["Evaluating: Built-in (CLAUDE.md / grep)...", evalBuiltinMemory],
    ["Evaluating: Built-in (200-line MEMORY.md)...", evalBuiltinMemoryTruncated],
    ["Evaluating: BM25-only...", evalBm25Only],
    ["Evaluating: Dual-stream (BM25+Vector)...", evalDualStream],
    ["Evaluating: Triple-stream (BM25+Vector+Graph)...", evalTripleStream],
  ];

  const systemResults = [];
  for (const [announcement, evaluate] of stages) {
    console.log(announcement);
    const outcome = await evaluate(observations, queries);
    console.log(` Recall@10: ${pct(outcome.avg_recall_at_10)}, NDCG@10: ${pct(outcome.avg_ndcg_at_10)}`);
    systemResults.push(outcome);
  }

  console.log("");

  const report = generateReport(systemResults, observations.length, queries.length);

  writeFileSync("benchmark/QUALITY.md", report);
  console.log(report);
  console.log("\nReport written to benchmark/QUALITY.md");
}
642
-
643
// Script entry point. If the benchmark run rejects, log the error and flag the
// process as failed; logging alone would leave the exit status at 0, so CI and
// shell callers could not tell a broken run from a successful one.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});