@totalreclaw/totalreclaw 1.0.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -87
- package/api-client.ts +328 -0
- package/consolidation.test.ts +356 -0
- package/consolidation.ts +227 -0
- package/crypto.ts +351 -0
- package/embedding.ts +75 -0
- package/extractor-dedup.test.ts +168 -0
- package/extractor.ts +237 -0
- package/generate-mnemonic.ts +14 -0
- package/hot-cache-wrapper.ts +126 -0
- package/import-adapters/base-adapter.ts +93 -0
- package/import-adapters/import-adapters.test.ts +595 -0
- package/import-adapters/index.ts +22 -0
- package/import-adapters/mcp-memory-adapter.ts +274 -0
- package/import-adapters/mem0-adapter.ts +233 -0
- package/import-adapters/types.ts +89 -0
- package/index.ts +2680 -0
- package/llm-client.ts +418 -0
- package/lsh.test.ts +463 -0
- package/lsh.ts +257 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +19 -34
- package/pocv2-e2e-test.ts +917 -0
- package/reranker.test.ts +594 -0
- package/reranker.ts +537 -0
- package/semantic-dedup.test.ts +392 -0
- package/semantic-dedup.ts +100 -0
- package/setup.sh +19 -0
- package/store-dedup-wiring.test.ts +186 -0
- package/subgraph-search.ts +282 -0
- package/subgraph-store.ts +348 -0
- package/SKILL.md +0 -709
- package/dist/index.js +0 -32154
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for memory consolidation & near-duplicate detection.
|
|
3
|
+
*
|
|
4
|
+
* Run with: npx tsx consolidation.test.ts
|
|
5
|
+
*
|
|
6
|
+
* Uses TAP-style output (no test framework dependency).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import {
|
|
10
|
+
findNearDuplicate,
|
|
11
|
+
shouldSupersede,
|
|
12
|
+
clusterFacts,
|
|
13
|
+
getStoreDedupThreshold,
|
|
14
|
+
getConsolidationThreshold,
|
|
15
|
+
STORE_DEDUP_MAX_CANDIDATES,
|
|
16
|
+
} from './consolidation.js';
|
|
17
|
+
import type { DecryptedCandidate } from './consolidation.js';
|
|
18
|
+
|
|
19
|
+
// TAP bookkeeping: totals are updated by assert() below and reported in the
// summary section at the bottom of this script.
let passed = 0;
let failed = 0;
let testNum = 0;
|
|
22
|
+
|
|
23
|
+
function assert(condition: boolean, message: string): void {
|
|
24
|
+
testNum++;
|
|
25
|
+
if (condition) {
|
|
26
|
+
passed++;
|
|
27
|
+
console.log(`ok ${testNum} - ${message}`);
|
|
28
|
+
} else {
|
|
29
|
+
failed++;
|
|
30
|
+
console.log(`not ok ${testNum} - ${message}`);
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
function assertClose(actual: number, expected: number, epsilon: number, message: string): void {
|
|
35
|
+
const diff = Math.abs(actual - expected);
|
|
36
|
+
assert(diff < epsilon, `${message} (expected ~${expected}, got ${actual}, diff=${diff})`);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Helper: create a DecryptedCandidate
|
|
40
|
+
function makeCandidate(
|
|
41
|
+
overrides: Partial<DecryptedCandidate> & { id: string },
|
|
42
|
+
): DecryptedCandidate {
|
|
43
|
+
return {
|
|
44
|
+
text: `fact ${overrides.id}`,
|
|
45
|
+
embedding: null,
|
|
46
|
+
importance: 5,
|
|
47
|
+
decayScore: 1.0,
|
|
48
|
+
createdAt: 1000,
|
|
49
|
+
version: 1,
|
|
50
|
+
...overrides,
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// ---------------------------------------------------------------------------
// getStoreDedupThreshold tests
// ---------------------------------------------------------------------------

console.log('# getStoreDedupThreshold');

{
  // Default threshold should be 0.85 (no env var set)
  const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  assertClose(getStoreDedupThreshold(), 0.85, 1e-10, 'default threshold is 0.85');
  // Restore only when the var existed before; it was already deleted otherwise.
  if (orig !== undefined) process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
}

{
  // Custom threshold via env var
  const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = '0.75';
  assertClose(getStoreDedupThreshold(), 0.75, 1e-10, 'custom threshold 0.75 from env');
  // Restore the pre-test env state (reset or delete).
  if (orig !== undefined) {
    process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
  } else {
    delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  }
}

{
  // Invalid env var falls back to default
  const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = 'not-a-number';
  assertClose(getStoreDedupThreshold(), 0.85, 1e-10, 'invalid env var falls back to 0.85');
  // Restore the pre-test env state (reset or delete).
  if (orig !== undefined) {
    process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
  } else {
    delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
  }
}
|
|
91
|
+
|
|
92
|
+
// ---------------------------------------------------------------------------
// getConsolidationThreshold tests
// ---------------------------------------------------------------------------

console.log('# getConsolidationThreshold');

{
  // Default threshold should be 0.88 (no env var set)
  const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  assertClose(getConsolidationThreshold(), 0.88, 1e-10, 'default threshold is 0.88');
  // Restore only when the var existed before; it was already deleted otherwise.
  if (orig !== undefined) process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
}

{
  // Custom threshold via env var
  const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = '0.95';
  assertClose(getConsolidationThreshold(), 0.95, 1e-10, 'custom threshold 0.95 from env');
  // Restore the pre-test env state (reset or delete).
  if (orig !== undefined) {
    process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
  } else {
    delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  }
}

{
  // Invalid env var falls back to default
  const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = 'garbage';
  assertClose(getConsolidationThreshold(), 0.88, 1e-10, 'invalid env var falls back to 0.88');
  // Restore the pre-test env state (reset or delete).
  if (orig !== undefined) {
    process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
  } else {
    delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
  }
}
|
|
129
|
+
|
|
130
|
+
// ---------------------------------------------------------------------------
// STORE_DEDUP_MAX_CANDIDATES constant
// ---------------------------------------------------------------------------

console.log('# STORE_DEDUP_MAX_CANDIDATES');

// Pin the exported cap so an accidental change is surfaced by the suite.
assert(STORE_DEDUP_MAX_CANDIDATES === 200, 'STORE_DEDUP_MAX_CANDIDATES is 200');
|
|
137
|
+
|
|
138
|
+
// ---------------------------------------------------------------------------
// findNearDuplicate tests
// ---------------------------------------------------------------------------

console.log('# findNearDuplicate');

{
  // Empty candidates -> null
  const result = findNearDuplicate([1, 0, 0], [], 0.85);
  assert(result === null, 'empty candidates returns null');
}

{
  // No embeddings on candidates -> null (embedding-less facts are skipped)
  const candidates = [
    makeCandidate({ id: 'a', embedding: null }),
    makeCandidate({ id: 'b', embedding: null }),
  ];
  const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
  assert(result === null, 'candidates without embeddings returns null');
}

{
  // Below threshold -> null
  const candidates = [
    makeCandidate({ id: 'a', embedding: [0, 1, 0] }), // orthogonal, cosine = 0
  ];
  const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
  assert(result === null, 'below threshold returns null');
}

{
  // Above threshold -> returns match
  const candidates = [
    makeCandidate({ id: 'a', embedding: [1, 0, 0] }), // cosine = 1.0
  ];
  const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
  assert(result !== null, 'above threshold returns match');
  assert(result!.existingFact.id === 'a', 'match is the correct candidate');
  assertClose(result!.similarity, 1.0, 1e-6, 'similarity is ~1.0');
}

{
  // Multiple matches -> returns highest similarity.
  // Each embedding is a unit vector whose cosine against [1, 0, 0] equals
  // its x component, so similarities are exactly the labelled values.
  const candidates = [
    makeCandidate({ id: 'low', embedding: [0.86, Math.sqrt(1 - 0.86 * 0.86), 0] }), // cosine ~ 0.86
    makeCandidate({ id: 'high', embedding: [0.99, Math.sqrt(1 - 0.99 * 0.99), 0] }), // cosine ~ 0.99
    makeCandidate({ id: 'mid', embedding: [0.90, Math.sqrt(1 - 0.90 * 0.90), 0] }), // cosine ~ 0.90
  ];
  const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
  assert(result !== null, 'multiple matches: returns a match');
  assert(result!.existingFact.id === 'high', 'multiple matches: returns highest similarity');
}

{
  // Parallel vectors (cosine = 1.0) -> match; magnitude must not matter
  const candidates = [
    makeCandidate({ id: 'parallel', embedding: [3, 6, 9] }), // parallel to [1, 2, 3]
  ];
  const result = findNearDuplicate([1, 2, 3], candidates, 0.85);
  assert(result !== null, 'parallel vectors: returns match');
  assertClose(result!.similarity, 1.0, 1e-6, 'parallel vectors: cosine is ~1.0');
}

{
  // Orthogonal vectors (cosine = 0) -> null
  const candidates = [
    makeCandidate({ id: 'ortho', embedding: [0, 1, 0] }),
  ];
  const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
  assert(result === null, 'orthogonal vectors: returns null');
}
|
|
210
|
+
|
|
211
|
+
// ---------------------------------------------------------------------------
// shouldSupersede tests — new fact wins on >= importance, loses on <
// ---------------------------------------------------------------------------

console.log('# shouldSupersede');

{
  // Higher new importance -> supersede
  const existing = makeCandidate({ id: 'old', importance: 5 });
  const result = shouldSupersede(8, existing);
  assert(result === 'supersede', 'higher new importance -> supersede');
}

{
  // Lower new importance -> skip
  const existing = makeCandidate({ id: 'old', importance: 8 });
  const result = shouldSupersede(3, existing);
  assert(result === 'skip', 'lower new importance -> skip');
}

{
  // Equal importance -> supersede (newer wins)
  const existing = makeCandidate({ id: 'old', importance: 5 });
  const result = shouldSupersede(5, existing);
  assert(result === 'supersede', 'equal importance -> supersede (newer wins)');
}
|
|
237
|
+
|
|
238
|
+
// ---------------------------------------------------------------------------
// clusterFacts tests
// ---------------------------------------------------------------------------

console.log('# clusterFacts');

{
  // Empty facts -> empty clusters
  const clusters = clusterFacts([], 0.88);
  assert(clusters.length === 0, 'empty facts -> no clusters');
}

{
  // Single fact -> no clusters (needs at least 2 to form a cluster)
  const facts = [
    makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 0, 'single fact -> no clusters');
}

{
  // Two identical embeddings -> one cluster
  const facts = [
    makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
    makeCandidate({ id: 'b', embedding: [1, 0, 0] }),
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 1, 'two identical -> one cluster');
  assert(clusters[0].duplicates.length === 1, 'two identical -> one duplicate');
}

{
  // Two dissimilar embeddings -> no clusters
  const facts = [
    makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
    makeCandidate({ id: 'b', embedding: [0, 1, 0] }), // orthogonal
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 0, 'two dissimilar -> no clusters');
}

{
  // Multiple clusters: two groups of duplicates + one unique
  const facts = [
    makeCandidate({ id: 'a1', embedding: [1, 0, 0] }),
    makeCandidate({ id: 'a2', embedding: [1, 0, 0] }), // dup of a1
    makeCandidate({ id: 'b1', embedding: [0, 1, 0] }),
    makeCandidate({ id: 'b2', embedding: [0, 1, 0] }), // dup of b1
    makeCandidate({ id: 'c1', embedding: [0, 0, 1] }), // unique
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 2, 'multiple clusters: two groups found');
}

{
  // Facts without embeddings are not clustered
  const facts = [
    makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
    makeCandidate({ id: 'b', embedding: null }), // no embedding
    makeCandidate({ id: 'c', embedding: [1, 0, 0] }), // dup of a
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 1, 'no-embedding facts skipped, one cluster of a+c');
  // b should not appear in any cluster
  const allIds = clusters.flatMap(c => [c.representative.id, ...c.duplicates.map(d => d.id)]);
  assert(!allIds.includes('b'), 'no-embedding fact not in any cluster');
}

{
  // Representative = highest decayScore (decayScore is the primary tiebreak
  // key; importance merely correlates with it in this fixture).
  const facts = [
    makeCandidate({ id: 'low', embedding: [1, 0, 0], decayScore: 0.5, importance: 3 }),
    makeCandidate({ id: 'high', embedding: [1, 0, 0], decayScore: 0.9, importance: 8 }),
    makeCandidate({ id: 'mid', embedding: [1, 0, 0], decayScore: 0.7, importance: 5 }),
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 1, 'representative test: one cluster');
  assert(clusters[0].representative.id === 'high', 'representative = highest decayScore');
  assert(clusters[0].duplicates.length === 2, 'two duplicates');
}

{
  // Tiebreak: same decayScore -> most recent (highest createdAt)
  const facts = [
    makeCandidate({ id: 'old', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
    makeCandidate({ id: 'new', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 2000 }),
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 1, 'tiebreak test: one cluster');
  assert(clusters[0].representative.id === 'new', 'tiebreak: most recent is representative');
}

{
  // Tiebreak: same decayScore + createdAt -> longest text
  const facts = [
    makeCandidate({ id: 'short', text: 'abc', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
    makeCandidate({ id: 'long', text: 'abcdefghij', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
  ];
  const clusters = clusterFacts(facts, 0.88);
  assert(clusters.length === 1, 'tiebreak longest text: one cluster');
  assert(clusters[0].representative.id === 'long', 'tiebreak: longest text is representative');
}
|
|
341
|
+
|
|
342
|
+
// ---------------------------------------------------------------------------
// Summary
// ---------------------------------------------------------------------------

// TAP plan line followed by totals; a non-zero exit code signals failure to CI.
console.log(`\n1..${testNum}`);
console.log(`# pass: ${passed}`);
console.log(`# fail: ${failed}`);

if (failed > 0) {
  console.log('\nFAILED');
  process.exit(1);
} else {
  console.log('\nALL TESTS PASSED');
  process.exit(0);
}
|
package/consolidation.ts
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TotalReclaw Plugin - Memory Consolidation & Near-Duplicate Detection
|
|
3
|
+
*
|
|
4
|
+
* Provides cross-session / cross-vault deduplication of stored facts using
|
|
5
|
+
* cosine similarity on their embeddings. Unlike semantic-dedup.ts (which
|
|
6
|
+
* handles within-batch dedup at threshold 0.9), this module handles:
|
|
7
|
+
*
|
|
8
|
+
* 1. Store-time dedup — before writing a new fact, check whether a
|
|
9
|
+
* near-duplicate already exists in the vault (findNearDuplicate).
|
|
10
|
+
* 2. Supersede logic — when a near-duplicate is found, decide whether
|
|
11
|
+
* the new fact should replace or be skipped (shouldSupersede).
|
|
12
|
+
* 3. Bulk consolidation — cluster all facts in the vault and identify
|
|
13
|
+
* groups of near-duplicates for cleanup (clusterFacts).
|
|
14
|
+
*
|
|
15
|
+
* This module intentionally has minimal dependencies (only reranker for
|
|
16
|
+
* cosineSimilarity) so it can be tested without pulling in the full
|
|
17
|
+
* plugin dependency graph.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { cosineSimilarity } from './reranker.js';
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Configuration
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Get the cosine similarity threshold for store-time dedup.
|
|
28
|
+
*
|
|
29
|
+
* Configurable via TOTALRECLAW_STORE_DEDUP_THRESHOLD env var.
|
|
30
|
+
* Must be a number in [0, 1]. Falls back to 0.85 if invalid or unset.
|
|
31
|
+
*/
|
|
32
|
+
export function getStoreDedupThreshold(): number {
|
|
33
|
+
const envVal = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
|
|
34
|
+
if (envVal !== undefined) {
|
|
35
|
+
const parsed = parseFloat(envVal);
|
|
36
|
+
if (!isNaN(parsed) && parsed >= 0 && parsed <= 1) return parsed;
|
|
37
|
+
}
|
|
38
|
+
return 0.85;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Get the cosine similarity threshold for bulk consolidation clustering.
|
|
43
|
+
*
|
|
44
|
+
* Configurable via TOTALRECLAW_CONSOLIDATION_THRESHOLD env var.
|
|
45
|
+
* Must be a number in [0, 1]. Falls back to 0.88 if invalid or unset.
|
|
46
|
+
*/
|
|
47
|
+
export function getConsolidationThreshold(): number {
|
|
48
|
+
const envVal = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
|
|
49
|
+
if (envVal !== undefined) {
|
|
50
|
+
const parsed = parseFloat(envVal);
|
|
51
|
+
if (!isNaN(parsed) && parsed >= 0 && parsed <= 1) return parsed;
|
|
52
|
+
}
|
|
53
|
+
return 0.88;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
 * Maximum candidates to compare against during store-time dedup.
 * Hard cap (not env-configurable, unlike the thresholds above); bounds the
 * per-store cosine-similarity work in findNearDuplicate callers.
 */
export const STORE_DEDUP_MAX_CANDIDATES = 200;
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Types
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/** A decrypted fact candidate from the vault, with metadata for ranking. */
export interface DecryptedCandidate {
  /** Stable identifier of the fact within the vault. */
  id: string;
  /** Decrypted fact text; length is the final tiebreak in pickRepresentative. */
  text: string;
  /** Embedding vector, or null when the fact was stored without one (such facts are skipped by dedup/clustering). */
  embedding: number[] | null;
  /** Importance score; higher wins in shouldSupersede (ties favor the new fact). */
  importance: number;
  /** Decay score; primary key when picking a cluster representative. */
  decayScore: number;
  /** Creation timestamp; more recent wins as a tiebreak after decayScore. (Units — ms vs s — not shown here; TODO confirm against caller.) */
  createdAt: number;
  /** Fact record/schema version. */
  version: number;
}

/** A match result from near-duplicate detection. */
export interface NearDuplicateMatch {
  /** The existing vault fact judged to be a near-duplicate of the new fact. */
  existingFact: DecryptedCandidate;
  /** Cosine similarity between the new fact's embedding and existingFact's. */
  similarity: number;
}

/** A cluster of near-duplicate facts for consolidation. */
export interface ConsolidationCluster {
  /** Best member of the cluster, chosen by pickRepresentative. */
  representative: DecryptedCandidate;
  /** Remaining members that duplicate the representative. */
  duplicates: DecryptedCandidate[];
}
|
|
85
|
+
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
// Store-time dedup
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Find the best near-duplicate match for a new fact among existing candidates.
|
|
92
|
+
*
|
|
93
|
+
* Compares the new fact's embedding against all candidates using cosine
|
|
94
|
+
* similarity. Returns the candidate with the highest similarity above the
|
|
95
|
+
* threshold, or null if no match is found.
|
|
96
|
+
*
|
|
97
|
+
* Candidates without embeddings are skipped (fail-safe).
|
|
98
|
+
*
|
|
99
|
+
* @param newFactEmbedding - Embedding vector for the new fact
|
|
100
|
+
* @param candidates - Existing facts to compare against
|
|
101
|
+
* @param threshold - Cosine similarity threshold (e.g. 0.85)
|
|
102
|
+
* @returns - Best match above threshold, or null
|
|
103
|
+
*/
|
|
104
|
+
export function findNearDuplicate(
|
|
105
|
+
newFactEmbedding: number[],
|
|
106
|
+
candidates: DecryptedCandidate[],
|
|
107
|
+
threshold: number,
|
|
108
|
+
): NearDuplicateMatch | null {
|
|
109
|
+
let bestMatch: NearDuplicateMatch | null = null;
|
|
110
|
+
|
|
111
|
+
for (const candidate of candidates) {
|
|
112
|
+
if (!candidate.embedding || candidate.embedding.length === 0) continue;
|
|
113
|
+
|
|
114
|
+
const similarity = cosineSimilarity(newFactEmbedding, candidate.embedding);
|
|
115
|
+
if (similarity >= threshold) {
|
|
116
|
+
if (!bestMatch || similarity > bestMatch.similarity) {
|
|
117
|
+
bestMatch = { existingFact: candidate, similarity };
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
return bestMatch;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
// Supersede logic
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Decide whether a new fact should supersede an existing near-duplicate.
|
|
131
|
+
*
|
|
132
|
+
* - Higher importance wins.
|
|
133
|
+
* - Equal importance: new fact supersedes (newer is preferred).
|
|
134
|
+
*
|
|
135
|
+
* @param newImportance - Importance score of the new fact
|
|
136
|
+
* @param existingFact - The existing near-duplicate candidate
|
|
137
|
+
* @returns - 'supersede' if new fact should replace, 'skip' otherwise
|
|
138
|
+
*/
|
|
139
|
+
export function shouldSupersede(
|
|
140
|
+
newImportance: number,
|
|
141
|
+
existingFact: DecryptedCandidate,
|
|
142
|
+
): 'supersede' | 'skip' {
|
|
143
|
+
if (newImportance >= existingFact.importance) return 'supersede';
|
|
144
|
+
return 'skip';
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
// Bulk consolidation
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Pick the best representative from a group of near-duplicate facts.
|
|
153
|
+
*
|
|
154
|
+
* Tiebreak order:
|
|
155
|
+
* 1. Highest decayScore
|
|
156
|
+
* 2. Most recent (highest createdAt)
|
|
157
|
+
* 3. Longest text
|
|
158
|
+
*/
|
|
159
|
+
function pickRepresentative(facts: DecryptedCandidate[]): DecryptedCandidate {
|
|
160
|
+
let best = facts[0];
|
|
161
|
+
for (let i = 1; i < facts.length; i++) {
|
|
162
|
+
const f = facts[i];
|
|
163
|
+
if (
|
|
164
|
+
f.decayScore > best.decayScore ||
|
|
165
|
+
(f.decayScore === best.decayScore && f.createdAt > best.createdAt) ||
|
|
166
|
+
(f.decayScore === best.decayScore && f.createdAt === best.createdAt && f.text.length > best.text.length)
|
|
167
|
+
) {
|
|
168
|
+
best = f;
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
return best;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* Cluster facts by semantic similarity using greedy single-pass clustering.
|
|
176
|
+
*
|
|
177
|
+
* For each fact (in order), assigns it to the first existing cluster whose
|
|
178
|
+
* representative has cosine similarity >= threshold. If no cluster matches,
|
|
179
|
+
* a new cluster is started.
|
|
180
|
+
*
|
|
181
|
+
* Only returns clusters that have duplicates (i.e. more than one member).
|
|
182
|
+
* Facts without embeddings are not clustered.
|
|
183
|
+
*
|
|
184
|
+
* @param facts - All facts to cluster
|
|
185
|
+
* @param threshold - Cosine similarity threshold (e.g. 0.88)
|
|
186
|
+
* @returns - Clusters with duplicates (representative + duplicates)
|
|
187
|
+
*/
|
|
188
|
+
export function clusterFacts(
|
|
189
|
+
facts: DecryptedCandidate[],
|
|
190
|
+
threshold: number,
|
|
191
|
+
): ConsolidationCluster[] {
|
|
192
|
+
const clusters: { members: DecryptedCandidate[] }[] = [];
|
|
193
|
+
|
|
194
|
+
for (const fact of facts) {
|
|
195
|
+
if (!fact.embedding || fact.embedding.length === 0) continue;
|
|
196
|
+
|
|
197
|
+
let assigned = false;
|
|
198
|
+
for (const cluster of clusters) {
|
|
199
|
+
// Compare against the first member's embedding (cluster seed)
|
|
200
|
+
const seed = cluster.members[0];
|
|
201
|
+
if (!seed.embedding) continue;
|
|
202
|
+
|
|
203
|
+
const similarity = cosineSimilarity(fact.embedding, seed.embedding);
|
|
204
|
+
if (similarity >= threshold) {
|
|
205
|
+
cluster.members.push(fact);
|
|
206
|
+
assigned = true;
|
|
207
|
+
break;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
if (!assigned) {
|
|
212
|
+
clusters.push({ members: [fact] });
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// Only return clusters with duplicates, pick representative for each
|
|
217
|
+
const result: ConsolidationCluster[] = [];
|
|
218
|
+
for (const cluster of clusters) {
|
|
219
|
+
if (cluster.members.length < 2) continue;
|
|
220
|
+
|
|
221
|
+
const representative = pickRepresentative(cluster.members);
|
|
222
|
+
const duplicates = cluster.members.filter((m) => m !== representative);
|
|
223
|
+
result.push({ representative, duplicates });
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
return result;
|
|
227
|
+
}
|