@totalreclaw/totalreclaw 1.0.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ /**
2
+ * Unit tests for memory consolidation & near-duplicate detection.
3
+ *
4
+ * Run with: npx tsx consolidation.test.ts
5
+ *
6
+ * Uses TAP-style output (no test framework dependency).
7
+ */
8
+
9
+ import {
10
+ findNearDuplicate,
11
+ shouldSupersede,
12
+ clusterFacts,
13
+ getStoreDedupThreshold,
14
+ getConsolidationThreshold,
15
+ STORE_DEDUP_MAX_CANDIDATES,
16
+ } from './consolidation.js';
17
+ import type { DecryptedCandidate } from './consolidation.js';
18
+
19
+ let passed = 0;
20
+ let failed = 0;
21
+ let testNum = 0;
22
+
23
+ function assert(condition: boolean, message: string): void {
24
+ testNum++;
25
+ if (condition) {
26
+ passed++;
27
+ console.log(`ok ${testNum} - ${message}`);
28
+ } else {
29
+ failed++;
30
+ console.log(`not ok ${testNum} - ${message}`);
31
+ }
32
+ }
33
+
34
+ function assertClose(actual: number, expected: number, epsilon: number, message: string): void {
35
+ const diff = Math.abs(actual - expected);
36
+ assert(diff < epsilon, `${message} (expected ~${expected}, got ${actual}, diff=${diff})`);
37
+ }
38
+
39
+ // Helper: create a DecryptedCandidate
40
+ function makeCandidate(
41
+ overrides: Partial<DecryptedCandidate> & { id: string },
42
+ ): DecryptedCandidate {
43
+ return {
44
+ text: `fact ${overrides.id}`,
45
+ embedding: null,
46
+ importance: 5,
47
+ decayScore: 1.0,
48
+ createdAt: 1000,
49
+ version: 1,
50
+ ...overrides,
51
+ };
52
+ }
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // getStoreDedupThreshold tests
56
+ // ---------------------------------------------------------------------------
57
+
58
+ console.log('# getStoreDedupThreshold');
59
+
60
+ {
61
+ // Default threshold should be 0.85 (no env var set)
62
+ const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
63
+ delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
64
+ assertClose(getStoreDedupThreshold(), 0.85, 1e-10, 'default threshold is 0.85');
65
+ if (orig !== undefined) process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
66
+ }
67
+
68
+ {
69
+ // Custom threshold via env var
70
+ const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
71
+ process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = '0.75';
72
+ assertClose(getStoreDedupThreshold(), 0.75, 1e-10, 'custom threshold 0.75 from env');
73
+ if (orig !== undefined) {
74
+ process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
75
+ } else {
76
+ delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
77
+ }
78
+ }
79
+
80
+ {
81
+ // Invalid env var falls back to default
82
+ const orig = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
83
+ process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = 'not-a-number';
84
+ assertClose(getStoreDedupThreshold(), 0.85, 1e-10, 'invalid env var falls back to 0.85');
85
+ if (orig !== undefined) {
86
+ process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD = orig;
87
+ } else {
88
+ delete process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
89
+ }
90
+ }
91
+
92
+ // ---------------------------------------------------------------------------
93
+ // getConsolidationThreshold tests
94
+ // ---------------------------------------------------------------------------
95
+
96
+ console.log('# getConsolidationThreshold');
97
+
98
+ {
99
+ // Default threshold should be 0.88 (no env var set)
100
+ const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
101
+ delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
102
+ assertClose(getConsolidationThreshold(), 0.88, 1e-10, 'default threshold is 0.88');
103
+ if (orig !== undefined) process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
104
+ }
105
+
106
+ {
107
+ // Custom threshold via env var
108
+ const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
109
+ process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = '0.95';
110
+ assertClose(getConsolidationThreshold(), 0.95, 1e-10, 'custom threshold 0.95 from env');
111
+ if (orig !== undefined) {
112
+ process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
113
+ } else {
114
+ delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
115
+ }
116
+ }
117
+
118
+ {
119
+ // Invalid env var falls back to default
120
+ const orig = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
121
+ process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = 'garbage';
122
+ assertClose(getConsolidationThreshold(), 0.88, 1e-10, 'invalid env var falls back to 0.88');
123
+ if (orig !== undefined) {
124
+ process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD = orig;
125
+ } else {
126
+ delete process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
127
+ }
128
+ }
129
+
130
+ // ---------------------------------------------------------------------------
131
+ // STORE_DEDUP_MAX_CANDIDATES constant
132
+ // ---------------------------------------------------------------------------
133
+
134
+ console.log('# STORE_DEDUP_MAX_CANDIDATES');
135
+
136
+ assert(STORE_DEDUP_MAX_CANDIDATES === 200, 'STORE_DEDUP_MAX_CANDIDATES is 200');
137
+
138
+ // ---------------------------------------------------------------------------
139
+ // findNearDuplicate tests
140
+ // ---------------------------------------------------------------------------
141
+
142
+ console.log('# findNearDuplicate');
143
+
144
+ {
145
+ // Empty candidates -> null
146
+ const result = findNearDuplicate([1, 0, 0], [], 0.85);
147
+ assert(result === null, 'empty candidates returns null');
148
+ }
149
+
150
+ {
151
+ // No embeddings on candidates -> null
152
+ const candidates = [
153
+ makeCandidate({ id: 'a', embedding: null }),
154
+ makeCandidate({ id: 'b', embedding: null }),
155
+ ];
156
+ const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
157
+ assert(result === null, 'candidates without embeddings returns null');
158
+ }
159
+
160
+ {
161
+ // Below threshold -> null
162
+ const candidates = [
163
+ makeCandidate({ id: 'a', embedding: [0, 1, 0] }), // orthogonal, cosine = 0
164
+ ];
165
+ const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
166
+ assert(result === null, 'below threshold returns null');
167
+ }
168
+
169
+ {
170
+ // Above threshold -> returns match
171
+ const candidates = [
172
+ makeCandidate({ id: 'a', embedding: [1, 0, 0] }), // cosine = 1.0
173
+ ];
174
+ const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
175
+ assert(result !== null, 'above threshold returns match');
176
+ assert(result!.existingFact.id === 'a', 'match is the correct candidate');
177
+ assertClose(result!.similarity, 1.0, 1e-6, 'similarity is ~1.0');
178
+ }
179
+
180
+ {
181
+ // Multiple matches -> returns highest similarity
182
+ const candidates = [
183
+ makeCandidate({ id: 'low', embedding: [0.86, Math.sqrt(1 - 0.86 * 0.86), 0] }), // cosine ~ 0.86
184
+ makeCandidate({ id: 'high', embedding: [0.99, Math.sqrt(1 - 0.99 * 0.99), 0] }), // cosine ~ 0.99
185
+ makeCandidate({ id: 'mid', embedding: [0.90, Math.sqrt(1 - 0.90 * 0.90), 0] }), // cosine ~ 0.90
186
+ ];
187
+ const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
188
+ assert(result !== null, 'multiple matches: returns a match');
189
+ assert(result!.existingFact.id === 'high', 'multiple matches: returns highest similarity');
190
+ }
191
+
192
+ {
193
+ // Parallel vectors (cosine = 1.0) -> match
194
+ const candidates = [
195
+ makeCandidate({ id: 'parallel', embedding: [3, 6, 9] }), // parallel to [1, 2, 3]
196
+ ];
197
+ const result = findNearDuplicate([1, 2, 3], candidates, 0.85);
198
+ assert(result !== null, 'parallel vectors: returns match');
199
+ assertClose(result!.similarity, 1.0, 1e-6, 'parallel vectors: cosine is ~1.0');
200
+ }
201
+
202
+ {
203
+ // Orthogonal vectors (cosine = 0) -> null
204
+ const candidates = [
205
+ makeCandidate({ id: 'ortho', embedding: [0, 1, 0] }),
206
+ ];
207
+ const result = findNearDuplicate([1, 0, 0], candidates, 0.85);
208
+ assert(result === null, 'orthogonal vectors: returns null');
209
+ }
210
+
211
+ // ---------------------------------------------------------------------------
212
+ // shouldSupersede tests
213
+ // ---------------------------------------------------------------------------
214
+
215
+ console.log('# shouldSupersede');
216
+
217
+ {
218
+ // Higher new importance -> supersede
219
+ const existing = makeCandidate({ id: 'old', importance: 5 });
220
+ const result = shouldSupersede(8, existing);
221
+ assert(result === 'supersede', 'higher new importance -> supersede');
222
+ }
223
+
224
+ {
225
+ // Lower new importance -> skip
226
+ const existing = makeCandidate({ id: 'old', importance: 8 });
227
+ const result = shouldSupersede(3, existing);
228
+ assert(result === 'skip', 'lower new importance -> skip');
229
+ }
230
+
231
+ {
232
+ // Equal importance -> supersede (newer wins)
233
+ const existing = makeCandidate({ id: 'old', importance: 5 });
234
+ const result = shouldSupersede(5, existing);
235
+ assert(result === 'supersede', 'equal importance -> supersede (newer wins)');
236
+ }
237
+
238
+ // ---------------------------------------------------------------------------
239
+ // clusterFacts tests
240
+ // ---------------------------------------------------------------------------
241
+
242
+ console.log('# clusterFacts');
243
+
244
+ {
245
+ // Empty facts -> empty clusters
246
+ const clusters = clusterFacts([], 0.88);
247
+ assert(clusters.length === 0, 'empty facts -> no clusters');
248
+ }
249
+
250
+ {
251
+ // Single fact -> no clusters (needs at least 2 to form a cluster)
252
+ const facts = [
253
+ makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
254
+ ];
255
+ const clusters = clusterFacts(facts, 0.88);
256
+ assert(clusters.length === 0, 'single fact -> no clusters');
257
+ }
258
+
259
+ {
260
+ // Two identical embeddings -> one cluster
261
+ const facts = [
262
+ makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
263
+ makeCandidate({ id: 'b', embedding: [1, 0, 0] }),
264
+ ];
265
+ const clusters = clusterFacts(facts, 0.88);
266
+ assert(clusters.length === 1, 'two identical -> one cluster');
267
+ assert(clusters[0].duplicates.length === 1, 'two identical -> one duplicate');
268
+ }
269
+
270
+ {
271
+ // Two dissimilar embeddings -> no clusters
272
+ const facts = [
273
+ makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
274
+ makeCandidate({ id: 'b', embedding: [0, 1, 0] }), // orthogonal
275
+ ];
276
+ const clusters = clusterFacts(facts, 0.88);
277
+ assert(clusters.length === 0, 'two dissimilar -> no clusters');
278
+ }
279
+
280
+ {
281
+ // Multiple clusters: two groups of duplicates + one unique
282
+ const facts = [
283
+ makeCandidate({ id: 'a1', embedding: [1, 0, 0] }),
284
+ makeCandidate({ id: 'a2', embedding: [1, 0, 0] }), // dup of a1
285
+ makeCandidate({ id: 'b1', embedding: [0, 1, 0] }),
286
+ makeCandidate({ id: 'b2', embedding: [0, 1, 0] }), // dup of b1
287
+ makeCandidate({ id: 'c1', embedding: [0, 0, 1] }), // unique
288
+ ];
289
+ const clusters = clusterFacts(facts, 0.88);
290
+ assert(clusters.length === 2, 'multiple clusters: two groups found');
291
+ }
292
+
293
+ {
294
+ // Facts without embeddings are not clustered
295
+ const facts = [
296
+ makeCandidate({ id: 'a', embedding: [1, 0, 0] }),
297
+ makeCandidate({ id: 'b', embedding: null }), // no embedding
298
+ makeCandidate({ id: 'c', embedding: [1, 0, 0] }), // dup of a
299
+ ];
300
+ const clusters = clusterFacts(facts, 0.88);
301
+ assert(clusters.length === 1, 'no-embedding facts skipped, one cluster of a+c');
302
+ // b should not appear in any cluster
303
+ const allIds = clusters.flatMap(c => [c.representative.id, ...c.duplicates.map(d => d.id)]);
304
+ assert(!allIds.includes('b'), 'no-embedding fact not in any cluster');
305
+ }
306
+
307
+ {
308
+ // Representative = highest importance (via decayScore tiebreak)
309
+ const facts = [
310
+ makeCandidate({ id: 'low', embedding: [1, 0, 0], decayScore: 0.5, importance: 3 }),
311
+ makeCandidate({ id: 'high', embedding: [1, 0, 0], decayScore: 0.9, importance: 8 }),
312
+ makeCandidate({ id: 'mid', embedding: [1, 0, 0], decayScore: 0.7, importance: 5 }),
313
+ ];
314
+ const clusters = clusterFacts(facts, 0.88);
315
+ assert(clusters.length === 1, 'representative test: one cluster');
316
+ assert(clusters[0].representative.id === 'high', 'representative = highest decayScore');
317
+ assert(clusters[0].duplicates.length === 2, 'two duplicates');
318
+ }
319
+
320
+ {
321
+ // Tiebreak: same decayScore -> most recent (highest createdAt)
322
+ const facts = [
323
+ makeCandidate({ id: 'old', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
324
+ makeCandidate({ id: 'new', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 2000 }),
325
+ ];
326
+ const clusters = clusterFacts(facts, 0.88);
327
+ assert(clusters.length === 1, 'tiebreak test: one cluster');
328
+ assert(clusters[0].representative.id === 'new', 'tiebreak: most recent is representative');
329
+ }
330
+
331
+ {
332
+ // Tiebreak: same decayScore + createdAt -> longest text
333
+ const facts = [
334
+ makeCandidate({ id: 'short', text: 'abc', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
335
+ makeCandidate({ id: 'long', text: 'abcdefghij', embedding: [1, 0, 0], decayScore: 1.0, createdAt: 1000 }),
336
+ ];
337
+ const clusters = clusterFacts(facts, 0.88);
338
+ assert(clusters.length === 1, 'tiebreak longest text: one cluster');
339
+ assert(clusters[0].representative.id === 'long', 'tiebreak: longest text is representative');
340
+ }
341
+
342
+ // ---------------------------------------------------------------------------
343
+ // Summary
344
+ // ---------------------------------------------------------------------------
345
+
346
+ console.log(`\n1..${testNum}`);
347
+ console.log(`# pass: ${passed}`);
348
+ console.log(`# fail: ${failed}`);
349
+
350
+ if (failed > 0) {
351
+ console.log('\nFAILED');
352
+ process.exit(1);
353
+ } else {
354
+ console.log('\nALL TESTS PASSED');
355
+ process.exit(0);
356
+ }
@@ -0,0 +1,227 @@
1
+ /**
2
+ * TotalReclaw Plugin - Memory Consolidation & Near-Duplicate Detection
3
+ *
4
+ * Provides cross-session / cross-vault deduplication of stored facts using
5
+ * cosine similarity on their embeddings. Unlike semantic-dedup.ts (which
6
+ * handles within-batch dedup at threshold 0.9), this module handles:
7
+ *
8
+ * 1. Store-time dedup — before writing a new fact, check whether a
9
+ * near-duplicate already exists in the vault (findNearDuplicate).
10
+ * 2. Supersede logic — when a near-duplicate is found, decide whether
11
+ * the new fact should replace or be skipped (shouldSupersede).
12
+ * 3. Bulk consolidation — cluster all facts in the vault and identify
13
+ * groups of near-duplicates for cleanup (clusterFacts).
14
+ *
15
+ * This module intentionally has minimal dependencies (only reranker for
16
+ * cosineSimilarity) so it can be tested without pulling in the full
17
+ * plugin dependency graph.
18
+ */
19
+
20
+ import { cosineSimilarity } from './reranker.js';
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Configuration
24
+ // ---------------------------------------------------------------------------
25
+
26
+ /**
27
+ * Get the cosine similarity threshold for store-time dedup.
28
+ *
29
+ * Configurable via TOTALRECLAW_STORE_DEDUP_THRESHOLD env var.
30
+ * Must be a number in [0, 1]. Falls back to 0.85 if invalid or unset.
31
+ */
32
+ export function getStoreDedupThreshold(): number {
33
+ const envVal = process.env.TOTALRECLAW_STORE_DEDUP_THRESHOLD;
34
+ if (envVal !== undefined) {
35
+ const parsed = parseFloat(envVal);
36
+ if (!isNaN(parsed) && parsed >= 0 && parsed <= 1) return parsed;
37
+ }
38
+ return 0.85;
39
+ }
40
+
41
+ /**
42
+ * Get the cosine similarity threshold for bulk consolidation clustering.
43
+ *
44
+ * Configurable via TOTALRECLAW_CONSOLIDATION_THRESHOLD env var.
45
+ * Must be a number in [0, 1]. Falls back to 0.88 if invalid or unset.
46
+ */
47
+ export function getConsolidationThreshold(): number {
48
+ const envVal = process.env.TOTALRECLAW_CONSOLIDATION_THRESHOLD;
49
+ if (envVal !== undefined) {
50
+ const parsed = parseFloat(envVal);
51
+ if (!isNaN(parsed) && parsed >= 0 && parsed <= 1) return parsed;
52
+ }
53
+ return 0.88;
54
+ }
55
+
56
+ /** Maximum candidates to compare against during store-time dedup. */
57
+ export const STORE_DEDUP_MAX_CANDIDATES = 200;
58
+
59
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** A decrypted fact candidate from the vault, with metadata for ranking. */
export interface DecryptedCandidate {
  /** Unique identifier of the stored fact. */
  id: string;
  /** Plaintext content of the fact. */
  text: string;
  /** Embedding vector, or null when none exists (such facts are skipped by dedup/clustering). */
  embedding: number[] | null;
  /** Importance score; compared by shouldSupersede to decide replacement. */
  importance: number;
  /** Decay/recency score; primary ranking key when picking a cluster representative. */
  decayScore: number;
  /** Creation time (numeric; larger = newer) — tiebreak when decayScores are equal. */
  createdAt: number;
  /** Fact version counter. */
  version: number;
}

/** A match result from near-duplicate detection. */
export interface NearDuplicateMatch {
  /** The existing vault fact that matched. */
  existingFact: DecryptedCandidate;
  /** Cosine similarity between the new fact and existingFact (at or above the threshold). */
  similarity: number;
}

/** A cluster of near-duplicate facts for consolidation. */
export interface ConsolidationCluster {
  /** The member chosen to keep (highest decayScore, with recency/length tiebreaks). */
  representative: DecryptedCandidate;
  /** Remaining cluster members — candidates for cleanup. */
  duplicates: DecryptedCandidate[];
}
85
+
86
+ // ---------------------------------------------------------------------------
87
+ // Store-time dedup
88
+ // ---------------------------------------------------------------------------
89
+
90
+ /**
91
+ * Find the best near-duplicate match for a new fact among existing candidates.
92
+ *
93
+ * Compares the new fact's embedding against all candidates using cosine
94
+ * similarity. Returns the candidate with the highest similarity above the
95
+ * threshold, or null if no match is found.
96
+ *
97
+ * Candidates without embeddings are skipped (fail-safe).
98
+ *
99
+ * @param newFactEmbedding - Embedding vector for the new fact
100
+ * @param candidates - Existing facts to compare against
101
+ * @param threshold - Cosine similarity threshold (e.g. 0.85)
102
+ * @returns - Best match above threshold, or null
103
+ */
104
+ export function findNearDuplicate(
105
+ newFactEmbedding: number[],
106
+ candidates: DecryptedCandidate[],
107
+ threshold: number,
108
+ ): NearDuplicateMatch | null {
109
+ let bestMatch: NearDuplicateMatch | null = null;
110
+
111
+ for (const candidate of candidates) {
112
+ if (!candidate.embedding || candidate.embedding.length === 0) continue;
113
+
114
+ const similarity = cosineSimilarity(newFactEmbedding, candidate.embedding);
115
+ if (similarity >= threshold) {
116
+ if (!bestMatch || similarity > bestMatch.similarity) {
117
+ bestMatch = { existingFact: candidate, similarity };
118
+ }
119
+ }
120
+ }
121
+
122
+ return bestMatch;
123
+ }
124
+
125
+ // ---------------------------------------------------------------------------
126
+ // Supersede logic
127
+ // ---------------------------------------------------------------------------
128
+
129
+ /**
130
+ * Decide whether a new fact should supersede an existing near-duplicate.
131
+ *
132
+ * - Higher importance wins.
133
+ * - Equal importance: new fact supersedes (newer is preferred).
134
+ *
135
+ * @param newImportance - Importance score of the new fact
136
+ * @param existingFact - The existing near-duplicate candidate
137
+ * @returns - 'supersede' if new fact should replace, 'skip' otherwise
138
+ */
139
+ export function shouldSupersede(
140
+ newImportance: number,
141
+ existingFact: DecryptedCandidate,
142
+ ): 'supersede' | 'skip' {
143
+ if (newImportance >= existingFact.importance) return 'supersede';
144
+ return 'skip';
145
+ }
146
+
147
+ // ---------------------------------------------------------------------------
148
+ // Bulk consolidation
149
+ // ---------------------------------------------------------------------------
150
+
151
+ /**
152
+ * Pick the best representative from a group of near-duplicate facts.
153
+ *
154
+ * Tiebreak order:
155
+ * 1. Highest decayScore
156
+ * 2. Most recent (highest createdAt)
157
+ * 3. Longest text
158
+ */
159
+ function pickRepresentative(facts: DecryptedCandidate[]): DecryptedCandidate {
160
+ let best = facts[0];
161
+ for (let i = 1; i < facts.length; i++) {
162
+ const f = facts[i];
163
+ if (
164
+ f.decayScore > best.decayScore ||
165
+ (f.decayScore === best.decayScore && f.createdAt > best.createdAt) ||
166
+ (f.decayScore === best.decayScore && f.createdAt === best.createdAt && f.text.length > best.text.length)
167
+ ) {
168
+ best = f;
169
+ }
170
+ }
171
+ return best;
172
+ }
173
+
174
+ /**
175
+ * Cluster facts by semantic similarity using greedy single-pass clustering.
176
+ *
177
+ * For each fact (in order), assigns it to the first existing cluster whose
178
+ * representative has cosine similarity >= threshold. If no cluster matches,
179
+ * a new cluster is started.
180
+ *
181
+ * Only returns clusters that have duplicates (i.e. more than one member).
182
+ * Facts without embeddings are not clustered.
183
+ *
184
+ * @param facts - All facts to cluster
185
+ * @param threshold - Cosine similarity threshold (e.g. 0.88)
186
+ * @returns - Clusters with duplicates (representative + duplicates)
187
+ */
188
+ export function clusterFacts(
189
+ facts: DecryptedCandidate[],
190
+ threshold: number,
191
+ ): ConsolidationCluster[] {
192
+ const clusters: { members: DecryptedCandidate[] }[] = [];
193
+
194
+ for (const fact of facts) {
195
+ if (!fact.embedding || fact.embedding.length === 0) continue;
196
+
197
+ let assigned = false;
198
+ for (const cluster of clusters) {
199
+ // Compare against the first member's embedding (cluster seed)
200
+ const seed = cluster.members[0];
201
+ if (!seed.embedding) continue;
202
+
203
+ const similarity = cosineSimilarity(fact.embedding, seed.embedding);
204
+ if (similarity >= threshold) {
205
+ cluster.members.push(fact);
206
+ assigned = true;
207
+ break;
208
+ }
209
+ }
210
+
211
+ if (!assigned) {
212
+ clusters.push({ members: [fact] });
213
+ }
214
+ }
215
+
216
+ // Only return clusters with duplicates, pick representative for each
217
+ const result: ConsolidationCluster[] = [];
218
+ for (const cluster of clusters) {
219
+ if (cluster.members.length < 2) continue;
220
+
221
+ const representative = pickRepresentative(cluster.members);
222
+ const duplicates = cluster.members.filter((m) => m !== representative);
223
+ result.push({ representative, duplicates });
224
+ }
225
+
226
+ return result;
227
+ }