@totalreclaw/totalreclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,392 @@
1
+ /**
2
+ * Unit tests for semantic near-duplicate detection (T330).
3
+ *
4
+ * Run with: npx tsx semantic-dedup.test.ts
5
+ *
6
+ * Uses TAP-style output (no test framework dependency).
7
+ */
8
+
9
+ import { deduplicateBatch, getSemanticDedupThreshold } from './semantic-dedup.js';
10
+ import { cosineSimilarity } from './reranker.js';
11
+ import type { ExtractedFact } from './extractor.js';
12
+
13
+ let passed = 0;
14
+ let failed = 0;
15
+ let testNum = 0;
16
+
17
/**
 * Record one TAP test point.
 *
 * Bumps the running test number, prints an `ok` / `not ok` line for it, and
 * updates the pass/fail tallies that the summary at the end of the file reads.
 *
 * @param condition - Outcome of the check; `true` counts as a pass.
 * @param message - Description printed after the test number.
 */
function assert(condition: boolean, message: string): void {
  testNum += 1;
  const label = `${testNum} - ${message}`;

  if (!condition) {
    failed += 1;
    console.log(`not ok ${label}`);
    return;
  }

  passed += 1;
  console.log(`ok ${label}`);
}
27
+
28
/**
 * Assert that two numbers differ by strictly less than `epsilon`.
 *
 * The expected value, the actual value and their absolute difference are
 * appended to the message so a failing line is self-explanatory.
 *
 * @param actual - Value produced by the code under test.
 * @param expected - Value it should be close to.
 * @param epsilon - Exclusive upper bound on |actual - expected|.
 * @param message - Description of the check.
 */
function assertClose(actual: number, expected: number, epsilon: number, message: string): void {
  const diff = Math.abs(actual - expected);
  const withinTolerance = diff < epsilon;
  const detail = `(expected ~${expected}, got ${actual}, diff=${diff})`;
  assert(withinTolerance, `${message} ${detail}`);
}
32
+
33
/**
 * Build a logger that records calls instead of printing them.
 *
 * @returns An object with `logger` (exposing `info`, `warn` and `error`) and
 *   `logs`, the array every call is appended to as `{ level, args }`, in call
 *   order.
 */
function createMockLogger() {
  type LogEntry = { level: string; args: unknown[] };
  const logs: LogEntry[] = [];

  // One recorder per level; each call appends an entry and yields the value
  // returned by Array.prototype.push.
  const recorderFor =
    (level: string) =>
    (...args: unknown[]) =>
      logs.push({ level, args });

  const logger = {
    info: recorderFor('info'),
    warn: recorderFor('warn'),
    error: recorderFor('error'),
  };

  return { logger, logs };
}
45
+
46
/**
 * Test helper: assemble an ExtractedFact from its three fields.
 *
 * @param text - Fact text.
 * @param type - Fact type; defaults to `'fact'`.
 * @param importance - Importance value; defaults to 5.
 */
function makeFact(text: string, type: ExtractedFact['type'] = 'fact', importance = 5): ExtractedFact {
  const fact: ExtractedFact = { text, type, importance };
  return fact;
}
50
+
51
+ // ---------------------------------------------------------------------------
52
+ // getSemanticDedupThreshold tests
53
+ // ---------------------------------------------------------------------------
54
+
55
console.log('# getSemanticDedupThreshold');

{
  const ENV_KEY = 'TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD';

  // Each row: [env value (undefined = variable unset), expected threshold, TAP description].
  const cases: Array<[string | undefined, number, string]> = [
    // No env var set -> default.
    [undefined, 0.9, 'default threshold is 0.9'],
    // Valid custom value is honoured.
    ['0.85', 0.85, 'custom threshold 0.85 from env'],
    // Non-numeric value falls back to the default.
    ['not-a-number', 0.9, 'invalid env var falls back to 0.9'],
    // Values outside [0, 1] fall back to the default.
    ['1.5', 0.9, 'threshold > 1 falls back to 0.9'],
    ['-0.5', 0.9, 'negative threshold falls back to 0.9'],
  ];

  for (const [envValue, expected, description] of cases) {
    // Remember whatever the environment held so each case leaves it untouched.
    const previous = process.env[ENV_KEY];

    if (envValue === undefined) {
      delete process.env[ENV_KEY];
    } else {
      process.env[ENV_KEY] = envValue;
    }

    assertClose(getSemanticDedupThreshold(), expected, 1e-10, description);

    if (previous === undefined) {
      delete process.env[ENV_KEY];
    } else {
      process.env[ENV_KEY] = previous;
    }
  }
}
112
+
113
+ // ---------------------------------------------------------------------------
114
+ // deduplicateBatch tests
115
+ // ---------------------------------------------------------------------------
116
+
117
console.log('# deduplicateBatch');

// The cases below assume the default threshold unless they override it.
delete process.env.TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD;

{
  const THRESHOLD_VAR = 'TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD';

  /** Build facts with default type/importance from a list of texts. */
  const factsFrom = (texts: string[]): ExtractedFact[] => texts.map((t) => makeFact(t));

  /** Run `body` with the threshold env var set to `value`, then restore the prior state. */
  const underThreshold = (value: string, body: () => void): void => {
    const previous = process.env[THRESHOLD_VAR];
    process.env[THRESHOLD_VAR] = value;

    body();

    if (previous === undefined) {
      delete process.env[THRESHOLD_VAR];
    } else {
      process.env[THRESHOLD_VAR] = previous;
    }
  };

  // Empty batch -> empty result.
  {
    const { logger } = createMockLogger();
    const kept = deduplicateBatch([], new Map(), logger);
    assert(kept.length === 0, 'empty batch returns empty array');
  }

  // A lone fact has nothing to be compared against, so it is kept.
  {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([['I love hiking', [1, 0, 0]]]);
    const kept = deduplicateBatch(factsFrom(['I love hiking']), vectors, logger);
    assert(kept.length === 1, 'single fact is always kept');
    assert(kept[0].text === 'I love hiking', 'single fact text preserved');
  }

  // Different texts sharing one embedding (cosine = 1.0): the second is dropped and logged.
  {
    const { logger, logs } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['I love hiking', [1, 0, 0]],
      ['I really enjoy hiking in the mountains', [1, 0, 0]],
    ]);
    const kept = deduplicateBatch(
      factsFrom(['I love hiking', 'I really enjoy hiking in the mountains']),
      vectors,
      logger,
    );
    assert(kept.length === 1, 'identical embeddings: only one fact kept');
    assert(kept[0].text === 'I love hiking', 'first fact is the one kept');

    const skipMessages = logs.filter(
      (entry) => entry.level === 'info' && String(entry.args[0]).includes('Semantic dedup: skipping'),
    );
    assert(skipMessages.length === 1, 'dedup event was logged');
  }

  // Orthogonal embeddings (cosine = 0.0): both facts survive.
  {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['I love hiking', [1, 0, 0]],
      ['I work at Google', [0, 1, 0]],
    ]);
    const kept = deduplicateBatch(factsFrom(['I love hiking', 'I work at Google']), vectors, logger);
    assert(kept.length === 2, 'dissimilar facts: both kept');
  }

  // Either side of the default 0.9 threshold: ~0.89 keeps both, ~0.91 drops the second.
  {
    // Against the unit vector [1, 0], the unit vector [c, sqrt(1 - c^2)] has cosine c.
    const reference = [1, 0];
    const boundaryCases = [
      {
        cosine: 0.89,
        texts: ['fact A', 'fact B'],
        expectedKept: 2,
        vectorLabel: 'test vector cosine ~0.89',
        resultLabel: 'cosine ~0.89 (below 0.9 threshold): both facts kept',
      },
      {
        cosine: 0.91,
        texts: ['fact C', 'fact D'],
        expectedKept: 1,
        vectorLabel: 'test vector cosine ~0.91',
        resultLabel: 'cosine ~0.91 (above 0.9 threshold): second fact removed',
      },
    ];

    for (const { cosine, texts, expectedKept, vectorLabel, resultLabel } of boundaryCases) {
      const { logger } = createMockLogger();
      const other = [cosine, Math.sqrt(1 - cosine * cosine)];

      // Sanity-check the constructed vectors before relying on them.
      assertClose(cosineSimilarity(reference, other), cosine, 0.01, vectorLabel);

      const vectors = new Map<string, number[]>([
        [texts[0], reference],
        [texts[1], other],
      ]);
      const kept = deduplicateBatch(factsFrom(texts), vectors, logger);
      assert(kept.length === expectedKept, resultLabel);
    }
  }

  // A fact with no embedding is kept (fail-open) even when its neighbours are deduplicated.
  {
    const { logger } = createMockLogger();
    // Only the first two facts have vectors; cosine([1,0,0], [0.99,0.1,0]) ~ 0.995 -> duplicate.
    const vectors = new Map<string, number[]>([
      ['I love hiking', [1, 0, 0]],
      ['I also love hiking a lot', [0.99, 0.1, 0]],
    ]);
    const kept = deduplicateBatch(
      factsFrom(['I love hiking', 'I also love hiking a lot', 'No embedding for me']),
      vectors,
      logger,
    );
    assert(kept.length === 2, 'fact without embedding is kept + first fact kept (near-dup removed)');
    assert(kept[0].text === 'I love hiking', 'first fact kept');
    assert(kept[1].text === 'No embedding for me', 'fact without embedding kept');
  }

  // Several clusters in one batch: the first member of each cluster survives.
  {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['I love hiking', [1, 0, 0, 0]],
      ['I enjoy hiking', [0.98, 0.1, 0, 0]], // cosine ~ 0.995 with the hiking fact
      ['I work at Google', [0, 1, 0, 0]],
      ['I am employed at Google', [0, 0.97, 0.1, 0]], // cosine ~ 0.995 with the Google fact
      ['I have a cat', [0, 0, 0, 1]],
    ]);
    const kept = deduplicateBatch(
      factsFrom([
        'I love hiking',
        'I enjoy hiking',
        'I work at Google',
        'I am employed at Google',
        'I have a cat',
      ]),
      vectors,
      logger,
    );
    assert(kept.length === 3, 'three clusters produce three facts');
    assert(kept[0].text === 'I love hiking', 'hiking cluster: first fact kept');
    assert(kept[1].text === 'I work at Google', 'Google cluster: first fact kept');
    assert(kept[2].text === 'I have a cat', 'cat fact kept (unique)');
  }

  // Threshold lowered to 0.5 through the env var: cosine 0.6 now counts as a duplicate.
  underThreshold('0.5', () => {
    const { logger } = createMockLogger();
    // cosine([1, 0], [0.6, 0.8]) = 0.6
    const vectors = new Map<string, number[]>([
      ['fact X', [1, 0]],
      ['fact Y', [0.6, 0.8]],
    ]);
    const kept = deduplicateBatch(factsFrom(['fact X', 'fact Y']), vectors, logger);
    assert(kept.length === 1, 'custom threshold 0.5: cosine 0.6 triggers dedup');
  });

  // Threshold 0: any pair with cosine >= 0 is treated as a duplicate.
  underThreshold('0', () => {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['fact A', [1, 0]],
      ['fact B', [0.01, 0.99]], // cosine ~ 0.01 (tiny but > 0)
    ]);
    const kept = deduplicateBatch(factsFrom(['fact A', 'fact B']), vectors, logger);
    assert(kept.length === 1, 'threshold 0: even tiny similarity triggers dedup');
  });

  // Threshold 1.0: only embeddings with cosine exactly 1.0 are removed.
  underThreshold('1.0', () => {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['fact A', [1, 0, 0]],
      ['fact B', [0.999, 0.04, 0]], // cosine ~ 0.999: very close, but below 1.0
      ['fact C', [1, 0, 0]], // same embedding as fact A
    ]);
    const kept = deduplicateBatch(factsFrom(['fact A', 'fact B', 'fact C']), vectors, logger);
    assert(kept.length === 2, 'threshold 1.0: only exact cosine=1.0 removed');
    assert(kept[0].text === 'fact A', 'fact A kept');
    assert(kept[1].text === 'fact B', 'fact B kept (cosine < 1.0)');
  });

  // No embeddings at all: nothing can be compared, so every fact is kept (fail-open).
  {
    const { logger } = createMockLogger();
    const kept = deduplicateBatch(factsFrom(['fact 1', 'fact 2', 'fact 3']), new Map(), logger);
    assert(kept.length === 3, 'empty embedding map: all facts kept');
  }

  // Survivors come back in their original input order.
  {
    const { logger } = createMockLogger();
    const vectors = new Map<string, number[]>([
      ['unique 1', [1, 0, 0]],
      ['duplicate of 1', [1, 0, 0]], // same embedding as unique 1
      ['unique 2', [0, 1, 0]],
      ['duplicate of 2', [0, 1, 0]], // same embedding as unique 2
      ['unique 3', [0, 0, 1]],
    ]);
    const kept = deduplicateBatch(
      factsFrom(['unique 1', 'duplicate of 1', 'unique 2', 'duplicate of 2', 'unique 3']),
      vectors,
      logger,
    );
    assert(kept.length === 3, 'order preservation: 3 unique facts kept');
    assert(kept[0].text === 'unique 1', 'order: first is unique 1');
    assert(kept[1].text === 'unique 2', 'order: second is unique 2');
    assert(kept[2].text === 'unique 3', 'order: third is unique 3');
  }
}
377
+
378
+ // ---------------------------------------------------------------------------
379
+ // Summary
380
+ // ---------------------------------------------------------------------------
381
+
382
// TAP plan line followed by the pass/fail tallies.
console.log(`\n1..${testNum}`);
console.log(`# pass: ${passed}`);
console.log(`# fail: ${failed}`);

{
  // Exit status mirrors the outcome: 1 if any assertion failed, 0 otherwise.
  const anyFailed = failed > 0;
  console.log(anyFailed ? '\nFAILED' : '\nALL TESTS PASSED');
  process.exit(anyFailed ? 1 : 0);
}
@@ -0,0 +1,100 @@
1
+ /**
2
+ * TotalReclaw Plugin - Semantic Near-Duplicate Detection (T330)
3
+ *
4
+ * Provides batch-level deduplication of extracted facts using cosine
5
+ * similarity on their embeddings. Facts within the same extraction batch
6
+ * that are semantically near-duplicates (cosine >= threshold) are reduced
7
+ * to keep only the first occurrence.
8
+ *
9
+ * This module intentionally has minimal dependencies (only reranker for
10
+ * cosineSimilarity and extractor for the ExtractedFact type) so it can
11
+ * be tested without pulling in the full plugin dependency graph.
12
+ */
13
+
14
+ import { cosineSimilarity } from './reranker.js';
15
+ import type { ExtractedFact } from './extractor.js';
16
+
17
+ // ---------------------------------------------------------------------------
18
+ // Configuration
19
+ // ---------------------------------------------------------------------------
20
+
21
/**
 * Get the cosine similarity threshold for semantic dedup.
 *
 * Read from the TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD env var on every call.
 * The value (surrounding whitespace ignored) must be a complete decimal
 * number in [0, 1]; anything else, or an unset variable, yields the default
 * of 0.9.
 *
 * The whole string is validated rather than just a numeric prefix, so
 * malformed settings such as "0,85", "0x1" or "0.5abc" fall back to the
 * default instead of being silently read as 0, 0 and 0.5.
 *
 * @returns The configured threshold, or 0.9 when the setting is absent or invalid.
 */
export function getSemanticDedupThreshold(): number {
  const DEFAULT_THRESHOLD = 0.9;
  // Optional sign, digits with an optional fraction (or a bare fraction),
  // optional exponent. Anchored at both ends so trailing junk is rejected.
  const DECIMAL_LITERAL = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/;

  const raw = process.env.TOTALRECLAW_SEMANTIC_DEDUP_THRESHOLD?.trim();
  if (raw === undefined || !DECIMAL_LITERAL.test(raw)) {
    return DEFAULT_THRESHOLD;
  }

  const parsed = Number(raw);
  // Overflowing exponents convert to Infinity, which this range check rejects.
  return parsed >= 0 && parsed <= 1 ? parsed : DEFAULT_THRESHOLD;
}
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Logger interface (minimal, matches OpenClawPluginApi['logger'])
38
+ // ---------------------------------------------------------------------------
39
+
40
/**
 * Minimal logging surface used by this module: three variadic, level-named
 * methods whose return values are ignored.
 */
interface Logger {
  /** Report an error-level message. */
  error(...args: unknown[]): void;
  /** Report a warning-level message. */
  warn(...args: unknown[]): void;
  /** Report an informational message. */
  info(...args: unknown[]): void;
}
45
+
46
+ // ---------------------------------------------------------------------------
47
+ // Batch deduplication
48
+ // ---------------------------------------------------------------------------
49
+
50
/**
 * Remove semantic near-duplicates from a batch of extracted facts.
 *
 * Facts are visited in input order. A fact whose text has no entry in
 * `embeddings` is kept unconditionally (fail-open). Otherwise its embedding
 * is compared with the embedding of each fact already kept (kept facts that
 * have no embedding are skipped); at the first one whose cosine similarity is
 * >= the configured threshold, the fact is dropped and an info message is
 * logged. The threshold is read once per call.
 *
 * @param facts - Array of extracted facts to deduplicate
 * @param embeddings - Map from fact text to its embedding vector
 * @param logger - Logger for reporting skipped duplicates
 * @returns - Deduplicated array (subset of input, preserving order)
 */
export function deduplicateBatch(
  facts: ExtractedFact[],
  embeddings: Map<string, number[]>,
  logger: Logger,
): ExtractedFact[] {
  const threshold = getSemanticDedupThreshold();
  const survivors: ExtractedFact[] = [];

  nextFact: for (const candidate of facts) {
    const candidateVec = embeddings.get(candidate.text);

    // Only facts that have an embedding can be judged; the rest fall through and are kept.
    if (candidateVec) {
      for (const survivor of survivors) {
        const survivorVec = embeddings.get(survivor.text);
        if (!survivorVec) continue;

        const score = cosineSimilarity(candidateVec, survivorVec);
        if (score >= threshold) {
          logger.info(
            `Semantic dedup: skipping "${candidate.text}" (cosine=${score.toFixed(3)} >= ${threshold} with "${survivor.text}")`,
          );
          // First match decides: drop the candidate and move on to the next fact.
          continue nextFact;
        }
      }
    }

    survivors.push(candidate);
  }

  return survivors;
}