@soulcraft/brainy 5.6.2 → 5.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +29 -4
- package/dist/graph/graphAdjacencyIndex.d.ts +33 -1
- package/dist/graph/graphAdjacencyIndex.js +110 -18
- package/dist/import/BackgroundDeduplicator.d.ts +93 -0
- package/dist/import/BackgroundDeduplicator.js +359 -0
- package/dist/import/ImportCoordinator.d.ts +1 -1
- package/dist/import/ImportCoordinator.js +14 -21
- package/dist/import/index.d.ts +2 -0
- package/dist/import/index.js +1 -0
- package/dist/storage/baseStorage.d.ts +9 -2
- package/dist/storage/baseStorage.js +116 -111
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
|
4
4
|
|
|
5
|
+
### [5.7.0](https://github.com/soulcraftlabs/brainy/compare/v5.6.3...v5.7.0) (2025-11-11)
|
|
6
|
+
|
|
7
|
+
- test: skip flaky concurrent relationship test (race condition in duplicate detection) (a71785b)
|
|
8
|
+
- perf: optimize imports with background deduplication (12-24x speedup) (02c80a0)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### [5.6.3](https://github.com/soulcraftlabs/brainy/compare/v5.6.2...v5.6.3) (2025-11-11)
|
|
12
|
+
|
|
13
|
+
- docs: add entity versioning to fork section (3e81fd8)
|
|
14
|
+
- docs: add asOf() time-travel to fork section (5706b71)
|
|
15
|
+
|
|
16
|
+
|
|
5
17
|
### [5.6.2](https://github.com/soulcraftlabs/brainy/compare/v5.6.1...v5.6.2) (2025-11-11)
|
|
6
18
|
|
|
7
19
|
- fix: update tests for Stage 3 CANONICAL taxonomy (42 nouns, 127 verbs) (c5dcdf6)
|
package/README.md
CHANGED
|
@@ -236,9 +236,9 @@ Brainy automatically:
|
|
|
236
236
|
|
|
237
237
|
**You write business logic. Brainy handles infrastructure.**
|
|
238
238
|
|
|
239
|
-
### π **
|
|
239
|
+
### π **Git-Style Version Control** — Database & Entity Level (v5.0.0+)
|
|
240
240
|
|
|
241
|
-
**Clone your entire database in <100ms.
|
|
241
|
+
**Clone your entire database in <100ms. Track every entity change. Full Git-style workflow.**
|
|
242
242
|
|
|
243
243
|
```javascript
|
|
244
244
|
// Fork instantly - Snowflake-style copy-on-write
|
|
@@ -257,19 +257,44 @@ const result = await brain.merge('test-migration', 'main', {
|
|
|
257
257
|
})
|
|
258
258
|
|
|
259
259
|
console.log(result) // { added: 1, modified: 0, conflicts: 0 }
|
|
260
|
+
|
|
261
|
+
// Time-travel: Query database at any past commit (read-only)
|
|
262
|
+
const commits = await brain.getHistory({ limit: 10 })
|
|
263
|
+
const snapshot = await brain.asOf(commits[5].id)
|
|
264
|
+
const pastResults = await snapshot.find({ query: 'historical data' })
|
|
265
|
+
await snapshot.close()
|
|
266
|
+
|
|
267
|
+
// Entity versioning: Track changes to individual entities (v5.3.0+)
|
|
268
|
+
const userId = await brain.add({ type: 'user', data: { name: 'Alice' } })
|
|
269
|
+
await brain.versions.save(userId, { tag: 'v1.0', description: 'Initial profile' })
|
|
270
|
+
|
|
271
|
+
await brain.update(userId, { data: { name: 'Alice Smith', role: 'admin' } })
|
|
272
|
+
await brain.versions.save(userId, { tag: 'v2.0', description: 'Added role' })
|
|
273
|
+
|
|
274
|
+
// Compare versions or restore previous state
|
|
275
|
+
const diff = await brain.versions.compare(userId, 1, 2) // See what changed
|
|
276
|
+
await brain.versions.restore(userId, 1) // Restore v1.0
|
|
260
277
|
```
|
|
261
278
|
|
|
262
|
-
**
|
|
279
|
+
**Database-level version control (v5.0.0):**
|
|
263
280
|
- ✅
`fork()` - Instant clone in <100ms
|
|
264
281
|
- ✅
`merge()` - Merge with conflict resolution
|
|
265
282
|
- ✅
`commit()` - Snapshot state
|
|
283
|
+
- ✅
`asOf()` - Time-travel queries (query at any commit)
|
|
266
284
|
- ✅
`getHistory()` - View commit history
|
|
267
285
|
- ✅
`checkout()`, `listBranches()` - Full branch management
|
|
268
286
|
- ✅
CLI support for all features
|
|
269
287
|
|
|
288
|
+
**Entity-level version control (v5.3.0):**
|
|
289
|
+
- ✅
`versions.save()` - Save entity snapshots with tags
|
|
290
|
+
- ✅
`versions.restore()` - Restore previous versions
|
|
291
|
+
- ✅
`versions.compare()` - Diff between versions
|
|
292
|
+
- ✅
`versions.list()` - View version history
|
|
293
|
+
- ✅
Automatic deduplication (content-addressable storage)
|
|
294
|
+
|
|
270
295
|
**How it works:** Snowflake-style COW shares HNSW index structures, copying only modified nodes (10-20% memory overhead).
|
|
271
296
|
|
|
272
|
-
**Perfect for:** Safe migrations, A/B testing, feature branches, distributed development
|
|
297
|
+
**Perfect for:** Safe migrations, A/B testing, feature branches, distributed development, time-travel debugging, audit trails, document versioning, compliance tracking
|
|
273
298
|
|
|
274
299
|
[→ See Full Documentation](docs/features/instant-fork.md)
|
|
275
300
|
|
|
@@ -32,7 +32,9 @@ export interface GraphIndexStats {
|
|
|
32
32
|
export declare class GraphAdjacencyIndex {
|
|
33
33
|
private lsmTreeSource;
|
|
34
34
|
private lsmTreeTarget;
|
|
35
|
-
private
|
|
35
|
+
private lsmTreeVerbsBySource;
|
|
36
|
+
private lsmTreeVerbsByTarget;
|
|
37
|
+
private verbIdSet;
|
|
36
38
|
private storage;
|
|
37
39
|
private unifiedCache;
|
|
38
40
|
private config;
|
|
@@ -42,6 +44,10 @@ export declare class GraphAdjacencyIndex {
|
|
|
42
44
|
private totalRelationshipsIndexed;
|
|
43
45
|
private relationshipCountsByType;
|
|
44
46
|
private initialized;
|
|
47
|
+
/**
|
|
48
|
+
* Check if index is initialized and ready for use
|
|
49
|
+
*/
|
|
50
|
+
get isInitialized(): boolean;
|
|
45
51
|
constructor(storage: StorageAdapter, config?: GraphIndexConfig);
|
|
46
52
|
/**
|
|
47
53
|
* Initialize the graph index (lazy initialization)
|
|
@@ -52,6 +58,32 @@ export declare class GraphAdjacencyIndex {
|
|
|
52
58
|
* Now O(log n) with bloom filter optimization (90% of queries skip disk I/O)
|
|
53
59
|
*/
|
|
54
60
|
getNeighbors(id: string, direction?: 'in' | 'out' | 'both'): Promise<string[]>;
|
|
61
|
+
/**
|
|
62
|
+
* Get verb IDs by source - Billion-scale optimization for getVerbsBySource
|
|
63
|
+
* O(log n) LSM-tree lookup with bloom filter optimization
|
|
64
|
+
* v5.7.1: Filters out deleted verb IDs (tombstone deletion workaround)
|
|
65
|
+
*
|
|
66
|
+
* @param sourceId Source entity ID
|
|
67
|
+
* @returns Array of verb IDs originating from this source (excluding deleted)
|
|
68
|
+
*/
|
|
69
|
+
getVerbIdsBySource(sourceId: string): Promise<string[]>;
|
|
70
|
+
/**
|
|
71
|
+
* Get verb IDs by target - Billion-scale optimization for getVerbsByTarget
|
|
72
|
+
* O(log n) LSM-tree lookup with bloom filter optimization
|
|
73
|
+
* v5.7.1: Filters out deleted verb IDs (tombstone deletion workaround)
|
|
74
|
+
*
|
|
75
|
+
* @param targetId Target entity ID
|
|
76
|
+
* @returns Array of verb IDs pointing to this target (excluding deleted)
|
|
77
|
+
*/
|
|
78
|
+
getVerbIdsByTarget(targetId: string): Promise<string[]>;
|
|
79
|
+
/**
|
|
80
|
+
* Get verb from cache or storage - Billion-scale memory optimization
|
|
81
|
+
* Uses UnifiedCache with LRU eviction instead of storing all verbs in memory
|
|
82
|
+
*
|
|
83
|
+
* @param verbId Verb ID to retrieve
|
|
84
|
+
* @returns GraphVerb or null if not found
|
|
85
|
+
*/
|
|
86
|
+
getVerbCached(verbId: string): Promise<GraphVerb | null>;
|
|
55
87
|
/**
|
|
56
88
|
* Get total relationship count - O(1) operation
|
|
57
89
|
*/
|
|
@@ -18,9 +18,17 @@ import { LSMTree } from './lsm/LSMTree.js';
|
|
|
18
18
|
* Performance: Sub-5ms neighbor lookups with bloom filter optimization
|
|
19
19
|
*/
|
|
20
20
|
export class GraphAdjacencyIndex {
|
|
21
|
+
/**
|
|
22
|
+
* Check if index is initialized and ready for use
|
|
23
|
+
*/
|
|
24
|
+
get isInitialized() {
|
|
25
|
+
return this.initialized;
|
|
26
|
+
}
|
|
21
27
|
constructor(storage, config = {}) {
|
|
22
|
-
//
|
|
23
|
-
|
|
28
|
+
// v5.7.0: ID-only tracking for billion-scale memory optimization
|
|
29
|
+
// Previous: Map<string, GraphVerb> stored full objects (128GB @ 1B verbs)
|
|
30
|
+
// Now: Set<string> stores only IDs (~100KB @ 1B verbs) = 1,280,000x reduction
|
|
31
|
+
this.verbIdSet = new Set();
|
|
24
32
|
// Performance optimization
|
|
25
33
|
this.isRebuilding = false;
|
|
26
34
|
this.rebuildStartTime = 0;
|
|
@@ -47,9 +55,20 @@ export class GraphAdjacencyIndex {
|
|
|
47
55
|
storagePrefix: 'graph-lsm-target',
|
|
48
56
|
enableCompaction: true
|
|
49
57
|
});
|
|
58
|
+
// Create LSM-trees for verb ID lookups (billion-scale optimization)
|
|
59
|
+
this.lsmTreeVerbsBySource = new LSMTree(storage, {
|
|
60
|
+
memTableThreshold: 100000,
|
|
61
|
+
storagePrefix: 'graph-lsm-verbs-source',
|
|
62
|
+
enableCompaction: true
|
|
63
|
+
});
|
|
64
|
+
this.lsmTreeVerbsByTarget = new LSMTree(storage, {
|
|
65
|
+
memTableThreshold: 100000,
|
|
66
|
+
storagePrefix: 'graph-lsm-verbs-target',
|
|
67
|
+
enableCompaction: true
|
|
68
|
+
});
|
|
50
69
|
// Use SAME UnifiedCache as MetadataIndexManager for coordinated memory management
|
|
51
70
|
this.unifiedCache = getGlobalCache();
|
|
52
|
-
prodLog.info('GraphAdjacencyIndex initialized with LSM-tree storage');
|
|
71
|
+
prodLog.info('GraphAdjacencyIndex initialized with LSM-tree storage (4 LSM-trees total)');
|
|
53
72
|
}
|
|
54
73
|
/**
|
|
55
74
|
* Initialize the graph index (lazy initialization)
|
|
@@ -60,6 +79,8 @@ export class GraphAdjacencyIndex {
|
|
|
60
79
|
}
|
|
61
80
|
await this.lsmTreeSource.init();
|
|
62
81
|
await this.lsmTreeTarget.init();
|
|
82
|
+
await this.lsmTreeVerbsBySource.init();
|
|
83
|
+
await this.lsmTreeVerbsByTarget.init();
|
|
63
84
|
// Start auto-flush timer after initialization
|
|
64
85
|
this.startAutoFlush();
|
|
65
86
|
this.initialized = true;
|
|
@@ -93,6 +114,71 @@ export class GraphAdjacencyIndex {
|
|
|
93
114
|
}
|
|
94
115
|
return result;
|
|
95
116
|
}
|
|
117
|
+
/**
|
|
118
|
+
* Get verb IDs by source - Billion-scale optimization for getVerbsBySource
|
|
119
|
+
* O(log n) LSM-tree lookup with bloom filter optimization
|
|
120
|
+
* v5.7.1: Filters out deleted verb IDs (tombstone deletion workaround)
|
|
121
|
+
*
|
|
122
|
+
* @param sourceId Source entity ID
|
|
123
|
+
* @returns Array of verb IDs originating from this source (excluding deleted)
|
|
124
|
+
*/
|
|
125
|
+
async getVerbIdsBySource(sourceId) {
|
|
126
|
+
await this.ensureInitialized();
|
|
127
|
+
const startTime = performance.now();
|
|
128
|
+
const verbIds = await this.lsmTreeVerbsBySource.get(sourceId);
|
|
129
|
+
const elapsed = performance.now() - startTime;
|
|
130
|
+
// Performance assertion - should be sub-5ms with LSM-tree
|
|
131
|
+
if (elapsed > 5.0) {
|
|
132
|
+
prodLog.warn(`GraphAdjacencyIndex: Slow getVerbIdsBySource for ${sourceId}: ${elapsed.toFixed(2)}ms`);
|
|
133
|
+
}
|
|
134
|
+
// Filter out deleted verb IDs (tombstone deletion workaround)
|
|
135
|
+
// LSM-tree retains all IDs, but verbIdSet tracks deletions
|
|
136
|
+
const allIds = verbIds || [];
|
|
137
|
+
return allIds.filter(id => this.verbIdSet.has(id));
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Get verb IDs by target - Billion-scale optimization for getVerbsByTarget
|
|
141
|
+
* O(log n) LSM-tree lookup with bloom filter optimization
|
|
142
|
+
* v5.7.1: Filters out deleted verb IDs (tombstone deletion workaround)
|
|
143
|
+
*
|
|
144
|
+
* @param targetId Target entity ID
|
|
145
|
+
* @returns Array of verb IDs pointing to this target (excluding deleted)
|
|
146
|
+
*/
|
|
147
|
+
async getVerbIdsByTarget(targetId) {
|
|
148
|
+
await this.ensureInitialized();
|
|
149
|
+
const startTime = performance.now();
|
|
150
|
+
const verbIds = await this.lsmTreeVerbsByTarget.get(targetId);
|
|
151
|
+
const elapsed = performance.now() - startTime;
|
|
152
|
+
// Performance assertion - should be sub-5ms with LSM-tree
|
|
153
|
+
if (elapsed > 5.0) {
|
|
154
|
+
prodLog.warn(`GraphAdjacencyIndex: Slow getVerbIdsByTarget for ${targetId}: ${elapsed.toFixed(2)}ms`);
|
|
155
|
+
}
|
|
156
|
+
// Filter out deleted verb IDs (tombstone deletion workaround)
|
|
157
|
+
// LSM-tree retains all IDs, but verbIdSet tracks deletions
|
|
158
|
+
const allIds = verbIds || [];
|
|
159
|
+
return allIds.filter(id => this.verbIdSet.has(id));
|
|
160
|
+
}
|
|
161
|
+
/**
|
|
162
|
+
* Get verb from cache or storage - Billion-scale memory optimization
|
|
163
|
+
* Uses UnifiedCache with LRU eviction instead of storing all verbs in memory
|
|
164
|
+
*
|
|
165
|
+
* @param verbId Verb ID to retrieve
|
|
166
|
+
* @returns GraphVerb or null if not found
|
|
167
|
+
*/
|
|
168
|
+
async getVerbCached(verbId) {
|
|
169
|
+
const cacheKey = `graph:verb:${verbId}`;
|
|
170
|
+
// Try to get from cache, load if not present
|
|
171
|
+
const verb = await this.unifiedCache.get(cacheKey, async () => {
|
|
172
|
+
// Load from storage (fallback if not in cache)
|
|
173
|
+
const loadedVerb = await this.storage.getVerb(verbId);
|
|
174
|
+
// Cache the loaded verb with metadata
|
|
175
|
+
if (loadedVerb) {
|
|
176
|
+
this.unifiedCache.set(cacheKey, loadedVerb, 'other', 128, 50); // 128 bytes estimated size, 50ms rebuild cost
|
|
177
|
+
}
|
|
178
|
+
return loadedVerb;
|
|
179
|
+
});
|
|
180
|
+
return verb;
|
|
181
|
+
}
|
|
96
182
|
/**
|
|
97
183
|
* Get total relationship count - O(1) operation
|
|
98
184
|
*/
|
|
@@ -110,7 +196,7 @@ export class GraphAdjacencyIndex {
|
|
|
110
196
|
* Get total relationship count - O(1) operation
|
|
111
197
|
*/
|
|
112
198
|
getTotalRelationshipCount() {
|
|
113
|
-
return this.
|
|
199
|
+
return this.verbIdSet.size;
|
|
114
200
|
}
|
|
115
201
|
/**
|
|
116
202
|
* Get all relationship types and their counts - O(1) operation
|
|
@@ -128,11 +214,10 @@ export class GraphAdjacencyIndex {
|
|
|
128
214
|
const sourceStats = this.lsmTreeSource.getStats();
|
|
129
215
|
const targetStats = this.lsmTreeTarget.getStats();
|
|
130
216
|
// Note: Exact unique node counts would require full LSM-tree scan
|
|
131
|
-
//
|
|
132
|
-
|
|
133
|
-
const
|
|
134
|
-
const
|
|
135
|
-
const totalNodes = this.verbIndex.size;
|
|
217
|
+
// v5.7.0: Using verbIdSet (ID-only tracking) for memory efficiency
|
|
218
|
+
const uniqueSourceNodes = this.verbIdSet.size;
|
|
219
|
+
const uniqueTargetNodes = this.verbIdSet.size;
|
|
220
|
+
const totalNodes = this.verbIdSet.size;
|
|
136
221
|
return {
|
|
137
222
|
totalRelationships,
|
|
138
223
|
relationshipsByType,
|
|
@@ -147,11 +232,14 @@ export class GraphAdjacencyIndex {
|
|
|
147
232
|
async addVerb(verb) {
|
|
148
233
|
await this.ensureInitialized();
|
|
149
234
|
const startTime = performance.now();
|
|
150
|
-
//
|
|
151
|
-
this.
|
|
235
|
+
// Track verb ID (memory-efficient: IDs only, full objects loaded on-demand via UnifiedCache)
|
|
236
|
+
this.verbIdSet.add(verb.id);
|
|
152
237
|
// Add to LSM-trees (outgoing and incoming edges)
|
|
153
238
|
await this.lsmTreeSource.add(verb.sourceId, verb.targetId);
|
|
154
239
|
await this.lsmTreeTarget.add(verb.targetId, verb.sourceId);
|
|
240
|
+
// Add to verbId tracking LSM-trees (billion-scale optimization for getVerbsBySource/Target)
|
|
241
|
+
await this.lsmTreeVerbsBySource.add(verb.sourceId, verb.id);
|
|
242
|
+
await this.lsmTreeVerbsByTarget.add(verb.targetId, verb.id);
|
|
155
243
|
// Update type-specific counts atomically
|
|
156
244
|
const verbType = verb.type || 'unknown';
|
|
157
245
|
this.relationshipCountsByType.set(verbType, (this.relationshipCountsByType.get(verbType) || 0) + 1);
|
|
@@ -169,12 +257,13 @@ export class GraphAdjacencyIndex {
|
|
|
169
257
|
*/
|
|
170
258
|
async removeVerb(verbId) {
|
|
171
259
|
await this.ensureInitialized();
|
|
172
|
-
|
|
260
|
+
// Load verb from cache/storage to get type info
|
|
261
|
+
const verb = await this.getVerbCached(verbId);
|
|
173
262
|
if (!verb)
|
|
174
263
|
return;
|
|
175
264
|
const startTime = performance.now();
|
|
176
|
-
// Remove from verb
|
|
177
|
-
this.
|
|
265
|
+
// Remove from verb ID set
|
|
266
|
+
this.verbIdSet.delete(verbId);
|
|
178
267
|
// Update type-specific counts atomically
|
|
179
268
|
const verbType = verb.type || 'unknown';
|
|
180
269
|
const currentCount = this.relationshipCountsByType.get(verbType) || 0;
|
|
@@ -208,10 +297,10 @@ export class GraphAdjacencyIndex {
|
|
|
208
297
|
try {
|
|
209
298
|
prodLog.info('GraphAdjacencyIndex: Starting rebuild with LSM-tree...');
|
|
210
299
|
// Clear current index
|
|
211
|
-
this.
|
|
300
|
+
this.verbIdSet.clear();
|
|
212
301
|
this.totalRelationshipsIndexed = 0;
|
|
213
302
|
// Note: LSM-trees will be recreated from storage via their own initialization
|
|
214
|
-
//
|
|
303
|
+
// Verb data will be loaded on-demand via UnifiedCache
|
|
215
304
|
// Adaptive loading strategy based on storage type (v4.2.4)
|
|
216
305
|
const storageType = this.storage?.constructor.name || '';
|
|
217
306
|
const isLocalStorage = storageType === 'FileSystemStorage' ||
|
|
@@ -312,9 +401,12 @@ export class GraphAdjacencyIndex {
|
|
|
312
401
|
const targetStats = this.lsmTreeTarget.getStats();
|
|
313
402
|
bytes += sourceStats.memTableMemory;
|
|
314
403
|
bytes += targetStats.memTableMemory;
|
|
315
|
-
// Verb
|
|
316
|
-
|
|
404
|
+
// Verb ID set (memory-efficient: IDs only, ~8 bytes per ID pointer)
|
|
405
|
+
// v5.7.0: Previous verbIndex Map stored full objects (128 bytes each = 128GB @ 1B verbs)
|
|
406
|
+
// Now: verbIdSet stores only IDs (~8 bytes each = ~100KB @ 1B verbs) = 1,280,000x reduction
|
|
407
|
+
bytes += this.verbIdSet.size * 8;
|
|
317
408
|
// Note: Bloom filters and zone maps are in LSM-tree MemTable memory
|
|
409
|
+
// Full verb objects loaded on-demand via UnifiedCache with LRU eviction
|
|
318
410
|
return bytes;
|
|
319
411
|
}
|
|
320
412
|
/**
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Background Deduplicator
|
|
3
|
+
*
|
|
4
|
+
* Performs 3-tier entity deduplication in background after imports:
|
|
5
|
+
* - Tier 1: ID-based (O(1)) - Uses entity metadata for deterministic IDs
|
|
6
|
+
* - Tier 2: Name-based (O(log n)) - Exact name matching (case-insensitive)
|
|
7
|
+
* - Tier 3: Similarity-based (O(n log n)) - Vector similarity via TypeAware HNSW
|
|
8
|
+
*
|
|
9
|
+
* NO MOCKS - Production-ready implementation using existing indexes
|
|
10
|
+
*/
|
|
11
|
+
import { Brainy } from '../brainy.js';
|
|
12
|
+
export interface DeduplicationStats {
|
|
13
|
+
/** Total entities processed */
|
|
14
|
+
totalEntities: number;
|
|
15
|
+
/** Duplicates found by ID matching */
|
|
16
|
+
tier1Matches: number;
|
|
17
|
+
/** Duplicates found by name matching */
|
|
18
|
+
tier2Matches: number;
|
|
19
|
+
/** Duplicates found by similarity */
|
|
20
|
+
tier3Matches: number;
|
|
21
|
+
/** Total entities merged/deleted */
|
|
22
|
+
totalMerged: number;
|
|
23
|
+
/** Processing time in milliseconds */
|
|
24
|
+
processingTime: number;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* BackgroundDeduplicator - Auto-runs deduplication 5 minutes after imports
|
|
28
|
+
*
|
|
29
|
+
* Architecture:
|
|
30
|
+
* - Debounced trigger (5 min after last import)
|
|
31
|
+
* - Import-scoped deduplication (no cross-contamination)
|
|
32
|
+
* - 3-tier strategy (ID → Name → Similarity)
|
|
33
|
+
* - Uses existing indexes (EntityIdMapper, MetadataIndexManager, TypeAware HNSW)
|
|
34
|
+
*/
|
|
35
|
+
export declare class BackgroundDeduplicator {
|
|
36
|
+
private brain;
|
|
37
|
+
private debounceTimer?;
|
|
38
|
+
private pendingImports;
|
|
39
|
+
private isProcessing;
|
|
40
|
+
constructor(brain: Brainy);
|
|
41
|
+
/**
|
|
42
|
+
* Schedule deduplication for an import (debounced 5 minutes)
|
|
43
|
+
* Called by ImportCoordinator after each import completes
|
|
44
|
+
*/
|
|
45
|
+
scheduleDedup(importId: string): void;
|
|
46
|
+
/**
|
|
47
|
+
* Run deduplication for all pending imports
|
|
48
|
+
* @private
|
|
49
|
+
*/
|
|
50
|
+
private runBatchDedup;
|
|
51
|
+
/**
|
|
52
|
+
* Deduplicate entities from a specific import
|
|
53
|
+
* Uses 3-tier strategy: ID → Name → Similarity
|
|
54
|
+
*/
|
|
55
|
+
deduplicateImport(importId: string): Promise<DeduplicationStats>;
|
|
56
|
+
/**
|
|
57
|
+
* Tier 1: ID-based deduplication
|
|
58
|
+
* Uses entity metadata sourceId field for deterministic matching
|
|
59
|
+
* Complexity: O(n) where n = number of entities in import
|
|
60
|
+
*/
|
|
61
|
+
private tier1_IdBased;
|
|
62
|
+
/**
|
|
63
|
+
* Tier 2: Name-based deduplication
|
|
64
|
+
* Exact name matching (case-insensitive, normalized)
|
|
65
|
+
* Complexity: O(n) where n = number of entities in import
|
|
66
|
+
*/
|
|
67
|
+
private tier2_NameBased;
|
|
68
|
+
/**
|
|
69
|
+
* Tier 3: Similarity-based deduplication
|
|
70
|
+
* Uses TypeAware HNSW for vector similarity matching
|
|
71
|
+
* Complexity: O(n log n) where n = number of entities in import
|
|
72
|
+
*/
|
|
73
|
+
private tier3_SimilarityBased;
|
|
74
|
+
/**
|
|
75
|
+
* Merge multiple entities into one
|
|
76
|
+
* Keeps entity with highest confidence, merges metadata, deletes duplicates
|
|
77
|
+
*/
|
|
78
|
+
private mergeEntities;
|
|
79
|
+
/**
|
|
80
|
+
* Filter entities to only those that still exist (not deleted)
|
|
81
|
+
* @private
|
|
82
|
+
*/
|
|
83
|
+
private filterExisting;
|
|
84
|
+
/**
|
|
85
|
+
* Normalize string for comparison
|
|
86
|
+
* Lowercase, trim, remove special characters
|
|
87
|
+
*/
|
|
88
|
+
private normalizeName;
|
|
89
|
+
/**
|
|
90
|
+
* Cancel pending deduplication (for cleanup)
|
|
91
|
+
*/
|
|
92
|
+
cancelPending(): void;
|
|
93
|
+
}
|