@soulcraft/brainy 5.6.2 → 5.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +29 -4
- package/dist/graph/graphAdjacencyIndex.d.ts +33 -1
- package/dist/graph/graphAdjacencyIndex.js +110 -18
- package/dist/import/BackgroundDeduplicator.d.ts +93 -0
- package/dist/import/BackgroundDeduplicator.js +359 -0
- package/dist/import/ImportCoordinator.d.ts +1 -1
- package/dist/import/ImportCoordinator.js +14 -21
- package/dist/import/index.d.ts +2 -0
- package/dist/import/index.js +1 -0
- package/dist/storage/baseStorage.d.ts +9 -2
- package/dist/storage/baseStorage.js +116 -111
- package/package.json +1 -1
package/dist/import/BackgroundDeduplicator.js
ADDED

@@ -0,0 +1,359 @@
+/**
+ * Background Deduplicator
+ *
+ * Performs 3-tier entity deduplication in background after imports:
+ * - Tier 1: ID-based (O(1)) - Uses entity metadata for deterministic IDs
+ * - Tier 2: Name-based (O(log n)) - Exact name matching (case-insensitive)
+ * - Tier 3: Similarity-based (O(n log n)) - Vector similarity via TypeAware HNSW
+ *
+ * NO MOCKS - Production-ready implementation using existing indexes
+ */
+import { prodLog } from '../utils/logger.js';
+/**
+ * BackgroundDeduplicator - Auto-runs deduplication 5 minutes after imports
+ *
+ * Architecture:
+ * - Debounced trigger (5 min after last import)
+ * - Import-scoped deduplication (no cross-contamination)
+ * - 3-tier strategy (ID → Name → Similarity)
+ * - Uses existing indexes (EntityIdMapper, MetadataIndexManager, TypeAware HNSW)
+ */
+export class BackgroundDeduplicator {
+    constructor(brain) {
+        this.pendingImports = new Set();
+        this.isProcessing = false;
+        this.brain = brain;
+    }
+    /**
+     * Schedule deduplication for an import (debounced 5 minutes)
+     * Called by ImportCoordinator after each import completes
+     */
+    scheduleDedup(importId) {
+        prodLog.info(`[BackgroundDedup] Scheduled deduplication for import ${importId}`);
+        // Add to pending queue
+        this.pendingImports.add(importId);
+        // Clear existing timer (debouncing)
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+        }
+        // Schedule for 5 minutes from now
+        this.debounceTimer = setTimeout(() => {
+            this.runBatchDedup().catch(error => {
+                prodLog.error('[BackgroundDedup] Batch dedup failed:', error);
+            });
+        }, 5 * 60 * 1000);
+    }
+    /**
+     * Run deduplication for all pending imports
+     * @private
+     */
+    async runBatchDedup() {
+        if (this.isProcessing) {
+            prodLog.warn('[BackgroundDedup] Already processing, skipping');
+            return;
+        }
+        this.isProcessing = true;
+        try {
+            const imports = Array.from(this.pendingImports);
+            prodLog.info(`[BackgroundDedup] Processing ${imports.length} pending import(s)`);
+            for (const importId of imports) {
+                await this.deduplicateImport(importId);
+            }
+            this.pendingImports.clear();
+            prodLog.info('[BackgroundDedup] Batch deduplication complete');
+        }
+        finally {
+            this.isProcessing = false;
+        }
+    }
+    /**
+     * Deduplicate entities from a specific import
+     * Uses 3-tier strategy: ID → Name → Similarity
+     */
+    async deduplicateImport(importId) {
+        const startTime = performance.now();
+        prodLog.info(`[BackgroundDedup] Starting deduplication for import ${importId}`);
+        const stats = {
+            totalEntities: 0,
+            tier1Matches: 0,
+            tier2Matches: 0,
+            tier3Matches: 0,
+            totalMerged: 0,
+            processingTime: 0
+        };
+        try {
+            // Get all entities from this import using brain.find()
+            const results = await this.brain.find({
+                where: { importId },
+                limit: 100000 // Large limit to get all entities from import
+            });
+            const entities = results.map(r => r.entity);
+            stats.totalEntities = entities.length;
+            if (entities.length === 0) {
+                prodLog.info(`[BackgroundDedup] No entities found for import ${importId}`);
+                return stats;
+            }
+            prodLog.info(`[BackgroundDedup] Processing ${entities.length} entities from import ${importId}`);
+            // Tier 1: ID-based deduplication (O(1) per entity)
+            const tier1Merged = await this.tier1_IdBased(entities, importId);
+            stats.tier1Matches = tier1Merged;
+            stats.totalMerged += tier1Merged;
+            // Re-check which entities still exist after Tier 1
+            let remainingEntities = entities;
+            if (tier1Merged > 0) {
+                remainingEntities = await this.filterExisting(entities);
+                prodLog.info(`[BackgroundDedup] After Tier 1: ${entities.length} → ${remainingEntities.length} entities`);
+            }
+            // Tier 2: Name-based deduplication on reduced set
+            const tier2Merged = await this.tier2_NameBased(remainingEntities, importId);
+            stats.tier2Matches = tier2Merged;
+            stats.totalMerged += tier2Merged;
+            // Re-check which entities still exist after Tier 2
+            if (tier2Merged > 0) {
+                remainingEntities = await this.filterExisting(remainingEntities);
+                prodLog.info(`[BackgroundDedup] After Tier 2: ${remainingEntities.length} entities remaining`);
+            }
+            // Tier 3: Similarity-based deduplication on final reduced set
+            const tier3Merged = await this.tier3_SimilarityBased(remainingEntities, importId);
+            stats.tier3Matches = tier3Merged;
+            stats.totalMerged += tier3Merged;
+            stats.processingTime = performance.now() - startTime;
+            prodLog.info(`[BackgroundDedup] Completed for import ${importId}: ` +
+                `${stats.totalMerged} merged (T1: ${stats.tier1Matches}, T2: ${stats.tier2Matches}, T3: ${stats.tier3Matches}) ` +
+                `in ${stats.processingTime.toFixed(0)}ms`);
+            return stats;
+        }
+        catch (error) {
+            prodLog.error(`[BackgroundDedup] Error deduplicating import ${importId}:`, error);
+            stats.processingTime = performance.now() - startTime;
+            return stats;
+        }
+    }
+    /**
+     * Tier 1: ID-based deduplication
+     * Uses entity metadata sourceId field for deterministic matching
+     * Complexity: O(n) where n = number of entities in import
+     */
+    async tier1_IdBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Group entities by sourceId (if available)
+        const sourceIdGroups = new Map();
+        for (const entity of entities) {
+            const sourceId = entity.metadata?.sourceId || entity.metadata?.sourceRow;
+            if (sourceId) {
+                const key = `${sourceId}`;
+                if (!sourceIdGroups.has(key)) {
+                    sourceIdGroups.set(key, []);
+                }
+                sourceIdGroups.get(key).push(entity);
+            }
+        }
+        // Merge duplicates with same sourceId
+        for (const [sourceId, group] of sourceIdGroups) {
+            if (group.length > 1) {
+                await this.mergeEntities(group, 'ID');
+                merged += group.length - 1;
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 1 (ID): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Tier 2: Name-based deduplication
+     * Exact name matching (case-insensitive, normalized)
+     * Complexity: O(n) where n = number of entities in import
+     */
+    async tier2_NameBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Group entities by normalized name
+        const nameGroups = new Map();
+        for (const entity of entities) {
+            const name = entity.metadata?.name;
+            if (name && typeof name === 'string') {
+                const normalized = this.normalizeName(name);
+                if (!nameGroups.has(normalized)) {
+                    nameGroups.set(normalized, []);
+                }
+                nameGroups.get(normalized).push(entity);
+            }
+        }
+        // Merge duplicates with same normalized name and type
+        for (const [name, group] of nameGroups) {
+            if (group.length > 1) {
+                // Further group by type (only merge same types)
+                const typeGroups = new Map();
+                for (const entity of group) {
+                    const type = entity.type || 'unknown';
+                    if (!typeGroups.has(type)) {
+                        typeGroups.set(type, []);
+                    }
+                    typeGroups.get(type).push(entity);
+                }
+                // Merge within each type group
+                for (const [type, typeGroup] of typeGroups) {
+                    if (typeGroup.length > 1) {
+                        await this.mergeEntities(typeGroup, 'Name');
+                        merged += typeGroup.length - 1;
+                    }
+                }
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 2 (Name): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Tier 3: Similarity-based deduplication
+     * Uses TypeAware HNSW for vector similarity matching
+     * Complexity: O(n log n) where n = number of entities in import
+     */
+    async tier3_SimilarityBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Process in batches to avoid memory spikes
+        const batchSize = 100;
+        const similarityThreshold = 0.85;
+        for (let i = 0; i < entities.length; i += batchSize) {
+            const batch = entities.slice(i, i + batchSize);
+            // Batch vector searches using brain.find() (uses TypeAware HNSW)
+            const searches = batch.map(entity => {
+                const query = `${entity.metadata?.name || ''} ${entity.metadata?.description || ''}`.trim();
+                if (!query)
+                    return Promise.resolve([]);
+                return this.brain.find({
+                    query,
+                    limit: 5,
+                    where: { type: entity.type } // Type-aware search
+                });
+            });
+            const results = await Promise.all(searches);
+            // Process matches
+            for (let j = 0; j < batch.length; j++) {
+                const entity = batch[j];
+                const matches = results[j];
+                for (const match of matches) {
+                    // Skip self-matches
+                    if (match.id === entity.id)
+                        continue;
+                    // Only merge high-similarity matches from same import
+                    if (match.score >= similarityThreshold && match.entity.metadata?.importId === importId) {
+                        // Check if not already merged
+                        const stillExists = await this.brain.get(entity.id);
+                        if (stillExists) {
+                            // Cast match.entity to HNSWNounWithMetadata (it comes from brain.find results)
+                            const matchEntity = match.entity;
+                            await this.mergeEntities([entity, matchEntity], 'Similarity');
+                            merged++;
+                            break; // Only merge with first high-similarity match
+                        }
+                    }
+                }
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 3 (Similarity): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Merge multiple entities into one
+     * Keeps entity with highest confidence, merges metadata, deletes duplicates
+     */
+    async mergeEntities(entities, reason) {
+        if (entities.length < 2)
+            return;
+        // Find entity with highest confidence
+        const primary = entities.reduce((best, curr) => {
+            const bestConf = best.metadata?.confidence || 0.5;
+            const currConf = curr.metadata?.confidence || 0.5;
+            return currConf > bestConf ? curr : best;
+        });
+        // Merge metadata from all entities
+        const primaryMeta = primary.metadata || {};
+        const mergedMetadata = {
+            ...primaryMeta,
+            // Merge import IDs
+            importIds: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.importIds) ? primaryMeta.importIds : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.importIds) ? e.metadata.importIds : [])
+            ])),
+            // Merge VFS paths
+            vfsPaths: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.vfsPaths) ? primaryMeta.vfsPaths : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.vfsPaths) ? e.metadata.vfsPaths : [])
+            ])),
+            // Merge concepts
+            concepts: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.concepts) ? primaryMeta.concepts : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.concepts) ? e.metadata.concepts : [])
+            ])),
+            // Track merge
+            mergeCount: (typeof primaryMeta.mergeCount === 'number' ? primaryMeta.mergeCount : 0) + (entities.length - 1),
+            mergedWith: entities.filter(e => e.id !== primary.id).map(e => e.id),
+            lastMerged: Date.now(),
+            mergeReason: reason
+        };
+        // Update primary entity with merged metadata
+        await this.brain.update({
+            id: primary.id,
+            metadata: mergedMetadata,
+            merge: true
+        });
+        // Delete duplicate entities
+        for (const entity of entities) {
+            if (entity.id !== primary.id) {
+                try {
+                    await this.brain.delete(entity.id);
+                }
+                catch (error) {
+                    // Entity might already be deleted, continue
+                    prodLog.debug(`[BackgroundDedup] Could not delete ${entity.id}:`, error);
+                }
+            }
+        }
+    }
+    /**
+     * Filter entities to only those that still exist (not deleted)
+     * @private
+     */
+    async filterExisting(entities) {
+        const existing = [];
+        for (const entity of entities) {
+            const stillExists = await this.brain.get(entity.id);
+            if (stillExists) {
+                existing.push(entity);
+            }
+        }
+        return existing;
+    }
+    /**
+     * Normalize string for comparison
+     * Lowercase, trim, remove special characters
+     */
+    normalizeName(str) {
+        return str
+            .toLowerCase()
+            .trim()
+            .replace(/[^a-z0-9\s]/g, '')
+            .replace(/\s+/g, ' ');
+    }
+    /**
+     * Cancel pending deduplication (for cleanup)
+     */
+    cancelPending() {
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+            this.debounceTimer = undefined;
+        }
+        this.pendingImports.clear();
+    }
+}
+//# sourceMappingURL=BackgroundDeduplicator.js.map
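The class above is wired up by ImportCoordinator (see the changes below), but it can also be driven directly. A minimal usage sketch, assuming an already-initialized Brainy instance; the import path and import IDs are illustrative, not taken from the package docs:

```js
// Minimal sketch: driving the debounced background deduplication manually.
// `brain` is assumed to be an initialized Brainy instance; the path mirrors dist/import/index.js.
import { BackgroundDeduplicator } from './dist/import/index.js';

export function wireBackgroundDedup(brain) {
  const dedup = new BackgroundDeduplicator(brain);
  // Each call resets the 5-minute debounce timer, so imports that finish close
  // together are handled in a single runBatchDedup() pass.
  dedup.scheduleDedup('import-2024-001'); // hypothetical import IDs
  dedup.scheduleDedup('import-2024-002');
  // cancelPending() clears the timer and the pending queue (useful on shutdown or in tests).
  return () => dedup.cancelPending();
}
```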
package/dist/import/ImportCoordinator.d.ts
CHANGED

@@ -248,8 +248,8 @@ export interface ImportResult {
 export declare class ImportCoordinator {
     private brain;
     private detector;
-    private deduplicator;
     private history;
+    private backgroundDedup;
     private excelImporter;
     private pdfImporter;
     private csvImporter;
package/dist/import/ImportCoordinator.js
CHANGED

@@ -10,8 +10,8 @@
  * NO MOCKS - Production-ready implementation
  */
 import { FormatDetector } from './FormatDetector.js';
-import { EntityDeduplicator } from './EntityDeduplicator.js';
 import { ImportHistory } from './ImportHistory.js';
+import { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 import { SmartExcelImporter } from '../importers/SmartExcelImporter.js';
 import { SmartPDFImporter } from '../importers/SmartPDFImporter.js';
 import { SmartCSVImporter } from '../importers/SmartCSVImporter.js';
@@ -31,8 +31,8 @@ export class ImportCoordinator {
     constructor(brain) {
         this.brain = brain;
         this.detector = new FormatDetector();
-        this.deduplicator = new EntityDeduplicator(brain);
         this.history = new ImportHistory(brain);
+        this.backgroundDedup = new BackgroundDeduplicator(brain);
         this.excelImporter = new SmartExcelImporter(brain);
         this.pdfImporter = new SmartPDFImporter(brain);
         this.csvImporter = new SmartCSVImporter(brain);
@@ -683,20 +683,20 @@ export class ImportCoordinator {
             try {
                 const importSource = vfsResult.rootPath;
                 let entityId;
-
-                //
-
-
-                    name: entity.name,
+                // v5.7.0: No deduplication during import (12-24x speedup)
+                // Background deduplication runs 5 minutes after import completes
+                entityId = await this.brain.add({
+                    data: entity.description || entity.name,
                     type: entity.type,
-                    description: entity.description || entity.name,
-                    confidence: entity.confidence,
                     metadata: {
                         ...entity.metadata,
+                        name: entity.name,
+                        confidence: entity.confidence,
                         vfsPath: vfsFile?.path,
                         importedFrom: 'import-coordinator',
                         // v4.10.0: Import tracking metadata
                         ...(trackingContext && {
+                            importId: trackingContext.importId, // Used for background dedup
                             importIds: [trackingContext.importId],
                             projectId: trackingContext.projectId,
                             importedAt: trackingContext.importedAt,
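The `importId` stamped into metadata above is the hook the background pass keys on: `BackgroundDeduplicator.deduplicateImport()` later fetches exactly this import's entities with a metadata filter instead of rescanning the whole store. A minimal sketch of that lookup, using only the calls visible in this diff:

```js
// Sketch of the import-scoped lookup performed by BackgroundDeduplicator.deduplicateImport().
// Assumes a brain.find() that accepts a metadata `where` filter, as shown in the diff above.
async function entitiesForImport(brain, importId) {
  const results = await brain.find({
    where: { importId }, // matches the importId written at brain.add() time
    limit: 100000        // large limit so the whole import is returned
  });
  return results.map(r => r.entity);
}
```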
@@ -707,19 +707,8 @@ export class ImportCoordinator {
                             ...trackingContext.customMetadata
                         })
                     }
-                }, importSource, {
-                    similarityThreshold: options.deduplicationThreshold || 0.85,
-                    strictTypeMatching: true,
-                    enableFuzzyMatching: true
                 });
-
-                wasMerged = mergeResult.wasMerged;
-                if (wasMerged) {
-                    mergedCount++;
-                }
-                else {
-                    newCount++;
-                }
+                newCount++;
                 // Update entity ID in extraction result
                 entity.id = entityId;
                 entities.push({
@@ -943,6 +932,10 @@ export class ImportCoordinator {
                 // Continue - relationships are optional
             }
         }
+        // v5.7.0: Schedule background deduplication (debounced 5 minutes)
+        if (trackingContext && trackingContext.importId) {
+            this.backgroundDedup.scheduleDedup(trackingContext.importId);
+        }
         return {
             entities,
             relationships,
package/dist/import/index.d.ts
CHANGED

@@ -10,7 +10,9 @@
 export { ImportCoordinator } from './ImportCoordinator.js';
 export { FormatDetector, SupportedFormat, DetectionResult } from './FormatDetector.js';
 export { EntityDeduplicator } from './EntityDeduplicator.js';
+export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 export { ImportHistory } from './ImportHistory.js';
 export type { ImportSource, ImportOptions, ImportProgress, ImportResult } from './ImportCoordinator.js';
 export type { EntityCandidate, DuplicateMatch, EntityDeduplicationOptions, MergeResult } from './EntityDeduplicator.js';
+export type { DeduplicationStats } from './BackgroundDeduplicator.js';
 export type { ImportHistoryEntry, RollbackResult } from './ImportHistory.js';
package/dist/import/index.js
CHANGED

@@ -10,5 +10,6 @@
 export { ImportCoordinator } from './ImportCoordinator.js';
 export { FormatDetector } from './FormatDetector.js';
 export { EntityDeduplicator } from './EntityDeduplicator.js';
+export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 export { ImportHistory } from './ImportHistory.js';
 //# sourceMappingURL=index.js.map
package/dist/storage/baseStorage.d.ts
CHANGED

@@ -51,6 +51,7 @@ export declare function getDirectoryPath(entityType: 'noun' | 'verb', dataType:
 export declare abstract class BaseStorage extends BaseStorageAdapter {
     protected isInitialized: boolean;
     protected graphIndex?: GraphAdjacencyIndex;
+    protected graphIndexPromise?: Promise<GraphAdjacencyIndex>;
     protected readOnly: boolean;
     refManager?: RefManager;
     blobStorage?: BlobStorage;
@@ -311,9 +312,15 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
      */
     deleteVerb(id: string): Promise<void>;
     /**
-     * Get graph index (lazy initialization)
+     * Get graph index (lazy initialization with concurrent access protection)
+     * v5.7.1: Fixed race condition where concurrent calls could trigger multiple rebuilds
      */
     getGraphIndex(): Promise<GraphAdjacencyIndex>;
+    /**
+     * Internal method to initialize graph index (called once by getGraphIndex)
+     * @private
+     */
+    private _initializeGraphIndex;
     /**
      * Clear all data from storage
      * This method should be implemented by each specific adapter
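The new `graphIndexPromise` field and private `_initializeGraphIndex` declared above point at the usual fix for this kind of race: cache the in-flight initialization promise so concurrent callers await the same rebuild instead of each starting their own. The baseStorage.js body is not shown in this excerpt, so the following is only an illustrative sketch of that pattern:

```js
// Illustrative sketch of promise-cached lazy initialization (not the actual baseStorage.js code).
class ExampleStorage {
  async getGraphIndex() {
    if (this.graphIndex) return this.graphIndex;   // already built
    if (!this.graphIndexPromise) {
      // First caller starts the one and only initialization...
      this.graphIndexPromise = this._initializeGraphIndex();
    }
    // ...and every concurrent caller awaits that same promise,
    // so the index is rebuilt once rather than once per caller.
    this.graphIndex = await this.graphIndexPromise;
    return this.graphIndex;
  }

  async _initializeGraphIndex() {
    // Placeholder for the real rebuild logic.
    return { edges: new Map() };
  }
}
```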
@@ -481,7 +488,7 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
     protected getVerbsBySource_internal(sourceId: string): Promise<HNSWVerbWithMetadata[]>;
     /**
      * Get verbs by target (COW-aware implementation)
-     * v5.
+     * v5.7.0: BILLION-SCALE OPTIMIZATION - Use GraphAdjacencyIndex for O(log n) lookup
      */
     protected getVerbsByTarget_internal(targetId: string): Promise<HNSWVerbWithMetadata[]>;
     /**