@soulcraft/brainy 5.6.2 → 5.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,359 @@
1
+ /**
2
+ * Background Deduplicator
3
+ *
4
+ * Performs 3-tier entity deduplication in background after imports:
5
+ * - Tier 1: ID-based (O(1)) - Uses entity metadata for deterministic IDs
6
+ * - Tier 2: Name-based (O(n)) - Exact match on normalized name (case-insensitive), same entity type only
7
+ * - Tier 3: Similarity-based (O(n log n)) - Vector similarity via TypeAware HNSW
8
+ *
9
+ * NO MOCKS - Production-ready implementation using existing indexes
10
+ */
11
+ import { prodLog } from '../utils/logger.js';
12
/**
 * BackgroundDeduplicator - deduplicates imported entities after a debounce delay.
 *
 * How it works (all storage access goes through the `brain` object passed to
 * the constructor, using its `find`, `get`, `update` and `delete` methods):
 * - `scheduleDedup(importId)` queues an import and (re)arms a debounce timer.
 * - When the timer fires, every queued import is processed one after another.
 * - Each import runs three tiers: source ID -> normalized name -> similarity.
 * - Only entities whose `metadata.importId` equals the import being processed
 *   are ever merged with each other.
 *
 * The tunables below are plain instance fields so callers can override them
 * after construction without changing the constructor signature.
 */
export class BackgroundDeduplicator {
    /** Brain instance used for find/get/update/delete. */
    brain;
    /** Import IDs queued for deduplication. */
    pendingImports = new Set();
    /** True while a batch run is in flight. */
    isProcessing = false;
    /** Handle of the currently armed debounce timer (undefined when none). */
    debounceTimer;
    /** Delay between the last scheduled import and the batch run. */
    debounceMs = 5 * 60 * 1000;
    /** Minimum search score for a Tier 3 (similarity) merge. */
    similarityThreshold = 0.85;
    /** Upper bound on entities fetched per import. */
    maxEntitiesPerImport = 100000;

    /**
     * @param brain Object exposing `find`, `get`, `update` and `delete`.
     */
    constructor(brain) {
        this.brain = brain;
    }

    /**
     * Queue an import for deduplication and restart the debounce timer.
     * Called by ImportCoordinator after each import completes.
     *
     * @param importId Identifier stored in each entity's `metadata.importId`.
     */
    scheduleDedup(importId) {
        prodLog.info(`[BackgroundDedup] Scheduled deduplication for import ${importId}`);
        this.pendingImports.add(importId);
        this.armDebounceTimer();
    }

    /**
     * (Re)arm the debounce timer, replacing any timer that is already pending.
     * @private
     */
    armDebounceTimer() {
        if (this.debounceTimer) {
            clearTimeout(this.debounceTimer);
        }
        this.debounceTimer = setTimeout(() => {
            // The timer has fired; forget the handle so a later retry can re-arm.
            this.debounceTimer = undefined;
            this.runBatchDedup().catch(error => {
                prodLog.error('[BackgroundDedup] Batch dedup failed:', error);
            });
        }, this.debounceMs);
    }

    /**
     * Run deduplication for every import that is pending when the run starts.
     *
     * Imports queued while a run is in progress are left in the queue. If the
     * timer fires while a run is still in flight, the timer is re-armed so
     * those imports are not stranded.
     * @private
     */
    async runBatchDedup() {
        if (this.isProcessing) {
            prodLog.warn('[BackgroundDedup] Already processing, skipping');
            // Retry later instead of dropping whatever is still queued.
            if (this.pendingImports.size > 0 && !this.debounceTimer) {
                this.armDebounceTimer();
            }
            return;
        }
        this.isProcessing = true;
        try {
            const imports = Array.from(this.pendingImports);
            prodLog.info(`[BackgroundDedup] Processing ${imports.length} pending import(s)`);
            for (const importId of imports) {
                // Remove before processing: an ID that is re-scheduled or newly
                // added while we await stays queued for the next run.
                this.pendingImports.delete(importId);
                await this.deduplicateImport(importId);
            }
            prodLog.info('[BackgroundDedup] Batch deduplication complete');
        }
        finally {
            this.isProcessing = false;
        }
    }

    /**
     * Deduplicate the entities of one import using the 3-tier strategy.
     *
     * Errors are logged and swallowed; the stats gathered up to the failure
     * are returned.
     *
     * @param importId Import whose entities should be deduplicated.
     * @returns Stats object: totalEntities, tier1Matches, tier2Matches,
     *          tier3Matches, totalMerged, processingTime (milliseconds).
     */
    async deduplicateImport(importId) {
        const startTime = performance.now();
        prodLog.info(`[BackgroundDedup] Starting deduplication for import ${importId}`);
        const stats = {
            totalEntities: 0,
            tier1Matches: 0,
            tier2Matches: 0,
            tier3Matches: 0,
            totalMerged: 0,
            processingTime: 0
        };
        try {
            const results = await this.brain.find({
                where: { importId },
                limit: this.maxEntitiesPerImport
            });
            if (results.length >= this.maxEntitiesPerImport) {
                // Make truncation visible instead of silently ignoring the rest.
                prodLog.warn(`[BackgroundDedup] Import ${importId} reached the ${this.maxEntitiesPerImport} entity limit; ` +
                    'entities beyond the limit are not deduplicated');
            }
            const entities = results.map(r => r.entity);
            stats.totalEntities = entities.length;
            if (entities.length === 0) {
                prodLog.info(`[BackgroundDedup] No entities found for import ${importId}`);
                return stats;
            }
            prodLog.info(`[BackgroundDedup] Processing ${entities.length} entities from import ${importId}`);

            // Tier 1: entities sharing a source identifier.
            stats.tier1Matches = await this.tier1_IdBased(entities, importId);
            stats.totalMerged += stats.tier1Matches;
            let remainingEntities = entities;
            if (stats.tier1Matches > 0) {
                // Drop entities that Tier 1 deleted before handing on to Tier 2.
                remainingEntities = await this.filterExisting(entities);
                prodLog.info(`[BackgroundDedup] After Tier 1: ${entities.length} → ${remainingEntities.length} entities`);
            }

            // Tier 2: entities sharing a normalized name and type.
            stats.tier2Matches = await this.tier2_NameBased(remainingEntities, importId);
            stats.totalMerged += stats.tier2Matches;
            if (stats.tier2Matches > 0) {
                remainingEntities = await this.filterExisting(remainingEntities);
                prodLog.info(`[BackgroundDedup] After Tier 2: ${remainingEntities.length} entities remaining`);
            }

            // Tier 3: high-scoring search matches within the same import.
            stats.tier3Matches = await this.tier3_SimilarityBased(remainingEntities, importId);
            stats.totalMerged += stats.tier3Matches;

            stats.processingTime = performance.now() - startTime;
            prodLog.info(`[BackgroundDedup] Completed for import ${importId}: ` +
                `${stats.totalMerged} merged (T1: ${stats.tier1Matches}, T2: ${stats.tier2Matches}, T3: ${stats.tier3Matches}) ` +
                `in ${stats.processingTime.toFixed(0)}ms`);
            return stats;
        }
        catch (error) {
            prodLog.error(`[BackgroundDedup] Error deduplicating import ${importId}:`, error);
            stats.processingTime = performance.now() - startTime;
            return stats;
        }
    }

    /**
     * Tier 1: merge entities that share `metadata.sourceId` (falling back to
     * `metadata.sourceRow`). Entities with neither value are left alone.
     *
     * @param entities Entities of the import.
     * @param importId Unused here; kept for a signature parallel to the other tiers.
     * @returns Number of duplicates removed.
     */
    async tier1_IdBased(entities, importId) {
        const startTime = performance.now();
        const groups = new Map();
        for (const entity of entities) {
            const sourceId = entity.metadata?.sourceId || entity.metadata?.sourceRow;
            if (sourceId) {
                this.addToGroup(groups, `${sourceId}`, entity);
            }
        }
        const merged = await this.mergeGroups(groups.values(), 'ID');
        if (merged > 0) {
            const elapsed = performance.now() - startTime;
            prodLog.info(`[BackgroundDedup] Tier 1 (ID): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
        }
        return merged;
    }

    /**
     * Tier 2: merge entities that share a normalized name AND a type.
     *
     * Names that normalize to the empty string (for example, punctuation only)
     * carry no identity and are skipped, so unrelated entities are never
     * merged under an empty key.
     *
     * @param entities Entities still present after Tier 1.
     * @param importId Unused here; kept for a signature parallel to the other tiers.
     * @returns Number of duplicates removed.
     */
    async tier2_NameBased(entities, importId) {
        const startTime = performance.now();
        const groups = new Map();
        for (const entity of entities) {
            const name = entity.metadata?.name;
            if (!name || typeof name !== 'string') {
                continue;
            }
            const normalized = this.normalizeName(name);
            if (!normalized) {
                continue;
            }
            // Composite key: only entities of the same type are grouped together.
            const type = entity.type || 'unknown';
            this.addToGroup(groups, `${type}\u0000${normalized}`, entity);
        }
        const merged = await this.mergeGroups(groups.values(), 'Name');
        if (merged > 0) {
            const elapsed = performance.now() - startTime;
            prodLog.info(`[BackgroundDedup] Tier 2 (Name): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
        }
        return merged;
    }

    /**
     * Tier 3: merge each entity with its first search match that scores at
     * least `similarityThreshold` and belongs to the same import.
     *
     * Searches for a batch are issued together, so their results can be stale
     * by the time they are processed. Both the entity and the match are
     * therefore re-checked with `brain.get` right before merging.
     *
     * @param entities Entities still present after Tier 2.
     * @param importId Import being processed; matches from other imports are ignored.
     * @returns Number of merges performed.
     */
    async tier3_SimilarityBased(entities, importId) {
        const startTime = performance.now();
        let merged = 0;
        // Process in batches to bound the number of concurrent searches.
        const batchSize = 100;
        for (let i = 0; i < entities.length; i += batchSize) {
            const batch = entities.slice(i, i + batchSize);
            const results = await Promise.all(batch.map(entity => {
                const query = `${entity.metadata?.name || ''} ${entity.metadata?.description || ''}`.trim();
                if (!query) {
                    return Promise.resolve([]);
                }
                return this.brain.find({
                    query,
                    limit: 5,
                    // Scope the search to this import so the small result window
                    // is not filled by entities that could never be merged.
                    // NOTE(review): assumes `where` fields are AND-combined — confirm.
                    where: { type: entity.type, importId }
                });
            }));
            for (let j = 0; j < batch.length; j++) {
                const entity = batch[j];
                const candidates = results[j].filter(match => match.id !== entity.id &&
                    match.score >= this.similarityThreshold &&
                    match.entity?.metadata?.importId === importId);
                if (candidates.length === 0) {
                    continue;
                }
                // The entity itself may have been merged away earlier in this pass.
                if (!(await this.brain.get(entity.id))) {
                    continue;
                }
                for (const match of candidates) {
                    // Skip matches deleted by an earlier merge; merging with a
                    // deleted entity could pick it as primary and delete the live one.
                    if (!(await this.brain.get(match.id))) {
                        continue;
                    }
                    await this.mergeEntities([entity, match.entity], 'Similarity');
                    merged++;
                    break; // Only merge with the first valid high-similarity match
                }
            }
        }
        if (merged > 0) {
            const elapsed = performance.now() - startTime;
            prodLog.info(`[BackgroundDedup] Tier 3 (Similarity): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
        }
        return merged;
    }

    /**
     * Merge several entities into one.
     *
     * The entity with the highest `metadata.confidence` is kept (missing or
     * non-numeric confidence counts as 0.5; ties keep the earliest entity).
     * Its metadata is updated with the union of all `importIds`, `vfsPaths`
     * (including each entity's singular `vfsPath`) and `concepts`, plus merge
     * bookkeeping. The other entities are then deleted, best-effort.
     *
     * @param entities Entities to merge; fewer than two is a no-op.
     * @param reason Label stored in `metadata.mergeReason`.
     */
    async mergeEntities(entities, reason) {
        if (entities.length < 2) {
            return;
        }
        const confidenceOf = (entity) => {
            const value = entity.metadata?.confidence;
            // An explicit 0 is a real confidence value and must not become 0.5.
            return typeof value === 'number' && !Number.isNaN(value) ? value : 0.5;
        };
        const primary = entities.reduce((best, curr) => confidenceOf(curr) > confidenceOf(best) ? curr : best);
        const primaryMeta = primary.metadata || {};
        // Primary first so its values lead the merged arrays.
        const ordered = [primary, ...entities.filter(e => e !== primary)];
        const unionOf = (key) => Array.from(new Set(ordered.flatMap(e => {
            const value = e.metadata?.[key];
            return Array.isArray(value) ? value : [];
        })));
        // Entities are created with a singular `vfsPath`; fold it into the merged list.
        const singlePaths = ordered
            .map(e => e.metadata?.vfsPath)
            .filter(path => typeof path === 'string' && path.length > 0);
        const duplicateIds = entities.filter(e => e.id !== primary.id).map(e => e.id);
        const mergedMetadata = {
            ...primaryMeta,
            importIds: unionOf('importIds'),
            vfsPaths: Array.from(new Set([...unionOf('vfsPaths'), ...singlePaths])),
            concepts: unionOf('concepts'),
            // Merge bookkeeping
            mergeCount: (typeof primaryMeta.mergeCount === 'number' ? primaryMeta.mergeCount : 0) + (entities.length - 1),
            mergedWith: duplicateIds,
            lastMerged: Date.now(),
            mergeReason: reason
        };
        await this.brain.update({
            id: primary.id,
            metadata: mergedMetadata,
            merge: true
        });
        for (const id of duplicateIds) {
            try {
                await this.brain.delete(id);
            }
            catch (error) {
                // Entity might already be deleted, continue
                prodLog.debug(`[BackgroundDedup] Could not delete ${id}:`, error);
            }
        }
    }

    /**
     * Return the subset of `entities` for which `brain.get(id)` is still truthy.
     * @private
     */
    async filterExisting(entities) {
        const existing = [];
        for (const entity of entities) {
            if (await this.brain.get(entity.id)) {
                existing.push(entity);
            }
        }
        return existing;
    }

    /**
     * Normalize a name for equality comparison.
     *
     * Applies Unicode NFKC normalization, lowercases, removes everything that
     * is not a letter, combining mark, digit or whitespace (in any script),
     * collapses whitespace runs and trims. Trimming happens last so that
     * "Foo !" and "foo" normalize to the same key.
     *
     * @param str Raw name.
     * @returns Normalized key; may be empty when the name has no letters or digits.
     */
    normalizeName(str) {
        return str
            .normalize('NFKC')
            .toLowerCase()
            .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
            .replace(/\s+/g, ' ')
            .trim();
    }

    /**
     * Cancel the pending debounce timer and empty the queue (for cleanup).
     */
    cancelPending() {
        if (this.debounceTimer) {
            clearTimeout(this.debounceTimer);
            this.debounceTimer = undefined;
        }
        this.pendingImports.clear();
    }

    /**
     * Append `entity` to the array stored under `key`, creating it if needed.
     * @private
     */
    addToGroup(groups, key, entity) {
        const bucket = groups.get(key);
        if (bucket) {
            bucket.push(entity);
        }
        else {
            groups.set(key, [entity]);
        }
    }

    /**
     * Merge every group that has more than one member.
     * @private
     * @returns Total number of duplicates removed across all groups.
     */
    async mergeGroups(groups, reason) {
        let merged = 0;
        for (const group of groups) {
            if (group.length > 1) {
                await this.mergeEntities(group, reason);
                merged += group.length - 1;
            }
        }
        return merged;
    }
}
359
+ //# sourceMappingURL=BackgroundDeduplicator.js.map
@@ -248,8 +248,8 @@ export interface ImportResult {
248
248
  export declare class ImportCoordinator {
249
249
  private brain;
250
250
  private detector;
251
- private deduplicator;
252
251
  private history;
252
+ private backgroundDedup;
253
253
  private excelImporter;
254
254
  private pdfImporter;
255
255
  private csvImporter;
@@ -10,8 +10,8 @@
10
10
  * NO MOCKS - Production-ready implementation
11
11
  */
12
12
  import { FormatDetector } from './FormatDetector.js';
13
- import { EntityDeduplicator } from './EntityDeduplicator.js';
14
13
  import { ImportHistory } from './ImportHistory.js';
14
+ import { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
15
15
  import { SmartExcelImporter } from '../importers/SmartExcelImporter.js';
16
16
  import { SmartPDFImporter } from '../importers/SmartPDFImporter.js';
17
17
  import { SmartCSVImporter } from '../importers/SmartCSVImporter.js';
@@ -31,8 +31,8 @@ export class ImportCoordinator {
31
31
  constructor(brain) {
32
32
  this.brain = brain;
33
33
  this.detector = new FormatDetector();
34
- this.deduplicator = new EntityDeduplicator(brain);
35
34
  this.history = new ImportHistory(brain);
35
+ this.backgroundDedup = new BackgroundDeduplicator(brain);
36
36
  this.excelImporter = new SmartExcelImporter(brain);
37
37
  this.pdfImporter = new SmartPDFImporter(brain);
38
38
  this.csvImporter = new SmartCSVImporter(brain);
@@ -683,20 +683,20 @@ export class ImportCoordinator {
683
683
  try {
684
684
  const importSource = vfsResult.rootPath;
685
685
  let entityId;
686
- let wasMerged = false;
687
- // Use deduplicator to check for existing entities
688
- const mergeResult = await this.deduplicator.createOrMerge({
689
- id: entity.id,
690
- name: entity.name,
686
+ // v5.7.0: No deduplication during import (12-24x speedup)
687
+ // Background deduplication runs 5 minutes after import completes
688
+ entityId = await this.brain.add({
689
+ data: entity.description || entity.name,
691
690
  type: entity.type,
692
- description: entity.description || entity.name,
693
- confidence: entity.confidence,
694
691
  metadata: {
695
692
  ...entity.metadata,
693
+ name: entity.name,
694
+ confidence: entity.confidence,
696
695
  vfsPath: vfsFile?.path,
697
696
  importedFrom: 'import-coordinator',
698
697
  // v4.10.0: Import tracking metadata
699
698
  ...(trackingContext && {
699
+ importId: trackingContext.importId, // Used for background dedup
700
700
  importIds: [trackingContext.importId],
701
701
  projectId: trackingContext.projectId,
702
702
  importedAt: trackingContext.importedAt,
@@ -707,19 +707,8 @@ export class ImportCoordinator {
707
707
  ...trackingContext.customMetadata
708
708
  })
709
709
  }
710
- }, importSource, {
711
- similarityThreshold: options.deduplicationThreshold || 0.85,
712
- strictTypeMatching: true,
713
- enableFuzzyMatching: true
714
710
  });
715
- entityId = mergeResult.mergedEntityId;
716
- wasMerged = mergeResult.wasMerged;
717
- if (wasMerged) {
718
- mergedCount++;
719
- }
720
- else {
721
- newCount++;
722
- }
711
+ newCount++;
723
712
  // Update entity ID in extraction result
724
713
  entity.id = entityId;
725
714
  entities.push({
@@ -943,6 +932,10 @@ export class ImportCoordinator {
943
932
  // Continue - relationships are optional
944
933
  }
945
934
  }
935
+ // v5.7.0: Schedule background deduplication (debounced 5 minutes)
936
+ if (trackingContext && trackingContext.importId) {
937
+ this.backgroundDedup.scheduleDedup(trackingContext.importId);
938
+ }
946
939
  return {
947
940
  entities,
948
941
  relationships,
@@ -10,7 +10,9 @@
10
10
  export { ImportCoordinator } from './ImportCoordinator.js';
11
11
  export { FormatDetector, SupportedFormat, DetectionResult } from './FormatDetector.js';
12
12
  export { EntityDeduplicator } from './EntityDeduplicator.js';
13
+ export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
13
14
  export { ImportHistory } from './ImportHistory.js';
14
15
  export type { ImportSource, ImportOptions, ImportProgress, ImportResult } from './ImportCoordinator.js';
15
16
  export type { EntityCandidate, DuplicateMatch, EntityDeduplicationOptions, MergeResult } from './EntityDeduplicator.js';
17
+ export type { DeduplicationStats } from './BackgroundDeduplicator.js';
16
18
  export type { ImportHistoryEntry, RollbackResult } from './ImportHistory.js';
@@ -10,5 +10,6 @@
10
10
  export { ImportCoordinator } from './ImportCoordinator.js';
11
11
  export { FormatDetector } from './FormatDetector.js';
12
12
  export { EntityDeduplicator } from './EntityDeduplicator.js';
13
+ export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
13
14
  export { ImportHistory } from './ImportHistory.js';
14
15
  //# sourceMappingURL=index.js.map
@@ -51,6 +51,7 @@ export declare function getDirectoryPath(entityType: 'noun' | 'verb', dataType:
51
51
  export declare abstract class BaseStorage extends BaseStorageAdapter {
52
52
  protected isInitialized: boolean;
53
53
  protected graphIndex?: GraphAdjacencyIndex;
54
+ protected graphIndexPromise?: Promise<GraphAdjacencyIndex>;
54
55
  protected readOnly: boolean;
55
56
  refManager?: RefManager;
56
57
  blobStorage?: BlobStorage;
@@ -311,9 +312,15 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
311
312
  */
312
313
  deleteVerb(id: string): Promise<void>;
313
314
  /**
314
- * Get graph index (lazy initialization)
315
+ * Get graph index (lazy initialization with concurrent access protection)
316
+ * v5.7.1: Fixed race condition where concurrent calls could trigger multiple rebuilds
315
317
  */
316
318
  getGraphIndex(): Promise<GraphAdjacencyIndex>;
319
+ /**
320
+ * Internal method to initialize graph index (called once by getGraphIndex)
321
+ * @private
322
+ */
323
+ private _initializeGraphIndex;
317
324
  /**
318
325
  * Clear all data from storage
319
326
  * This method should be implemented by each specific adapter
@@ -481,7 +488,7 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
481
488
  protected getVerbsBySource_internal(sourceId: string): Promise<HNSWVerbWithMetadata[]>;
482
489
  /**
483
490
  * Get verbs by target (COW-aware implementation)
484
- * v5.4.0: Fixed to directly list verb files instead of directories
491
+ * v5.7.0: BILLION-SCALE OPTIMIZATION - Use GraphAdjacencyIndex for O(log n) lookup
485
492
  */
486
493
  protected getVerbsByTarget_internal(targetId: string): Promise<HNSWVerbWithMetadata[]>;
487
494
  /**