npm - @soulcraft/brainy - Versions diffs - 5.6.2 → 5.7.0 - Mend

@soulcraft/brainy 5.6.2 → 5.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/CHANGELOG.md +12 -0
package/README.md +29 -4
package/dist/graph/graphAdjacencyIndex.d.ts +33 -1
package/dist/graph/graphAdjacencyIndex.js +110 -18
package/dist/import/BackgroundDeduplicator.d.ts +93 -0
package/dist/import/BackgroundDeduplicator.js +359 -0
package/dist/import/ImportCoordinator.d.ts +1 -1
package/dist/import/ImportCoordinator.js +14 -21
package/dist/import/index.d.ts +2 -0
package/dist/import/index.js +1 -0
package/dist/storage/baseStorage.d.ts +9 -2
package/dist/storage/baseStorage.js +116 -111
package/package.json +1 -1

package/dist/import/BackgroundDeduplicator.js ADDED Viewed

@@ -0,0 +1,359 @@
+/**
+ * Background Deduplicator
+ *
+ * Performs 3-tier entity deduplication in background after imports:
+ * - Tier 1: ID-based (O(1)) - Uses entity metadata for deterministic IDs
+ * - Tier 2: Name-based (O(1) per entity) - Exact match on normalized name (case-insensitive), same entity type only
+ * - Tier 3: Similarity-based (O(n log n)) - Vector similarity via TypeAware HNSW
+ *
+ * NO MOCKS - Production-ready implementation using existing indexes
+ */
+import { prodLog } from '../utils/logger.js';
+/**
+ * BackgroundDeduplicator - Auto-runs deduplication 5 minutes after imports
+ *
+ * Architecture:
+ * - Debounced trigger (5 min after last import)
+ * - Import-scoped deduplication (no cross-contamination)
+ * - 3-tier strategy (ID → Name → Similarity)
+ * - Uses existing indexes (EntityIdMapper, MetadataIndexManager, TypeAware HNSW)
+ */
+export class BackgroundDeduplicator {
+    constructor(brain) {
+        this.pendingImports = new Set();
+        this.isProcessing = false;
+        this.brain = brain;
+    }
+    /**
+     * Schedule deduplication for an import (debounced 5 minutes)
+     * Called by ImportCoordinator after each import completes
+     */
+    scheduleDedup(importId) {
+        prodLog.info(`[BackgroundDedup] Scheduled deduplication for import ${importId}`);
+        // Add to pending queue
+        this.pendingImports.add(importId);
+        // Clear existing timer (debouncing)
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+        }
+        // Schedule for 5 minutes from now
+        this.debounceTimer = setTimeout(() => {
+            this.runBatchDedup().catch(error => {
+                prodLog.error('[BackgroundDedup] Batch dedup failed:', error);
+            });
+        }, 5 * 60 * 1000);
+    }
+    /**
+     * Run deduplication for all pending imports
+     * @private
+     */
+    async runBatchDedup() {
+        if (this.isProcessing) {
+            prodLog.warn('[BackgroundDedup] Already processing, skipping');
+            return;
+        }
+        this.isProcessing = true;
+        try {
+            const imports = Array.from(this.pendingImports);
+            prodLog.info(`[BackgroundDedup] Processing ${imports.length} pending import(s)`);
+            for (const importId of imports) {
+                await this.deduplicateImport(importId);
+            }
+            this.pendingImports.clear();
+            prodLog.info('[BackgroundDedup] Batch deduplication complete');
+        }
+        finally {
+            this.isProcessing = false;
+        }
+    }
+    /**
+     * Deduplicate entities from a specific import
+     * Uses 3-tier strategy: ID → Name → Similarity
+     */
+    async deduplicateImport(importId) {
+        const startTime = performance.now();
+        prodLog.info(`[BackgroundDedup] Starting deduplication for import ${importId}`);
+        const stats = {
+            totalEntities: 0,
+            tier1Matches: 0,
+            tier2Matches: 0,
+            tier3Matches: 0,
+            totalMerged: 0,
+            processingTime: 0
+        };
+        try {
+            // Get all entities from this import using brain.find()
+            const results = await this.brain.find({
+                where: { importId },
+                limit: 100000 // Large limit to get all entities from import
+            });
+            const entities = results.map(r => r.entity);
+            stats.totalEntities = entities.length;
+            if (entities.length === 0) {
+                prodLog.info(`[BackgroundDedup] No entities found for import ${importId}`);
+                return stats;
+            }
+            prodLog.info(`[BackgroundDedup] Processing ${entities.length} entities from import ${importId}`);
+            // Tier 1: ID-based deduplication (O(1) per entity)
+            const tier1Merged = await this.tier1_IdBased(entities, importId);
+            stats.tier1Matches = tier1Merged;
+            stats.totalMerged += tier1Merged;
+            // Re-check which entities still exist after Tier 1
+            let remainingEntities = entities;
+            if (tier1Merged > 0) {
+                remainingEntities = await this.filterExisting(entities);
+                prodLog.info(`[BackgroundDedup] After Tier 1: ${entities.length} → ${remainingEntities.length} entities`);
+            }
+            // Tier 2: Name-based deduplication on reduced set
+            const tier2Merged = await this.tier2_NameBased(remainingEntities, importId);
+            stats.tier2Matches = tier2Merged;
+            stats.totalMerged += tier2Merged;
+            // Re-check which entities still exist after Tier 2
+            if (tier2Merged > 0) {
+                remainingEntities = await this.filterExisting(remainingEntities);
+                prodLog.info(`[BackgroundDedup] After Tier 2: ${remainingEntities.length} entities remaining`);
+            }
+            // Tier 3: Similarity-based deduplication on final reduced set
+            const tier3Merged = await this.tier3_SimilarityBased(remainingEntities, importId);
+            stats.tier3Matches = tier3Merged;
+            stats.totalMerged += tier3Merged;
+            stats.processingTime = performance.now() - startTime;
+            prodLog.info(`[BackgroundDedup] Completed for import ${importId}: ` +
+                `${stats.totalMerged} merged (T1: ${stats.tier1Matches}, T2: ${stats.tier2Matches}, T3: ${stats.tier3Matches}) ` +
+                `in ${stats.processingTime.toFixed(0)}ms`);
+            return stats;
+        }
+        catch (error) {
+            prodLog.error(`[BackgroundDedup] Error deduplicating import ${importId}:`, error);
+            stats.processingTime = performance.now() - startTime;
+            return stats;
+        }
+    }
+    /**
+     * Tier 1: ID-based deduplication
+     * Uses entity metadata sourceId field for deterministic matching
+     * Complexity: O(n) where n = number of entities in import
+     */
+    async tier1_IdBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Group entities by sourceId (if available)
+        const sourceIdGroups = new Map();
+        for (const entity of entities) {
+            const sourceId = entity.metadata?.sourceId || entity.metadata?.sourceRow;
+            if (sourceId) {
+                const key = `${sourceId}`;
+                if (!sourceIdGroups.has(key)) {
+                    sourceIdGroups.set(key, []);
+                }
+                sourceIdGroups.get(key).push(entity);
+            }
+        }
+        // Merge duplicates with same sourceId
+        for (const [sourceId, group] of sourceIdGroups) {
+            if (group.length > 1) {
+                await this.mergeEntities(group, 'ID');
+                merged += group.length - 1;
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 1 (ID): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Tier 2: Name-based deduplication
+     * Exact name matching (case-insensitive, normalized)
+     * Complexity: O(n) where n = number of entities in import
+     */
+    async tier2_NameBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Group entities by normalized name
+        const nameGroups = new Map();
+        for (const entity of entities) {
+            const name = entity.metadata?.name;
+            if (name && typeof name === 'string') {
+                const normalized = this.normalizeName(name);
+                if (!nameGroups.has(normalized)) {
+                    nameGroups.set(normalized, []);
+                }
+                nameGroups.get(normalized).push(entity);
+            }
+        }
+        // Merge duplicates with same normalized name and type
+        for (const [name, group] of nameGroups) {
+            if (group.length > 1) {
+                // Further group by type (only merge same types)
+                const typeGroups = new Map();
+                for (const entity of group) {
+                    const type = entity.type || 'unknown';
+                    if (!typeGroups.has(type)) {
+                        typeGroups.set(type, []);
+                    }
+                    typeGroups.get(type).push(entity);
+                }
+                // Merge within each type group
+                for (const [type, typeGroup] of typeGroups) {
+                    if (typeGroup.length > 1) {
+                        await this.mergeEntities(typeGroup, 'Name');
+                        merged += typeGroup.length - 1;
+                    }
+                }
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 2 (Name): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Tier 3: Similarity-based deduplication
+     * Uses TypeAware HNSW for vector similarity matching
+     * Complexity: O(n log n) where n = number of entities in import
+     */
+    async tier3_SimilarityBased(entities, importId) {
+        const startTime = performance.now();
+        let merged = 0;
+        // Process in batches to avoid memory spikes
+        const batchSize = 100;
+        const similarityThreshold = 0.85;
+        for (let i = 0; i < entities.length; i += batchSize) {
+            const batch = entities.slice(i, i + batchSize);
+            // Batch vector searches using brain.find() (uses TypeAware HNSW)
+            const searches = batch.map(entity => {
+                const query = `${entity.metadata?.name || ''} ${entity.metadata?.description || ''}`.trim();
+                if (!query)
+                    return Promise.resolve([]);
+                return this.brain.find({
+                    query,
+                    limit: 5,
+                    where: { type: entity.type } // Type-aware search
+                });
+            });
+            const results = await Promise.all(searches);
+            // Process matches
+            for (let j = 0; j < batch.length; j++) {
+                const entity = batch[j];
+                const matches = results[j];
+                for (const match of matches) {
+                    // Skip self-matches
+                    if (match.id === entity.id)
+                        continue;
+                    // Only merge high-similarity matches from same import
+                    if (match.score >= similarityThreshold && match.entity.metadata?.importId === importId) {
+                        // Check if not already merged
+                        const stillExists = await this.brain.get(entity.id);
+                        if (stillExists) {
+                            // Cast match.entity to HNSWNounWithMetadata (it comes from brain.find results)
+                            const matchEntity = match.entity;
+                            await this.mergeEntities([entity, matchEntity], 'Similarity');
+                            merged++;
+                            break; // Only merge with first high-similarity match
+                        }
+                    }
+                }
+            }
+        }
+        const elapsed = performance.now() - startTime;
+        if (merged > 0) {
+            prodLog.info(`[BackgroundDedup] Tier 3 (Similarity): Merged ${merged} duplicates in ${elapsed.toFixed(0)}ms`);
+        }
+        return merged;
+    }
+    /**
+     * Merge multiple entities into one
+     * Keeps entity with highest confidence, merges metadata, deletes duplicates
+     */
+    async mergeEntities(entities, reason) {
+        if (entities.length < 2)
+            return;
+        // Find entity with highest confidence
+        const primary = entities.reduce((best, curr) => {
+            const bestConf = best.metadata?.confidence || 0.5;
+            const currConf = curr.metadata?.confidence || 0.5;
+            return currConf > bestConf ? curr : best;
+        });
+        // Merge metadata from all entities
+        const primaryMeta = primary.metadata || {};
+        const mergedMetadata = {
+            ...primaryMeta,
+            // Merge import IDs
+            importIds: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.importIds) ? primaryMeta.importIds : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.importIds) ? e.metadata.importIds : [])
+            ])),
+            // Merge VFS paths
+            vfsPaths: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.vfsPaths) ? primaryMeta.vfsPaths : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.vfsPaths) ? e.metadata.vfsPaths : [])
+            ])),
+            // Merge concepts
+            concepts: Array.from(new Set([
+                ...(Array.isArray(primaryMeta.concepts) ? primaryMeta.concepts : []),
+                ...entities.flatMap(e => Array.isArray(e.metadata?.concepts) ? e.metadata.concepts : [])
+            ])),
+            // Track merge
+            mergeCount: (typeof primaryMeta.mergeCount === 'number' ? primaryMeta.mergeCount : 0) + (entities.length - 1),
+            mergedWith: entities.filter(e => e.id !== primary.id).map(e => e.id),
+            lastMerged: Date.now(),
+            mergeReason: reason
+        };
+        // Update primary entity with merged metadata
+        await this.brain.update({
+            id: primary.id,
+            metadata: mergedMetadata,
+            merge: true
+        });
+        // Delete duplicate entities
+        for (const entity of entities) {
+            if (entity.id !== primary.id) {
+                try {
+                    await this.brain.delete(entity.id);
+                }
+                catch (error) {
+                    // Entity might already be deleted, continue
+                    prodLog.debug(`[BackgroundDedup] Could not delete ${entity.id}:`, error);
+                }
+            }
+        }
+    }
+    /**
+     * Filter entities to only those that still exist (not deleted)
+     * @private
+     */
+    async filterExisting(entities) {
+        const existing = [];
+        for (const entity of entities) {
+            const stillExists = await this.brain.get(entity.id);
+            if (stillExists) {
+                existing.push(entity);
+            }
+        }
+        return existing;
+    }
+    /**
+     * Normalize string for comparison
+     * Lowercase, trim, remove special characters
+     */
+    normalizeName(str) {
+        return str
+            .toLowerCase()
+            .trim()
+            .replace(/[^a-z0-9\s]/g, '')
+            .replace(/\s+/g, ' ');
+    }
+    /**
+     * Cancel pending deduplication (for cleanup)
+     */
+    cancelPending() {
+        if (this.debounceTimer) {
+            clearTimeout(this.debounceTimer);
+            this.debounceTimer = undefined;
+        }
+        this.pendingImports.clear();
+    }
+}
+//# sourceMappingURL=BackgroundDeduplicator.js.map

package/dist/import/ImportCoordinator.d.ts CHANGED Viewed

@@ -248,8 +248,8 @@ export interface ImportResult {
 export declare class ImportCoordinator {
     private brain;
     private detector;
-    private deduplicator;
     private history;
+    private backgroundDedup;
     private excelImporter;
     private pdfImporter;
     private csvImporter;

package/dist/import/ImportCoordinator.js CHANGED Viewed

@@ -10,8 +10,8 @@
  * NO MOCKS - Production-ready implementation
  */
 import { FormatDetector } from './FormatDetector.js';
-import { EntityDeduplicator } from './EntityDeduplicator.js';
 import { ImportHistory } from './ImportHistory.js';
+import { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 import { SmartExcelImporter } from '../importers/SmartExcelImporter.js';
 import { SmartPDFImporter } from '../importers/SmartPDFImporter.js';
 import { SmartCSVImporter } from '../importers/SmartCSVImporter.js';
@@ -31,8 +31,8 @@ export class ImportCoordinator {
     constructor(brain) {
         this.brain = brain;
         this.detector = new FormatDetector();
-        this.deduplicator = new EntityDeduplicator(brain);
         this.history = new ImportHistory(brain);
+        this.backgroundDedup = new BackgroundDeduplicator(brain);
         this.excelImporter = new SmartExcelImporter(brain);
         this.pdfImporter = new SmartPDFImporter(brain);
         this.csvImporter = new SmartCSVImporter(brain);
@@ -683,20 +683,20 @@ export class ImportCoordinator {
                 try {
                     const importSource = vfsResult.rootPath;
                     let entityId;
-                    let wasMerged = false;
-                    // Use deduplicator to check for existing entities
-                    const mergeResult = await this.deduplicator.createOrMerge({
-                        id: entity.id,
-                        name: entity.name,
+                    // v5.7.0: No deduplication during import (12-24x speedup)
+                    // Background deduplication runs 5 minutes after import completes
+                    entityId = await this.brain.add({
+                        data: entity.description || entity.name,
                         type: entity.type,
-                        description: entity.description || entity.name,
-                        confidence: entity.confidence,
                         metadata: {
                             ...entity.metadata,
+                            name: entity.name,
+                            confidence: entity.confidence,
                             vfsPath: vfsFile?.path,
                             importedFrom: 'import-coordinator',
                             // v4.10.0: Import tracking metadata
                             ...(trackingContext && {
+                                importId: trackingContext.importId, // Used for background dedup
                                 importIds: [trackingContext.importId],
                                 projectId: trackingContext.projectId,
                                 importedAt: trackingContext.importedAt,
@@ -707,19 +707,8 @@ export class ImportCoordinator {
                                 ...trackingContext.customMetadata
                             })
                         }
-                    }, importSource, {
-                        similarityThreshold: options.deduplicationThreshold || 0.85,
-                        strictTypeMatching: true,
-                        enableFuzzyMatching: true
                     });
-                    entityId = mergeResult.mergedEntityId;
-                    wasMerged = mergeResult.wasMerged;
-                    if (wasMerged) {
-                        mergedCount++;
-                    }
-                    else {
-                        newCount++;
-                    }
+                    newCount++;
                     // Update entity ID in extraction result
                     entity.id = entityId;
                     entities.push({
@@ -943,6 +932,10 @@ export class ImportCoordinator {
                 // Continue - relationships are optional
             }
         }
+        // v5.7.0: Schedule background deduplication (debounced 5 minutes)
+        if (trackingContext && trackingContext.importId) {
+            this.backgroundDedup.scheduleDedup(trackingContext.importId);
+        }
         return {
             entities,
             relationships,

package/dist/import/index.d.ts CHANGED Viewed

@@ -10,7 +10,9 @@
 export { ImportCoordinator } from './ImportCoordinator.js';
 export { FormatDetector, SupportedFormat, DetectionResult } from './FormatDetector.js';
 export { EntityDeduplicator } from './EntityDeduplicator.js';
+export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 export { ImportHistory } from './ImportHistory.js';
 export type { ImportSource, ImportOptions, ImportProgress, ImportResult } from './ImportCoordinator.js';
 export type { EntityCandidate, DuplicateMatch, EntityDeduplicationOptions, MergeResult } from './EntityDeduplicator.js';
+export type { DeduplicationStats } from './BackgroundDeduplicator.js';
 export type { ImportHistoryEntry, RollbackResult } from './ImportHistory.js';

package/dist/import/index.js CHANGED Viewed

@@ -10,5 +10,6 @@
 export { ImportCoordinator } from './ImportCoordinator.js';
 export { FormatDetector } from './FormatDetector.js';
 export { EntityDeduplicator } from './EntityDeduplicator.js';
+export { BackgroundDeduplicator } from './BackgroundDeduplicator.js';
 export { ImportHistory } from './ImportHistory.js';
 //# sourceMappingURL=index.js.map

package/dist/storage/baseStorage.d.ts CHANGED Viewed

@@ -51,6 +51,7 @@ export declare function getDirectoryPath(entityType: 'noun' | 'verb', dataType:
 export declare abstract class BaseStorage extends BaseStorageAdapter {
     protected isInitialized: boolean;
     protected graphIndex?: GraphAdjacencyIndex;
+    protected graphIndexPromise?: Promise<GraphAdjacencyIndex>;
     protected readOnly: boolean;
     refManager?: RefManager;
     blobStorage?: BlobStorage;
@@ -311,9 +312,15 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
      */
     deleteVerb(id: string): Promise<void>;
     /**
-     * Get graph index (lazy initialization)
+     * Get graph index (lazy initialization with concurrent access protection)
+     * v5.7.1: Fixed race condition where concurrent calls could trigger multiple rebuilds
      */
     getGraphIndex(): Promise<GraphAdjacencyIndex>;
+    /**
+     * Internal method to initialize graph index (called once by getGraphIndex)
+     * @private
+     */
+    private _initializeGraphIndex;
     /**
      * Clear all data from storage
      * This method should be implemented by each specific adapter
@@ -481,7 +488,7 @@ export declare abstract class BaseStorage extends BaseStorageAdapter {
     protected getVerbsBySource_internal(sourceId: string): Promise<HNSWVerbWithMetadata[]>;
     /**
      * Get verbs by target (COW-aware implementation)
-     * v5.4.0: Fixed to directly list verb files instead of directories
+     * v5.7.0: BILLION-SCALE OPTIMIZATION - Use GraphAdjacencyIndex for O(log n) lookup
      */
     protected getVerbsByTarget_internal(targetId: string): Promise<HNSWVerbWithMetadata[]>;
     /**