npm - @soulcraft/brainy - Versions diffs - 4.1.4 → 4.2.0 - Mend

@soulcraft/brainy 4.1.4 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/dist/import/FormatDetector.d.ts +6 -1
package/dist/import/FormatDetector.js +40 -1
package/dist/import/ImportCoordinator.d.ts +102 -4
package/dist/import/ImportCoordinator.js +248 -6
package/dist/import/InstancePool.d.ts +136 -0
package/dist/import/InstancePool.js +231 -0
package/dist/importers/SmartCSVImporter.d.ts +2 -1
package/dist/importers/SmartCSVImporter.js +11 -22
package/dist/importers/SmartDOCXImporter.d.ts +125 -0
package/dist/importers/SmartDOCXImporter.js +227 -0
package/dist/importers/SmartExcelImporter.d.ts +12 -1
package/dist/importers/SmartExcelImporter.js +40 -25
package/dist/importers/SmartJSONImporter.d.ts +1 -0
package/dist/importers/SmartJSONImporter.js +25 -6
package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
package/dist/importers/SmartMarkdownImporter.js +11 -16
package/dist/importers/SmartPDFImporter.d.ts +2 -1
package/dist/importers/SmartPDFImporter.js +11 -22
package/dist/importers/SmartYAMLImporter.d.ts +121 -0
package/dist/importers/SmartYAMLImporter.js +275 -0
package/dist/importers/VFSStructureGenerator.js +12 -0
package/dist/neural/SmartExtractor.d.ts +279 -0
package/dist/neural/SmartExtractor.js +592 -0
package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
package/dist/neural/SmartRelationshipExtractor.js +396 -0
package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
package/dist/neural/embeddedTypeEmbeddings.js +2 -2
package/dist/neural/entityExtractor.d.ts +3 -0
package/dist/neural/entityExtractor.js +34 -36
package/dist/neural/presets.d.ts +189 -0
package/dist/neural/presets.js +365 -0
package/dist/neural/signals/ContextSignal.d.ts +166 -0
package/dist/neural/signals/ContextSignal.js +646 -0
package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
package/dist/neural/signals/EmbeddingSignal.js +435 -0
package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
package/dist/neural/signals/ExactMatchSignal.js +542 -0
package/dist/neural/signals/PatternSignal.d.ts +159 -0
package/dist/neural/signals/PatternSignal.js +478 -0
package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
package/dist/neural/signals/VerbContextSignal.js +390 -0
package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
package/dist/neural/signals/VerbPatternSignal.js +457 -0
package/dist/types/graphTypes.d.ts +2 -0
package/package.json +4 -1

package/dist/import/InstancePool.d.ts ADDED Viewed

@@ -0,0 +1,136 @@
+/**
+ * InstancePool - Shared instance management for memory efficiency
+ *
+ * Production-grade instance pooling to prevent memory leaks during imports.
+ * Critical for scaling to billions of entities.
+ *
+ * Problem: Creating new NLP/Extractor instances in loops → memory leak
+ * Solution: Reuse shared instances across entire import session
+ *
+ * Memory savings:
+ * - Without pooling: 100K rows × 50MB per instance = 5TB RAM (OOM!)
+ * - With pooling: 50MB total (shared across all rows)
+ */
+import { Brainy } from '../brainy.js';
+import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
+/**
+ * InstancePool - Manages shared instances for memory efficiency
+ *
+ * Lifecycle:
+ * 1. Create pool at import start
+ * 2. Reuse instances across all rows
+ * 3. Pool is garbage collected when import completes
+ *
+ * Thread safety: Not thread-safe (single import session per pool)
+ */
+export declare class InstancePool {
+    private brain;
+    private nlpInstance;
+    private extractorInstance;
+    private nlpInitialized;
+    private initializationPromise;
+    private stats;
+    constructor(brain: Brainy);
+    /**
+     * Get the shared NaturalLanguageProcessor, initialized and ready to use
+     *
+     * Creates the instance on first access and awaits its init() before returning.
+     * Every call, including the first, increments the nlpReuses counter.
+     *
+     * @returns Shared NLP instance
+     */
+    getNLP(): Promise<NaturalLanguageProcessor>;
+    /**
+     * Get the shared NeuralEntityExtractor instance
+     *
+     * Creates the instance on first access; later calls return the same object.
+     * Every call, including the first, increments the extractorReuses counter.
+     *
+     * @returns Shared extractor instance
+     */
+    getExtractor(): NeuralEntityExtractor;
+    /**
+     * Get shared NLP instance (synchronous, may return uninitialized)
+     *
+     * Creates the instance if needed but never calls its init(); the caller must do
+     * that. Also increments nlpReuses. Prefer getNLP() for async code.
+     *
+     * @returns Shared NLP instance (possibly uninitialized)
+     */
+    getNLPSync(): NaturalLanguageProcessor;
+    /**
+     * Initialize all instances upfront
+     *
+     * Call at start of import so instance creation and NLP init() are paid before row
+     * processing. Calls made while initialization is in flight share one promise.
+     *
+     * @returns Promise that resolves when all instances are ready
+     */
+    init(): Promise<void>;
+    /**
+     * Internal init: create missing instances, await NLP init, record elapsed time
+     */
+    private initializeInternal;
+    /**
+     * Ensure NLP is initialized (loads 220 patterns)
+     *
+     * No-op once initialized; throws if the NLP instance has not been created yet
+     */
+    private ensureNLPInitialized;
+    /**
+     * Check if instances are initialized
+     *
+     * @returns True if the NLP instance exists and its init() has completed
+     */
+    isInitialized(): boolean;
+    /**
+     * Get pool statistics
+     *
+     * For monitoring: memorySaved is an estimate in bytes, creationTime is in ms
+     *
+     * @returns Statistics about instance reuse
+     */
+    getStats(): {
+        nlpCreated: boolean;
+        extractorCreated: boolean;
+        initialized: boolean;
+        memorySaved: number;
+        nlpReuses: number;
+        extractorReuses: number;
+        creationTime: number;
+    };
+    /**
+     * Calculate estimated memory saved by pooling
+     *
+     * Assumes 50MB per NLP and 10MB per extractor; every counted call after the first is one saved instance
+     *
+     * @returns Estimated memory saved in bytes
+     */
+    private calculateMemorySaved;
+    /**
+     * Reset the reuse counters and creation time to zero (useful for testing)
+     */
+    resetStats(): void;
+    /**
+     * One-line summary of creation flags and reuse counters (for debugging)
+     */
+    toString(): string;
+    /**
+     * Cleanup method (for explicit resource management)
+     *
+     * Drops both instances and the initialization state; statistics are left untouched.
+     * Usually not needed - pool is garbage collected when import completes.
+     */
+    cleanup(): void;
+}
+/**
+ * Create a new instance pool
+ *
+ * Convenience factory function
+ *
+ * @param brain Brainy instance
+ * @param autoInit Whether to await pool.init() before returning (defaults to true)
+ * @returns Instance pool
+ */
+export declare function createInstancePool(brain: Brainy, autoInit?: boolean): Promise<InstancePool>;

package/dist/import/InstancePool.js ADDED Viewed

@@ -0,0 +1,231 @@
+/**
+ * InstancePool - Shared instance management for memory efficiency
+ *
+ * Production-grade instance pooling to prevent memory leaks during imports.
+ * Critical for scaling to billions of entities.
+ *
+ * Problem: Creating new NLP/Extractor instances in loops → memory leak
+ * Solution: Reuse shared instances across entire import session
+ *
+ * Memory savings:
+ * - Without pooling: 100K rows × 50MB per instance = 5TB RAM (OOM!)
+ * - With pooling: 50MB total (shared across all rows)
+ */
+import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
+/**
+ * InstancePool - Manages shared instances for memory efficiency
+ *
+ * Lifecycle:
+ * 1. Create pool at import start
+ * 2. Reuse instances across all rows
+ * 3. Pool is garbage collected when import completes
+ *
+ * Thread safety: Not thread-safe (single import session per pool)
+ */
+export class InstancePool {
+    constructor(brain) {
+        // Shared instances (created lazily)
+        this.nlpInstance = null;
+        this.extractorInstance = null;
+        // Initialization state
+        this.nlpInitialized = false;
+        this.initializationPromise = null;
+        // Statistics
+        this.stats = {
+            nlpReuses: 0,
+            extractorReuses: 0,
+            creationTime: 0
+        };
+        this.brain = brain;
+    }
+    /**
+     * Get shared NaturalLanguageProcessor instance
+     *
+     * Lazy initialization - created on first access
+     * All subsequent calls return same instance
+     *
+     * @returns Shared NLP instance
+     */
+    async getNLP() {
+        if (!this.nlpInstance) {
+            const startTime = Date.now();
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+            this.stats.creationTime += Date.now() - startTime;
+        }
+        // Ensure initialized before returning
+        if (!this.nlpInitialized) {
+            await this.ensureNLPInitialized();
+        }
+        this.stats.nlpReuses++;
+        return this.nlpInstance;
+    }
+    /**
+     * Get shared NeuralEntityExtractor instance
+     *
+     * Lazy initialization - created on first access
+     * All subsequent calls return same instance
+     *
+     * @returns Shared extractor instance
+     */
+    getExtractor() {
+        if (!this.extractorInstance) {
+            const startTime = Date.now();
+            this.extractorInstance = new NeuralEntityExtractor(this.brain);
+            this.stats.creationTime += Date.now() - startTime;
+        }
+        this.stats.extractorReuses++;
+        return this.extractorInstance;
+    }
+    /**
+     * Get shared NLP instance (synchronous, may return uninitialized)
+     *
+     * Use when you need NLP synchronously and will handle initialization yourself.
+     * Prefer getNLP() for async code.
+     *
+     * @returns Shared NLP instance (possibly uninitialized)
+     */
+    getNLPSync() {
+        if (!this.nlpInstance) {
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+        }
+        this.stats.nlpReuses++;
+        return this.nlpInstance;
+    }
+    /**
+     * Initialize all instances upfront
+     *
+     * Call at start of import to avoid lazy initialization overhead
+     * during processing. Improves predictability and first-row performance.
+     *
+     * @returns Promise that resolves when all instances are ready
+     */
+    async init() {
+        // Prevent duplicate initialization
+        if (this.initializationPromise) {
+            return this.initializationPromise;
+        }
+        this.initializationPromise = this.initializeInternal();
+        return this.initializationPromise;
+    }
+    /**
+     * Internal initialization implementation
+     */
+    async initializeInternal() {
+        const startTime = Date.now();
+        // Create instances
+        if (!this.nlpInstance) {
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+        }
+        if (!this.extractorInstance) {
+            this.extractorInstance = new NeuralEntityExtractor(this.brain);
+        }
+        // Initialize NLP (loads pattern library)
+        await this.ensureNLPInitialized();
+        this.stats.creationTime = Date.now() - startTime;
+    }
+    /**
+     * Ensure NLP is initialized (loads 220 patterns)
+     *
+     * Handles concurrent initialization requests safely
+     */
+    async ensureNLPInitialized() {
+        if (this.nlpInitialized) {
+            return;
+        }
+        if (!this.nlpInstance) {
+            throw new Error('NLP instance not created yet');
+        }
+        await this.nlpInstance.init();
+        this.nlpInitialized = true;
+    }
+    /**
+     * Check if instances are initialized
+     *
+     * @returns True if NLP is initialized and ready to use
+     */
+    isInitialized() {
+        return this.nlpInitialized && this.nlpInstance !== null;
+    }
+    /**
+     * Get pool statistics
+     *
+     * Useful for performance monitoring and memory leak detection
+     *
+     * @returns Statistics about instance reuse
+     */
+    getStats() {
+        return {
+            ...this.stats,
+            nlpCreated: this.nlpInstance !== null,
+            extractorCreated: this.extractorInstance !== null,
+            initialized: this.isInitialized(),
+            // Memory savings estimate
+            memorySaved: this.calculateMemorySaved()
+        };
+    }
+    /**
+     * Calculate estimated memory saved by pooling
+     *
+     * Assumes ~50MB per NLP instance, ~10MB per extractor instance
+     *
+     * @returns Estimated memory saved in bytes
+     */
+    calculateMemorySaved() {
+        const nlpSize = 50 * 1024 * 1024; // 50MB per instance
+        const extractorSize = 10 * 1024 * 1024; // 10MB per instance
+        // Without pooling: size × reuses
+        // With pooling: size × 1
+        // Saved: size × (reuses - 1)
+        const nlpSaved = nlpSize * Math.max(0, this.stats.nlpReuses - 1);
+        const extractorSaved = extractorSize * Math.max(0, this.stats.extractorReuses - 1);
+        return nlpSaved + extractorSaved;
+    }
+    /**
+     * Reset statistics (useful for testing)
+     */
+    resetStats() {
+        this.stats = {
+            nlpReuses: 0,
+            extractorReuses: 0,
+            creationTime: 0
+        };
+    }
+    /**
+     * Get string representation (for debugging)
+     */
+    toString() {
+        const stats = this.getStats();
+        return `InstancePool(nlp=${stats.nlpCreated}, extractor=${stats.extractorCreated}, initialized=${stats.initialized}, nlpReuses=${stats.nlpReuses}, extractorReuses=${stats.extractorReuses})`;
+    }
+    /**
+     * Cleanup method (for explicit resource management)
+     *
+     * Note: Usually not needed - pool is garbage collected when import completes.
+     * Use only if you need explicit cleanup for some reason.
+     */
+    cleanup() {
+        // Clear references to allow garbage collection
+        this.nlpInstance = null;
+        this.extractorInstance = null;
+        this.nlpInitialized = false;
+        this.initializationPromise = null;
+    }
+}
+/**
+ * Create a new instance pool
+ *
+ * Convenience factory function
+ *
+ * @param brain Brainy instance
+ * @param autoInit Whether to initialize instances immediately
+ * @returns Instance pool
+ */
+export async function createInstancePool(brain, autoInit = true) {
+    const pool = new InstancePool(brain);
+    if (autoInit) {
+        await pool.init();
+    }
+    return pool;
+}
+//# sourceMappingURL=InstancePool.js.map

package/dist/importers/SmartCSVImporter.d.ts CHANGED Viewed

@@ -101,6 +101,7 @@ export declare class SmartCSVImporter {
     private brain;
     private extractor;
     private nlp;
+    private relationshipExtractor;
     private csvHandler;
     constructor(brain: Brainy);
     /**
@@ -124,7 +125,7 @@ export declare class SmartCSVImporter {
      */
     private mapTypeString;
     /**
-     * Infer relationship type from context
+     * Infer relationship type from context using SmartRelationshipExtractor
      */
     private inferRelationship;
     /**

package/dist/importers/SmartCSVImporter.js CHANGED Viewed

@@ -12,6 +12,7 @@
  */
 import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
 import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { SmartRelationshipExtractor } from '../neural/SmartRelationshipExtractor.js';
 import { NounType, VerbType } from '../types/graphTypes.js';
 import { CSVHandler } from '../augmentations/intelligentImport/handlers/csvHandler.js';
 /**
@@ -22,6 +23,7 @@ export class SmartCSVImporter {
         this.brain = brain;
         this.extractor = new NeuralEntityExtractor(brain);
         this.nlp = new NaturalLanguageProcessor(brain);
+        this.relationshipExtractor = new SmartRelationshipExtractor(brain);
         this.csvHandler = new CSVHandler();
     }
     /**
@@ -266,29 +268,16 @@ export class SmartCSVImporter {
         return mapping[normalized] || NounType.Thing;
     }
     /**
-     * Infer relationship type from context
+     * Infer relationship type from context using SmartRelationshipExtractor
      */
-    async inferRelationship(fromTerm, toTerm, context) {
-        const lowerContext = context.toLowerCase();
-        // Pattern-based relationship detection
-        const patterns = [
-            [new RegExp(`${toTerm}.*of.*${fromTerm}`, 'i'), VerbType.PartOf],
-            [new RegExp(`${fromTerm}.*contains.*${toTerm}`, 'i'), VerbType.Contains],
-            [new RegExp(`located in.*${toTerm}`, 'i'), VerbType.LocatedAt],
-            [new RegExp(`ruled by.*${toTerm}`, 'i'), VerbType.Owns],
-            [new RegExp(`capital.*${toTerm}`, 'i'), VerbType.Contains],
-            [new RegExp(`created by.*${toTerm}`, 'i'), VerbType.CreatedBy],
-            [new RegExp(`authored by.*${toTerm}`, 'i'), VerbType.CreatedBy],
-            [new RegExp(`part of.*${toTerm}`, 'i'), VerbType.PartOf],
-            [new RegExp(`related to.*${toTerm}`, 'i'), VerbType.RelatedTo]
-        ];
-        for (const [pattern, verbType] of patterns) {
-            if (pattern.test(lowerContext)) {
-                return verbType;
-            }
-        }
-        // Default to RelatedTo
-        return VerbType.RelatedTo;
+    async inferRelationship(fromTerm, toTerm, context, fromType, toType) {
+        // Use SmartRelationshipExtractor for robust relationship classification
+        const result = await this.relationshipExtractor.infer(fromTerm, toTerm, context, {
+            subjectType: fromType,
+            objectType: toType
+        });
+        // Return inferred type or fallback to RelatedTo
+        return result?.type || VerbType.RelatedTo;
     }
     /**
      * Generate consistent entity ID from name

package/dist/importers/SmartDOCXImporter.d.ts ADDED Viewed

@@ -0,0 +1,125 @@
+/**
+ * Smart DOCX Importer
+ *
+ * Extracts entities and relationships from Word documents using:
+ * - Mammoth parser for DOCX → HTML/text conversion
+ * - Heading extraction for document structure
+ * - Table extraction for structured data
+ * - NeuralEntityExtractor for entity extraction from paragraphs
+ * - NaturalLanguageProcessor for relationship inference
+ * - Hierarchical relationship creation based on heading hierarchy
+ *
+ * v4.2.0: New format handler
+ * NO MOCKS - Production-ready implementation
+ */
+import { Brainy } from '../brainy.js';
+import { NounType, VerbType } from '../types/graphTypes.js';
+export interface SmartDOCXOptions {
+    /** Enable neural entity extraction from paragraphs */
+    enableNeuralExtraction?: boolean;
+    /** Enable hierarchical relationship creation based on headings */
+    enableHierarchicalRelationships?: boolean;
+    /** Enable concept extraction for tagging */
+    enableConceptExtraction?: boolean;
+    /** Confidence threshold for entities (0-1); presumably lower-scored entities are dropped — TODO confirm */
+    confidenceThreshold?: number;
+    /** Minimum paragraph length to process (unit not visible here, presumably characters — TODO confirm) */
+    minParagraphLength?: number;
+    /** Progress callback; receives the processed / entities / relationships counters */
+    onProgress?: (stats: {
+        processed: number;
+        entities: number;
+        relationships: number;
+    }) => void;
+}
+export interface ExtractedDOCXEntity {
+    /** Entity ID */
+    id: string;
+    /** Entity name */
+    name: string;
+    /** Entity type (a NounType value) */
+    type: NounType;
+    /** Entity description/context */
+    description: string;
+    /** Confidence score (presumably 0-1, matching confidenceThreshold — TODO confirm) */
+    confidence: number;
+    /** Weight/importance score (optional) */
+    weight?: number;
+    /** Section/heading context; the type allows null */
+    section: string | null;
+    /** Paragraph index in document */
+    paragraphIndex: number;
+    /** Free-form metadata (string keys, arbitrary values) */
+    metadata: Record<string, any>;
+}
+export interface ExtractedDOCXRelationship {
+    from: string; // source endpoint — presumably an entity ID; TODO confirm
+    to: string; // target endpoint — presumably an entity ID; TODO confirm
+    type: VerbType; // relationship kind, a VerbType value
+    confidence: number; // presumably 0-1 like entity confidence; TODO confirm
+    weight?: number; // optional weight/importance score
+    evidence: string; // supporting text for the relationship (its source is not visible here)
+}
+export interface SmartDOCXResult {
+    /** Total paragraphs processed */
+    paragraphsProcessed: number;
+    /** Number of entities extracted */
+    entitiesExtracted: number;
+    /** Number of relationships inferred */
+    relationshipsInferred: number;
+    /** All extracted entities */
+    entities: ExtractedDOCXEntity[];
+    /** All relationships */
+    relationships: ExtractedDOCXRelationship[];
+    /** Entity ID mapping (index -> ID); both keys and values are strings */
+    entityMap: Map<string, string>;
+    /** Processing time in ms */
+    processingTime: number;
+    /** Document structure: headings (level, text, index) plus paragraph and table counts */
+    structure: {
+        headings: Array<{
+            level: number;
+            text: string;
+            index: number;
+        }>;
+        paragraphCount: number;
+        tableCount: number;
+    };
+    /** Extraction statistics: numbers keyed by type and by section, plus high/medium/low confidence buckets */
+    stats: {
+        byType: Record<string, number>;
+        bySection: Record<string, number>;
+        byConfidence: {
+            high: number;
+            medium: number;
+            low: number;
+        };
+    };
+}
+/**
+ * SmartDOCXImporter - Extracts structured knowledge from Word documents
+ */
+export declare class SmartDOCXImporter {
+    private brain;
+    private extractor;
+    private nlp;
+    private relationshipExtractor;
+    private mammothLoaded;
+    constructor(brain: Brainy);
+    /**
+     * Initialize the importer (async; presumably must be awaited before extract() — TODO confirm)
+     */
+    init(): Promise<void>;
+    /**
+     * Extract entities and relationships from a DOCX file supplied as a Buffer; options may be omitted
+     */
+    extract(buffer: Buffer, options?: SmartDOCXOptions): Promise<SmartDOCXResult>;
+    /**
+     * Extract entities and relationships from parsed DOCX content (private helper)
+     */
+    private extractFromContent;
+    /**
+     * Parse document structure from HTML (private helper)
+     */
+    private parseStructure;
+}