npm - @soulcraft/brainy - Versions diffs - 4.1.4 → 4.2.1 - Mend

@soulcraft/brainy 4.1.4 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

package/CHANGELOG.md +35 -0
package/dist/import/FormatDetector.d.ts +6 -1
package/dist/import/FormatDetector.js +40 -1
package/dist/import/ImportCoordinator.d.ts +102 -4
package/dist/import/ImportCoordinator.js +248 -6
package/dist/import/InstancePool.d.ts +136 -0
package/dist/import/InstancePool.js +231 -0
package/dist/importers/SmartCSVImporter.d.ts +2 -1
package/dist/importers/SmartCSVImporter.js +11 -22
package/dist/importers/SmartDOCXImporter.d.ts +125 -0
package/dist/importers/SmartDOCXImporter.js +227 -0
package/dist/importers/SmartExcelImporter.d.ts +12 -1
package/dist/importers/SmartExcelImporter.js +40 -25
package/dist/importers/SmartJSONImporter.d.ts +1 -0
package/dist/importers/SmartJSONImporter.js +25 -6
package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
package/dist/importers/SmartMarkdownImporter.js +11 -16
package/dist/importers/SmartPDFImporter.d.ts +2 -1
package/dist/importers/SmartPDFImporter.js +11 -22
package/dist/importers/SmartYAMLImporter.d.ts +121 -0
package/dist/importers/SmartYAMLImporter.js +275 -0
package/dist/importers/VFSStructureGenerator.js +12 -0
package/dist/neural/SmartExtractor.d.ts +279 -0
package/dist/neural/SmartExtractor.js +592 -0
package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
package/dist/neural/SmartRelationshipExtractor.js +396 -0
package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
package/dist/neural/embeddedTypeEmbeddings.js +2 -2
package/dist/neural/entityExtractor.d.ts +3 -0
package/dist/neural/entityExtractor.js +34 -36
package/dist/neural/presets.d.ts +189 -0
package/dist/neural/presets.js +365 -0
package/dist/neural/signals/ContextSignal.d.ts +166 -0
package/dist/neural/signals/ContextSignal.js +646 -0
package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
package/dist/neural/signals/EmbeddingSignal.js +435 -0
package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
package/dist/neural/signals/ExactMatchSignal.js +542 -0
package/dist/neural/signals/PatternSignal.d.ts +159 -0
package/dist/neural/signals/PatternSignal.js +478 -0
package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
package/dist/neural/signals/VerbContextSignal.js +390 -0
package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
package/dist/neural/signals/VerbPatternSignal.js +457 -0
package/dist/types/graphTypes.d.ts +2 -0
package/dist/utils/metadataIndex.d.ts +22 -0
package/dist/utils/metadataIndex.js +76 -0
package/package.json +4 -1

package/dist/neural/signals/EmbeddingSignal.d.ts ADDED Viewed

@@ -0,0 +1,175 @@
+/**
+ * EmbeddingSignal - Neural entity type classification using embeddings
+ *
+ * PRODUCTION-READY: Merges neural + graph + temporal signals into one
+ * 3x faster than separate signals (single embedding lookup)
+ *
+ * Weight: 35% (20% neural + 10% graph + 5% temporal boost)
+ * Speed: Fast (~10ms) - single embedding lookup with parallel checking
+ *
+ * Features:
+ * - Single embedding computation (efficient)
+ * - Parallel checking against 3 sources
+ * - Confidence boosting when multiple sources agree
+ * - LRU cache for hot entities
+ * - Uses pre-computed type embeddings (zero initialization cost)
+ */
+import type { Brainy } from '../../brainy.js';
+import type { NounType } from '../../types/graphTypes.js';
+import type { Vector } from '../../coreTypes.js';
+/**
+ * Signal result with classification details
+ *
+ * Returned by EmbeddingSignal.classify(). The values described on each field
+ * below reflect how the accompanying EmbeddingSignal.js fills them in.
+ */
+export interface TypeSignal {
+    source: 'embedding-type' | 'embedding-graph' | 'embedding-history' | 'embedding-combined'; // 'embedding-combined' when more than one source voted for the winning type, otherwise the single source that matched
+    type: NounType; // The winning noun type
+    confidence: number; // Average confidence of the agreeing sources plus the agreement boost, capped at 1.0
+    evidence: string; // Human-readable summary, e.g. "Matched via type+graph (2 sources agree)"
+    metadata?: {
+        typeScore?: number; // Cosine similarity to the best-matching type embedding
+        graphScore?: number; // Similarity score of the closest existing graph entity
+        historyScore?: number; // Similarity to the best history entry after recency/usage boosting
+        agreementBoost?: number; // 0.05 for each agreeing source beyond the first, 0 otherwise
+    };
+}
+/**
+ * Options for embedding signal
+ *
+ * Every field is optional; the EmbeddingSignal constructor substitutes the
+ * default noted on each field when it is omitted.
+ */
+export interface EmbeddingSignalOptions {
+    minConfidence?: number; // Minimum combined confidence classify() requires to return a result (default 0.60)
+    checkGraph?: boolean; // Also match against entities already stored in the graph (default true)
+    checkHistory?: boolean; // Also match against entries recorded via addToHistory() (default true)
+    timeout?: number; // Time limit for embedding the candidate, in milliseconds (default 100)
+    cacheSize?: number; // Maximum number of cached classification results (default 1000)
+}
+/**
+ * EmbeddingSignal - Neural type classification with parallel source checking
+ *
+ * Production features:
+ * - Pre-computed type embeddings (instant initialization)
+ * - Parallel source checking (type + graph + history)
+ * - LRU cache for performance
+ * - Confidence boosting when sources agree
+ * - Graceful degradation on errors
+ */
+export declare class EmbeddingSignal {
+    private brain; // Brainy instance used to embed text, run similarity search and fetch entities
+    private options; // Resolved options (caller overrides merged with defaults)
+    private typeEmbeddings; // Map of noun type -> pre-computed embedding vector
+    private initialized; // True once typeEmbeddings has been populated by init()
+    private cache; // Map of cache key -> classification result
+    private cacheOrder; // Cache keys in recency order, oldest first
+    private historicalEntities; // Entries recorded through addToHistory()
+    private readonly MAX_HISTORY; // Upper bound on historicalEntities (1000)
+    private stats; // Call / hit / match counters reported by getStats()
+    constructor(brain: Brainy, options?: EmbeddingSignalOptions);
+    /**
+     * Initialize type embeddings (lazy, happens once)
+     *
+     * PRODUCTION OPTIMIZATION: Uses pre-computed embeddings
+     * Zero runtime cost - embeddings loaded instantly
+     *
+     * Called at the start of every classify(); a no-op after the first run.
+     */
+    private init;
+    /**
+     * Classify entity type using embedding-based signals
+     *
+     * Main entry point - embeds candidate once, checks all sources in parallel
+     *
+     * Errors raised while embedding or matching are caught, logged with
+     * console.warn and reported as null rather than thrown.
+     *
+     * @param candidate Entity text to classify
+     * @param context Optional context for better matching (the accompanying
+     *   EmbeddingSignal.js reads only `definition`, as part of the cache key)
+     * @returns TypeSignal with classification result, or null when no type
+     *   reaches minConfidence or an error occurred
+     */
+    classify(candidate: string, context?: {
+        definition?: string;
+        allTerms?: string[];
+        metadata?: any;
+    }): Promise<TypeSignal | null>;
+    /**
+     * Match against NounType embeddings (31 types)
+     *
+     * Returns best matching type with confidence
+     * (cosine similarity; matches scoring below 0.40 are discarded)
+     */
+    private matchTypeEmbeddings;
+    /**
+     * Match against existing graph entities
+     *
+     * Finds similar entities already in the graph
+     * Boosts confidence for entities similar to existing ones
+     *
+     * Uses the type of the closest entity scoring at least 0.70 and reports
+     * its score multiplied by 0.95; lookup failures yield no match.
+     */
+    private matchGraphEntities;
+    /**
+     * Match against historical import data
+     *
+     * Temporal boosting: entities imported recently are more relevant
+     * Helps with batch imports of similar entities
+     *
+     * Only history entries with raw similarity of at least 0.75 qualify;
+     * the reported confidence is capped at 0.95.
+     */
+    private matchHistoricalData;
+    /**
+     * Combine results from all sources with confidence boosting
+     *
+     * Key insight: When multiple sources agree, boost confidence
+     * This is the "ensemble" effect that makes this signal powerful
+     *
+     * Each additional agreeing source adds 0.05 to the averaged confidence.
+     */
+    private combineResults;
+    /**
+     * Add entity to historical data (for temporal boosting)
+     *
+     * Call this after successful imports to improve future matching
+     *
+     * If an entry with the same text (compared case-insensitively) already
+     * exists, its usage count is incremented and its timestamp refreshed
+     * instead of adding a duplicate.
+     *
+     * @param text Entity text that was imported
+     * @param type Noun type the entity was stored as
+     * @param vector Embedding vector of the entity text
+     */
+    addToHistory(text: string, type: NounType, vector: Vector): void;
+    /**
+     * Clear historical data (useful between import sessions)
+     */
+    clearHistory(): void;
+    /**
+     * Get statistics about signal performance
+     *
+     * @returns Raw counters, current cache and history sizes, and rates
+     *   expressed per classify() call (0 when no calls were made yet)
+     */
+    getStats(): {
+        cacheSize: number;
+        historySize: number;
+        cacheHitRate: number;
+        typeMatchRate: number;
+        graphMatchRate: number;
+        historyMatchRate: number;
+        calls: number;
+        cacheHits: number;
+        typeMatches: number;
+        graphMatches: number;
+        historyMatches: number;
+        combinedBoosts: number;
+    };
+    /**
+     * Reset statistics (useful for testing)
+     */
+    resetStats(): void;
+    /**
+     * Clear cache
+     */
+    clearCache(): void;
+    /**
+     * Generate cache key from candidate and context
+     *
+     * Key is the lower-cased, trimmed candidate, optionally followed by the
+     * first 50 characters of context.definition.
+     */
+    private getCacheKey;
+    /**
+     * Get from LRU cache
+     */
+    private getFromCache;
+    /**
+     * Add to LRU cache with eviction
+     */
+    private addToCache;
+    /**
+     * Embed text with timeout protection
+     *
+     * Rejects with "Embedding timeout" if options.timeout elapses first.
+     */
+    private embedWithTimeout;
+    /**
+     * Calculate cosine similarity between two vectors
+     *
+     * Throws when the vectors differ in length; returns 0 for a zero vector.
+     */
+    private cosineSimilarity;
+}
+/**
+ * Create a new EmbeddingSignal instance
+ *
+ * Convenience factory function
+ *
+ * @param brain Brainy instance the signal will use
+ * @param options Optional signal settings, forwarded to the constructor
+ * @returns A new EmbeddingSignal
+ */
+export declare function createEmbeddingSignal(brain: Brainy, options?: EmbeddingSignalOptions): EmbeddingSignal;

package/dist/neural/signals/EmbeddingSignal.js ADDED Viewed

@@ -0,0 +1,435 @@
+/**
+ * EmbeddingSignal - Neural entity type classification using embeddings
+ *
+ * PRODUCTION-READY: Merges neural + graph + temporal signals into one
+ * 3x faster than separate signals (single embedding lookup)
+ *
+ * Weight: 35% (20% neural + 10% graph + 5% temporal boost)
+ * Speed: Fast (~10ms) - single embedding lookup with parallel checking
+ *
+ * Features:
+ * - Single embedding computation (efficient)
+ * - Parallel checking against 3 sources
+ * - Confidence boosting when multiple sources agree
+ * - LRU cache for hot entities
+ * - Uses pre-computed type embeddings (zero initialization cost)
+ */
+import { getNounTypeEmbeddings } from '../embeddedTypeEmbeddings.js';
+/**
+ * EmbeddingSignal - Neural type classification with parallel source checking
+ *
+ * Production features:
+ * - Pre-computed type embeddings (instant initialization)
+ * - Parallel source checking (type + graph + history)
+ * - LRU cache for performance
+ * - Confidence boosting when sources agree
+ * - Graceful degradation on errors
+ */
+export class EmbeddingSignal {
+    constructor(brain, options) {
+        // Pre-computed type embeddings (loaded once)
+        this.typeEmbeddings = new Map();
+        this.initialized = false;
+        // LRU cache for hot entities (includes null results to avoid recomputation)
+        this.cache = new Map();
+        this.cacheOrder = [];
+        // Historical data for temporal boosting
+        this.historicalEntities = [];
+        this.MAX_HISTORY = 1000; // Keep last 1000 imports
+        // Statistics
+        this.stats = {
+            calls: 0,
+            cacheHits: 0,
+            typeMatches: 0,
+            graphMatches: 0,
+            historyMatches: 0,
+            combinedBoosts: 0
+        };
+        this.brain = brain;
+        this.options = {
+            minConfidence: options?.minConfidence ?? 0.60,
+            checkGraph: options?.checkGraph ?? true,
+            checkHistory: options?.checkHistory ?? true,
+            timeout: options?.timeout ?? 100,
+            cacheSize: options?.cacheSize ?? 1000
+        };
+    }
+    /**
+     * Initialize type embeddings (lazy, happens once)
+     *
+     * PRODUCTION OPTIMIZATION: Uses pre-computed embeddings
+     * Zero runtime cost - embeddings loaded instantly
+     */
+    async init() {
+        if (this.initialized)
+            return;
+        // Load pre-computed type embeddings (instant, no computation)
+        const embeddings = getNounTypeEmbeddings();
+        for (const [type, vector] of embeddings.entries()) {
+            this.typeEmbeddings.set(type, vector);
+        }
+        this.initialized = true;
+    }
+    /**
+     * Classify entity type using embedding-based signals
+     *
+     * Main entry point - embeds candidate once, checks all sources in parallel
+     *
+     * @param candidate Entity text to classify
+     * @param context Optional context for better matching
+     * @returns TypeSignal with classification result
+     */
+    async classify(candidate, context) {
+        this.stats.calls++;
+        // Ensure initialized
+        await this.init();
+        // Check cache first
+        const cacheKey = this.getCacheKey(candidate, context);
+        const cached = this.getFromCache(cacheKey);
+        if (cached) {
+            this.stats.cacheHits++;
+            return cached;
+        }
+        try {
+            // Embed candidate once (efficiency!)
+            const vector = await this.embedWithTimeout(candidate);
+            // Check all three sources in parallel
+            const [typeMatch, graphMatch, historyMatch] = await Promise.all([
+                this.matchTypeEmbeddings(vector, candidate),
+                this.options.checkGraph ? this.matchGraphEntities(vector, candidate) : null,
+                this.options.checkHistory ? this.matchHistoricalData(vector, candidate) : null
+            ]);
+            // Combine results with confidence boosting
+            const result = this.combineResults([typeMatch, graphMatch, historyMatch]);
+            // Cache result (including nulls to avoid recomputation)
+            if (!result || result.confidence >= this.options.minConfidence) {
+                this.addToCache(cacheKey, result);
+            }
+            return result;
+        }
+        catch (error) {
+            // Graceful degradation - return null instead of throwing
+            console.warn(`EmbeddingSignal error for "${candidate}":`, error);
+            return null;
+        }
+    }
+    /**
+     * Match against NounType embeddings (31 types)
+     *
+     * Returns best matching type with confidence
+     */
+    async matchTypeEmbeddings(vector, candidate) {
+        let bestType = null;
+        let bestScore = 0;
+        // Check similarity against all type embeddings
+        for (const [type, typeVector] of this.typeEmbeddings.entries()) {
+            const similarity = this.cosineSimilarity(vector, typeVector);
+            if (similarity > bestScore) {
+                bestScore = similarity;
+                bestType = type;
+            }
+        }
+        // Use lower threshold for type matching (0.40) to catch more matches
+        // Production systems can adjust minConfidence on the signal itself
+        if (bestType && bestScore >= 0.40) {
+            this.stats.typeMatches++;
+            return {
+                type: bestType,
+                confidence: bestScore,
+                source: 'embedding-type',
+                metadata: { typeScore: bestScore }
+            };
+        }
+        return null;
+    }
+    /**
+     * Match against existing graph entities
+     *
+     * Finds similar entities already in the graph
+     * Boosts confidence for entities similar to existing ones
+     */
+    async matchGraphEntities(vector, candidate) {
+        try {
+            // Query HNSW index for similar entities
+            const similar = await this.brain.similar({
+                to: vector,
+                limit: 5,
+                threshold: 0.70 // Higher threshold for graph matching
+            });
+            if (similar.length === 0)
+                return null;
+            // Use the most similar entity's type
+            const best = similar[0];
+            const entity = await this.brain.get(best.id);
+            if (entity && entity.type) {
+                this.stats.graphMatches++;
+                return {
+                    type: entity.type,
+                    confidence: best.score * 0.95, // Slight discount for graph match
+                    source: 'embedding-graph',
+                    metadata: {
+                        graphScore: best.score,
+                        matchedEntity: best.id,
+                        totalMatches: similar.length
+                    }
+                };
+            }
+        }
+        catch (error) {
+            // Graceful degradation if HNSW not available
+            return null;
+        }
+        return null;
+    }
+    /**
+     * Match against historical import data
+     *
+     * Temporal boosting: entities imported recently are more relevant
+     * Helps with batch imports of similar entities
+     */
+    async matchHistoricalData(vector, candidate) {
+        if (this.historicalEntities.length === 0)
+            return null;
+        let bestMatch = null;
+        let bestScore = 0;
+        // Check against recent history
+        const recentThreshold = Date.now() - 3600000; // Last hour
+        for (const historical of this.historicalEntities) {
+            const similarity = this.cosineSimilarity(vector, historical.vector);
+            // Boost recent entities
+            const recencyBoost = historical.timestamp > recentThreshold ? 1.05 : 1.0;
+            const usageBoost = 1 + (Math.log(historical.usageCount + 1) * 0.02);
+            const adjustedScore = similarity * recencyBoost * usageBoost;
+            if (adjustedScore > bestScore && similarity >= 0.75) {
+                bestScore = adjustedScore;
+                bestMatch = historical;
+            }
+        }
+        if (bestMatch) {
+            this.stats.historyMatches++;
+            return {
+                type: bestMatch.type,
+                confidence: Math.min(bestScore, 0.95), // Cap at 0.95
+                source: 'embedding-history',
+                metadata: {
+                    historyScore: bestScore,
+                    matchedText: bestMatch.text,
+                    recency: bestMatch.timestamp,
+                    usageCount: bestMatch.usageCount
+                }
+            };
+        }
+        return null;
+    }
+    /**
+     * Combine results from all sources with confidence boosting
+     *
+     * Key insight: When multiple sources agree, boost confidence
+     * This is the "ensemble" effect that makes this signal powerful
+     */
+    combineResults(matches) {
+        // Filter out null matches
+        const validMatches = matches.filter((m) => m !== null);
+        if (validMatches.length === 0)
+            return null;
+        // Count votes by type
+        const typeVotes = new Map();
+        for (const match of validMatches) {
+            const existing = typeVotes.get(match.type) || [];
+            typeVotes.set(match.type, [...existing, match]);
+        }
+        // Find type with most votes and highest combined confidence
+        let bestType = null;
+        let bestCombinedScore = 0;
+        let bestMatches = [];
+        for (const [type, matches] of typeVotes.entries()) {
+            // Calculate combined score with agreement boosting
+            const avgConfidence = matches.reduce((sum, m) => sum + m.confidence, 0) / matches.length;
+            const agreementBoost = matches.length > 1 ? 0.05 * (matches.length - 1) : 0;
+            const combinedScore = avgConfidence + agreementBoost;
+            if (combinedScore > bestCombinedScore) {
+                bestCombinedScore = combinedScore;
+                bestType = type;
+                bestMatches = matches;
+            }
+        }
+        if (!bestType || bestCombinedScore < this.options.minConfidence) {
+            return null;
+        }
+        // Track combined boosts
+        if (bestMatches.length > 1) {
+            this.stats.combinedBoosts++;
+        }
+        // Build evidence string
+        const sources = bestMatches.map(m => m.source.replace('embedding-', '')).join('+');
+        const evidence = `Matched via ${sources} (${bestMatches.length} source${bestMatches.length > 1 ? 's' : ''} agree)`;
+        // Combine metadata
+        const metadata = {
+            agreementBoost: bestMatches.length > 1 ? 0.05 * (bestMatches.length - 1) : 0
+        };
+        for (const match of bestMatches) {
+            if (match.source === 'embedding-type')
+                metadata.typeScore = match.metadata?.typeScore;
+            if (match.source === 'embedding-graph')
+                metadata.graphScore = match.metadata?.graphScore;
+            if (match.source === 'embedding-history')
+                metadata.historyScore = match.metadata?.historyScore;
+        }
+        return {
+            source: bestMatches.length > 1 ? 'embedding-combined' : bestMatches[0].source,
+            type: bestType,
+            confidence: Math.min(bestCombinedScore, 1.0), // Cap at 1.0
+            evidence,
+            metadata
+        };
+    }
+    /**
+     * Add entity to historical data (for temporal boosting)
+     *
+     * Call this after successful imports to improve future matching
+     */
+    addToHistory(text, type, vector) {
+        // Check if already exists
+        const existing = this.historicalEntities.find(h => h.text.toLowerCase() === text.toLowerCase());
+        if (existing) {
+            existing.usageCount++;
+            existing.timestamp = Date.now();
+            return;
+        }
+        // Add new historical entity
+        this.historicalEntities.push({
+            text,
+            type,
+            vector,
+            timestamp: Date.now(),
+            usageCount: 1
+        });
+        // Trim to max size (keep most recent and most used)
+        if (this.historicalEntities.length > this.MAX_HISTORY) {
+            // Sort by recency and usage
+            this.historicalEntities.sort((a, b) => {
+                const aScore = a.timestamp + (a.usageCount * 60000); // 1 minute per usage
+                const bScore = b.timestamp + (b.usageCount * 60000);
+                return bScore - aScore;
+            });
+            // Keep top MAX_HISTORY
+            this.historicalEntities = this.historicalEntities.slice(0, this.MAX_HISTORY);
+        }
+    }
+    /**
+     * Clear historical data (useful between import sessions)
+     */
+    clearHistory() {
+        this.historicalEntities = [];
+    }
+    /**
+     * Get statistics about signal performance
+     */
+    getStats() {
+        return {
+            ...this.stats,
+            cacheSize: this.cache.size,
+            historySize: this.historicalEntities.length,
+            cacheHitRate: this.stats.calls > 0 ? this.stats.cacheHits / this.stats.calls : 0,
+            typeMatchRate: this.stats.calls > 0 ? this.stats.typeMatches / this.stats.calls : 0,
+            graphMatchRate: this.stats.calls > 0 ? this.stats.graphMatches / this.stats.calls : 0,
+            historyMatchRate: this.stats.calls > 0 ? this.stats.historyMatches / this.stats.calls : 0
+        };
+    }
+    /**
+     * Reset statistics (useful for testing)
+     */
+    resetStats() {
+        this.stats = {
+            calls: 0,
+            cacheHits: 0,
+            typeMatches: 0,
+            graphMatches: 0,
+            historyMatches: 0,
+            combinedBoosts: 0
+        };
+    }
+    /**
+     * Clear cache
+     */
+    clearCache() {
+        this.cache.clear();
+        this.cacheOrder = [];
+    }
+    // ========== Private Helper Methods ==========
+    /**
+     * Generate cache key from candidate and context
+     */
+    getCacheKey(candidate, context) {
+        const normalized = candidate.toLowerCase().trim();
+        if (!context?.definition)
+            return normalized;
+        return `${normalized}:${context.definition.substring(0, 50)}`;
+    }
+    /**
+     * Get from LRU cache
+     */
+    getFromCache(key) {
+        // Check if key exists in cache (including null values)
+        if (!this.cache.has(key))
+            return null;
+        const cached = this.cache.get(key);
+        // Move to end (most recently used)
+        this.cacheOrder = this.cacheOrder.filter(k => k !== key);
+        this.cacheOrder.push(key);
+        return cached ?? null;
+    }
+    /**
+     * Add to LRU cache with eviction
+     */
+    addToCache(key, value) {
+        // Add to cache
+        this.cache.set(key, value);
+        this.cacheOrder.push(key);
+        // Evict oldest if over limit
+        if (this.cache.size > this.options.cacheSize) {
+            const oldest = this.cacheOrder.shift();
+            if (oldest) {
+                this.cache.delete(oldest);
+            }
+        }
+    }
+    /**
+     * Embed text with timeout protection
+     */
+    async embedWithTimeout(text) {
+        return Promise.race([
+            this.brain.embed(text),
+            new Promise((_, reject) => setTimeout(() => reject(new Error('Embedding timeout')), this.options.timeout))
+        ]);
+    }
+    /**
+     * Calculate cosine similarity between two vectors
+     */
+    cosineSimilarity(a, b) {
+        if (a.length !== b.length) {
+            throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`);
+        }
+        let dotProduct = 0;
+        let normA = 0;
+        let normB = 0;
+        for (let i = 0; i < a.length; i++) {
+            dotProduct += a[i] * b[i];
+            normA += a[i] * a[i];
+            normB += b[i] * b[i];
+        }
+        const denominator = Math.sqrt(normA) * Math.sqrt(normB);
+        if (denominator === 0)
+            return 0;
+        return dotProduct / denominator;
+    }
+}
+/**
+ * Create a new EmbeddingSignal instance
+ *
+ * Convenience factory function
+ */
+export function createEmbeddingSignal(brain, options) {
+    return new EmbeddingSignal(brain, options);
+}
+//# sourceMappingURL=EmbeddingSignal.js.map