@soulcraft/brainy 6.4.0 → 6.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/models/all-MiniLM-L6-v2-q8/config.json +25 -0
- package/assets/models/all-MiniLM-L6-v2-q8/model.onnx +0 -0
- package/assets/models/all-MiniLM-L6-v2-q8/tokenizer.json +30686 -0
- package/assets/models/all-MiniLM-L6-v2-q8/vocab.json +1 -0
- package/dist/critical/model-guardian.d.ts +5 -22
- package/dist/critical/model-guardian.js +38 -210
- package/dist/embeddings/EmbeddingManager.d.ts +7 -17
- package/dist/embeddings/EmbeddingManager.js +28 -136
- package/dist/embeddings/wasm/AssetLoader.d.ts +67 -0
- package/dist/embeddings/wasm/AssetLoader.js +238 -0
- package/dist/embeddings/wasm/EmbeddingPostProcessor.d.ts +60 -0
- package/dist/embeddings/wasm/EmbeddingPostProcessor.js +123 -0
- package/dist/embeddings/wasm/ONNXInferenceEngine.d.ts +55 -0
- package/dist/embeddings/wasm/ONNXInferenceEngine.js +154 -0
- package/dist/embeddings/wasm/WASMEmbeddingEngine.d.ts +82 -0
- package/dist/embeddings/wasm/WASMEmbeddingEngine.js +231 -0
- package/dist/embeddings/wasm/WordPieceTokenizer.d.ts +71 -0
- package/dist/embeddings/wasm/WordPieceTokenizer.js +264 -0
- package/dist/embeddings/wasm/index.d.ts +13 -0
- package/dist/embeddings/wasm/index.js +15 -0
- package/dist/embeddings/wasm/types.d.ts +114 -0
- package/dist/embeddings/wasm/types.js +25 -0
- package/dist/setup.d.ts +11 -11
- package/dist/setup.js +17 -31
- package/dist/utils/embedding.d.ts +45 -62
- package/dist/utils/embedding.js +61 -440
- package/dist/vfs/VirtualFileSystem.d.ts +14 -0
- package/dist/vfs/VirtualFileSystem.js +56 -6
- package/package.json +10 -3
- package/scripts/download-model.cjs +175 -0
package/dist/embeddings/wasm/WordPieceTokenizer.d.ts
ADDED
@@ -0,0 +1,71 @@
+/**
+ * WordPiece Tokenizer for BERT-based models
+ *
+ * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
+ * This is a clean, dependency-free implementation.
+ *
+ * Algorithm:
+ * 1. Normalize text (lowercase for uncased models)
+ * 2. Split on whitespace and punctuation
+ * 3. Apply WordPiece subword tokenization
+ * 4. Add special tokens ([CLS], [SEP])
+ * 5. Generate attention mask
+ */
+import { TokenizerConfig, TokenizedInput } from './types.js';
+/**
+ * WordPiece tokenizer for BERT-based sentence transformers
+ */
+export declare class WordPieceTokenizer {
+    private vocab;
+    private reverseVocab;
+    private config;
+    constructor(vocab: Map<string, number> | Record<string, number>, config?: Partial<TokenizerConfig>);
+    /**
+     * Tokenize text into token IDs
+     */
+    encode(text: string): TokenizedInput;
+    /**
+     * Encode with padding to fixed length
+     */
+    encodeWithPadding(text: string, targetLength?: number): TokenizedInput;
+    /**
+     * Batch encode multiple texts
+     */
+    encodeBatch(texts: string[]): {
+        inputIds: number[][];
+        attentionMask: number[][];
+        tokenTypeIds: number[][];
+    };
+    /**
+     * Basic tokenization: split on whitespace and punctuation
+     */
+    private basicTokenize;
+    /**
+     * WordPiece tokenization for a single word
+     */
+    private wordPieceTokenize;
+    /**
+     * Check if character is whitespace
+     */
+    private isWhitespace;
+    /**
+     * Check if character is punctuation
+     */
+    private isPunctuation;
+    /**
+     * Decode token IDs back to text (for debugging)
+     */
+    decode(tokenIds: number[]): string;
+    /**
+     * Get vocabulary size
+     */
+    get vocabSize(): number;
+    /**
+     * Get max sequence length
+     */
+    get maxLength(): number;
+}
+/**
+ * Create tokenizer from vocabulary JSON
+ */
+export declare function createTokenizer(vocabJson: Record<string, number>): WordPieceTokenizer;

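For orientation, a minimal usage sketch of the API declared above. This is not code from the package: the relative import specifier and the shape of vocab.json (a flat token → id map, as implied by createTokenizer's parameter type) are assumptions.

// Hypothetical usage of the WordPieceTokenizer API (sketch only).
import { readFileSync } from 'node:fs';
import { createTokenizer } from './dist/embeddings/wasm/WordPieceTokenizer.js';

// Assumed: vocab.json is a flat { token: id } record (see the bundled
// assets/models/all-MiniLM-L6-v2-q8/vocab.json in the file list above).
const vocabJson: Record<string, number> = JSON.parse(
    readFileSync('assets/models/all-MiniLM-L6-v2-q8/vocab.json', 'utf8')
);
const tokenizer = createTokenizer(vocabJson);

// Variable-length encoding: [CLS] ... [SEP], attention mask of all 1s.
const encoded = tokenizer.encode('Brainy embeds text with WordPiece.');
console.log(encoded.inputIds.length, encoded.tokenCount);

// Fixed-length encoding: padded with [PAD], mask 0 over the padding.
const padded = tokenizer.encodeWithPadding('short text', 16);
console.log(padded.attentionMask); // 1s for real tokens, then 0s

// Round-trip for debugging; special tokens are dropped by decode().
console.log(tokenizer.decode(encoded.inputIds));
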
package/dist/embeddings/wasm/WordPieceTokenizer.js
ADDED
@@ -0,0 +1,264 @@
+/**
+ * WordPiece Tokenizer for BERT-based models
+ *
+ * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
+ * This is a clean, dependency-free implementation.
+ *
+ * Algorithm:
+ * 1. Normalize text (lowercase for uncased models)
+ * 2. Split on whitespace and punctuation
+ * 3. Apply WordPiece subword tokenization
+ * 4. Add special tokens ([CLS], [SEP])
+ * 5. Generate attention mask
+ */
+import { SPECIAL_TOKENS, MODEL_CONSTANTS, } from './types.js';
+/**
+ * WordPiece tokenizer for BERT-based sentence transformers
+ */
+export class WordPieceTokenizer {
+    constructor(vocab, config) {
+        // Convert Record to Map if needed
+        this.vocab = vocab instanceof Map ? vocab : new Map(Object.entries(vocab));
+        // Build reverse vocab for debugging
+        this.reverseVocab = new Map();
+        for (const [token, id] of this.vocab) {
+            this.reverseVocab.set(id, token);
+        }
+        // Default config for all-MiniLM-L6-v2
+        this.config = {
+            vocab: this.vocab,
+            unkTokenId: config?.unkTokenId ?? SPECIAL_TOKENS.UNK,
+            clsTokenId: config?.clsTokenId ?? SPECIAL_TOKENS.CLS,
+            sepTokenId: config?.sepTokenId ?? SPECIAL_TOKENS.SEP,
+            padTokenId: config?.padTokenId ?? SPECIAL_TOKENS.PAD,
+            maxLength: config?.maxLength ?? MODEL_CONSTANTS.MAX_SEQUENCE_LENGTH,
+            doLowerCase: config?.doLowerCase ?? true,
+        };
+    }
+    /**
+     * Tokenize text into token IDs
+     */
+    encode(text) {
+        // 1. Normalize
+        let normalizedText = text;
+        if (this.config.doLowerCase) {
+            normalizedText = text.toLowerCase();
+        }
+        // 2. Clean and split into words
+        const words = this.basicTokenize(normalizedText);
+        // 3. Apply WordPiece to each word
+        const tokens = [this.config.clsTokenId];
+        for (const word of words) {
+            const wordTokens = this.wordPieceTokenize(word);
+            // Check if adding these tokens would exceed max length (accounting for [SEP])
+            if (tokens.length + wordTokens.length + 1 > this.config.maxLength) {
+                break;
+            }
+            tokens.push(...wordTokens);
+        }
+        tokens.push(this.config.sepTokenId);
+        // 4. Generate attention mask and token type IDs
+        const attentionMask = new Array(tokens.length).fill(1);
+        const tokenTypeIds = new Array(tokens.length).fill(0);
+        return {
+            inputIds: tokens,
+            attentionMask,
+            tokenTypeIds,
+            tokenCount: tokens.length - 2, // Exclude [CLS] and [SEP]
+        };
+    }
+    /**
+     * Encode with padding to fixed length
+     */
+    encodeWithPadding(text, targetLength) {
+        const result = this.encode(text);
+        const padLength = targetLength ?? this.config.maxLength;
+        // Pad to target length
+        while (result.inputIds.length < padLength) {
+            result.inputIds.push(this.config.padTokenId);
+            result.attentionMask.push(0);
+            result.tokenTypeIds.push(0);
+        }
+        // Truncate if longer (shouldn't happen with proper encode())
+        if (result.inputIds.length > padLength) {
+            result.inputIds.length = padLength;
+            result.attentionMask.length = padLength;
+            result.tokenTypeIds.length = padLength;
+            // Ensure [SEP] is at the end
+            result.inputIds[padLength - 1] = this.config.sepTokenId;
+            result.attentionMask[padLength - 1] = 1;
+        }
+        return result;
+    }
+    /**
+     * Batch encode multiple texts
+     */
+    encodeBatch(texts) {
+        const results = texts.map((text) => this.encode(text));
+        // Find max length in batch
+        const maxLen = Math.max(...results.map((r) => r.inputIds.length));
+        // Pad all to same length
+        const inputIds = [];
+        const attentionMask = [];
+        const tokenTypeIds = [];
+        for (const result of results) {
+            const padded = this.encodeWithPadding('', // Not used since we're modifying result
+            maxLen);
+            // Copy original values
+            for (let i = 0; i < result.inputIds.length; i++) {
+                padded.inputIds[i] = result.inputIds[i];
+                padded.attentionMask[i] = result.attentionMask[i];
+                padded.tokenTypeIds[i] = result.tokenTypeIds[i];
+            }
+            // Pad the rest
+            for (let i = result.inputIds.length; i < maxLen; i++) {
+                padded.inputIds[i] = this.config.padTokenId;
+                padded.attentionMask[i] = 0;
+                padded.tokenTypeIds[i] = 0;
+            }
+            inputIds.push(padded.inputIds.slice(0, maxLen));
+            attentionMask.push(padded.attentionMask.slice(0, maxLen));
+            tokenTypeIds.push(padded.tokenTypeIds.slice(0, maxLen));
+        }
+        return { inputIds, attentionMask, tokenTypeIds };
+    }
+    /**
+     * Basic tokenization: split on whitespace and punctuation
+     */
+    basicTokenize(text) {
+        // Clean whitespace
+        text = text.trim().replace(/\s+/g, ' ');
+        if (!text) {
+            return [];
+        }
+        const words = [];
+        let currentWord = '';
+        for (const char of text) {
+            if (this.isWhitespace(char)) {
+                if (currentWord) {
+                    words.push(currentWord);
+                    currentWord = '';
+                }
+            }
+            else if (this.isPunctuation(char)) {
+                if (currentWord) {
+                    words.push(currentWord);
+                    currentWord = '';
+                }
+                words.push(char);
+            }
+            else {
+                currentWord += char;
+            }
+        }
+        if (currentWord) {
+            words.push(currentWord);
+        }
+        return words;
+    }
+    /**
+     * WordPiece tokenization for a single word
+     */
+    wordPieceTokenize(word) {
+        if (!word) {
+            return [];
+        }
+        // Check if whole word is in vocabulary
+        if (this.vocab.has(word)) {
+            return [this.vocab.get(word)];
+        }
+        const tokens = [];
+        let start = 0;
+        while (start < word.length) {
+            let end = word.length;
+            let foundToken = false;
+            while (start < end) {
+                let substr = word.slice(start, end);
+                // Add ## prefix for subwords (not at start of word)
+                if (start > 0) {
+                    substr = '##' + substr;
+                }
+                if (this.vocab.has(substr)) {
+                    tokens.push(this.vocab.get(substr));
+                    foundToken = true;
+                    break;
+                }
+                end--;
+            }
+            if (!foundToken) {
+                // Unknown character - use [UNK] for single character
+                tokens.push(this.config.unkTokenId);
+                start++;
+            }
+            else {
+                start = end;
+            }
+        }
+        return tokens;
+    }
+    /**
+     * Check if character is whitespace
+     */
+    isWhitespace(char) {
+        return /\s/.test(char);
+    }
+    /**
+     * Check if character is punctuation
+     */
+    isPunctuation(char) {
+        const code = char.charCodeAt(0);
+        // ASCII punctuation ranges
+        if ((code >= 33 && code <= 47) || // !"#$%&'()*+,-./
+            (code >= 58 && code <= 64) || // :;<=>?@
+            (code >= 91 && code <= 96) || // [\]^_`
+            (code >= 123 && code <= 126) // {|}~
+        ) {
+            return true;
+        }
+        // Unicode punctuation categories
+        return /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]/.test(char);
+    }
+    /**
+     * Decode token IDs back to text (for debugging)
+     */
+    decode(tokenIds) {
+        const tokens = [];
+        for (const id of tokenIds) {
+            const token = this.reverseVocab.get(id);
+            if (token && !['[CLS]', '[SEP]', '[PAD]'].includes(token)) {
+                if (token.startsWith('##')) {
+                    // Subword - append without space
+                    if (tokens.length > 0) {
+                        tokens[tokens.length - 1] += token.slice(2);
+                    }
+                    else {
+                        tokens.push(token.slice(2));
+                    }
+                }
+                else {
+                    tokens.push(token);
+                }
+            }
+        }
+        return tokens.join(' ');
+    }
+    /**
+     * Get vocabulary size
+     */
+    get vocabSize() {
+        return this.vocab.size;
+    }
+    /**
+     * Get max sequence length
+     */
+    get maxLength() {
+        return this.config.maxLength;
+    }
+}
+/**
+ * Create tokenizer from vocabulary JSON
+ */
+export function createTokenizer(vocabJson) {
+    return new WordPieceTokenizer(vocabJson);
+}
+//# sourceMappingURL=WordPieceTokenizer.js.map

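To make the greedy longest-match loop in wordPieceTokenize concrete, a toy walk-through with a hypothetical tiny vocabulary (the real vocab.json has 30,522 entries; the IDs below are illustrative except for the special tokens):

import { WordPieceTokenizer } from './WordPieceTokenizer.js';

// Hypothetical toy vocabulary; only the special-token IDs match the real model.
const toyVocab: Record<string, number> = {
    '[PAD]': 0, '[UNK]': 100, '[CLS]': 101, '[SEP]': 102,
    'play': 5, '##ing': 6,
};
const tok = new WordPieceTokenizer(toyVocab);

// "playing" is not in the vocab, so the inner loop matches the longest
// prefix "play", then restarts on the remainder with a "##" prefix: "##ing".
console.log(tok.encode('playing').inputIds); // [101, 5, 6, 102]

// A word with no matching piece at all falls back to one [UNK] per character.
console.log(tok.encode('zzz').inputIds); // [101, 100, 100, 100, 102]
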
package/dist/embeddings/wasm/index.d.ts
ADDED
@@ -0,0 +1,13 @@
+/**
+ * WASM Embedding Engine - Public Exports
+ *
+ * Clean, production-grade embedding engine using direct ONNX WASM.
+ * No transformers.js dependency, no runtime downloads, works everywhere.
+ */
+export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
+export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
+export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
+export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
+export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
+export type { TokenizerConfig, TokenizedInput, InferenceConfig, EmbeddingResult, EngineStats, ModelConfig, } from './types.js';
+export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';

package/dist/embeddings/wasm/index.js
ADDED
@@ -0,0 +1,15 @@
+/**
+ * WASM Embedding Engine - Public Exports
+ *
+ * Clean, production-grade embedding engine using direct ONNX WASM.
+ * No transformers.js dependency, no runtime downloads, works everywhere.
+ */
+// Main engine
+export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
+// Components (for advanced use)
+export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
+export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
+export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
+export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
+export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
+//# sourceMappingURL=index.js.map

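A sketch of consuming this barrel module. The constants and classes used below are exactly what the barrel re-exports; the embed() call is an assumption, since this diff shows only that the symbol is exported, not its signature.

// Sketch only: the import path is relative to dist/embeddings/wasm/.
import {
    createTokenizer,
    SPECIAL_TOKENS,
    MODEL_CONSTANTS,
    embed, // exported here, but its signature is not shown in this diff
} from './index.js';

console.log(MODEL_CONSTANTS.MODEL_NAME);   // 'all-MiniLM-L6-v2'
console.log(MODEL_CONSTANTS.HIDDEN_SIZE);  // 384
console.log(SPECIAL_TOKENS.CLS, SPECIAL_TOKENS.SEP); // 101 102

// Assumed shape: embed(text) resolves to something carrying a
// 384-dimensional vector (compare EmbeddingResult in types.d.ts below).
const result = await embed('hello world');
console.log(result);
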
package/dist/embeddings/wasm/types.d.ts
ADDED
@@ -0,0 +1,114 @@
+/**
+ * Type definitions for WASM Embedding Engine
+ *
+ * Clean, production-grade types for direct ONNX WASM embeddings.
+ */
+/**
+ * Tokenizer configuration for WordPiece
+ */
+export interface TokenizerConfig {
+    /** Vocabulary mapping word → token ID */
+    vocab: Map<string, number>;
+    /** [UNK] token ID (100 for BERT-based models) */
+    unkTokenId: number;
+    /** [CLS] token ID (101 for BERT-based models) */
+    clsTokenId: number;
+    /** [SEP] token ID (102 for BERT-based models) */
+    sepTokenId: number;
+    /** [PAD] token ID (0 for BERT-based models) */
+    padTokenId: number;
+    /** Maximum sequence length (512 for all-MiniLM-L6-v2) */
+    maxLength: number;
+    /** Whether to lowercase input (true for uncased models) */
+    doLowerCase: boolean;
+}
+/**
+ * Result of tokenization
+ */
+export interface TokenizedInput {
+    /** Token IDs including [CLS] and [SEP] */
+    inputIds: number[];
+    /** Attention mask (1 for real tokens, 0 for padding) */
+    attentionMask: number[];
+    /** Token type IDs (all 0 for single sentence) */
+    tokenTypeIds: number[];
+    /** Number of tokens (excluding special tokens) */
+    tokenCount: number;
+}
+/**
+ * ONNX inference engine configuration
+ */
+export interface InferenceConfig {
+    /** Path to ONNX model file */
+    modelPath: string;
+    /** Path to WASM files directory */
+    wasmPath?: string;
+    /** Number of threads (1 for universal compatibility) */
+    numThreads: number;
+    /** Enable SIMD if available */
+    enableSimd: boolean;
+    /** Enable CPU memory arena (false for memory efficiency) */
+    enableCpuMemArena: boolean;
+}
+/**
+ * Embedding result with metadata
+ */
+export interface EmbeddingResult {
+    /** 384-dimensional embedding vector */
+    embedding: number[];
+    /** Number of tokens processed */
+    tokenCount: number;
+    /** Processing time in milliseconds */
+    processingTimeMs: number;
+}
+/**
+ * Engine statistics
+ */
+export interface EngineStats {
+    /** Whether the engine is initialized */
+    initialized: boolean;
+    /** Total number of embeddings generated */
+    embedCount: number;
+    /** Total processing time in milliseconds */
+    totalProcessingTimeMs: number;
+    /** Average processing time per embedding */
+    avgProcessingTimeMs: number;
+    /** Model name */
+    modelName: string;
+}
+/**
+ * Model configuration (from config.json)
+ */
+export interface ModelConfig {
+    /** Model architecture type */
+    architectures: string[];
+    /** Hidden size (384 for all-MiniLM-L6-v2) */
+    hidden_size: number;
+    /** Number of attention heads */
+    num_attention_heads: number;
+    /** Number of hidden layers */
+    num_hidden_layers: number;
+    /** Vocabulary size */
+    vocab_size: number;
+    /** Maximum position embeddings */
+    max_position_embeddings: number;
+}
+/**
+ * Special token IDs for BERT-based models
+ */
+export declare const SPECIAL_TOKENS: {
+    readonly PAD: 0;
+    readonly UNK: 100;
+    readonly CLS: 101;
+    readonly SEP: 102;
+    readonly MASK: 103;
+};
+/**
+ * Model constants for all-MiniLM-L6-v2
+ */
+export declare const MODEL_CONSTANTS: {
+    readonly HIDDEN_SIZE: 384;
+    readonly MAX_SEQUENCE_LENGTH: 512;
+    readonly VOCAB_SIZE: 30522;
+    readonly MODEL_NAME: "all-MiniLM-L6-v2";
+};

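The doc comments above pin down concrete values for the bundled model. For illustration, config literals consistent with them; the modelPath points at the asset bundled in this release, but treating it as a plain filesystem path is an assumption (the AssetLoader may resolve assets differently).

import type { InferenceConfig, TokenizerConfig } from './types.js';
import { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';

// Values taken from the doc comments; the path is illustrative.
const inference: InferenceConfig = {
    modelPath: 'assets/models/all-MiniLM-L6-v2-q8/model.onnx',
    numThreads: 1,            // "1 for universal compatibility"
    enableSimd: true,         // "Enable SIMD if available"
    enableCpuMemArena: false, // "false for memory efficiency"
};

const tokenizerConfig: TokenizerConfig = {
    vocab: new Map<string, number>(), // filled from vocab.json in practice
    unkTokenId: SPECIAL_TOKENS.UNK,   // 100
    clsTokenId: SPECIAL_TOKENS.CLS,   // 101
    sepTokenId: SPECIAL_TOKENS.SEP,   // 102
    padTokenId: SPECIAL_TOKENS.PAD,   // 0
    maxLength: MODEL_CONSTANTS.MAX_SEQUENCE_LENGTH, // 512
    doLowerCase: true,                // uncased model
};

console.log(inference, tokenizerConfig);
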
package/dist/embeddings/wasm/types.js
ADDED
@@ -0,0 +1,25 @@
+/**
+ * Type definitions for WASM Embedding Engine
+ *
+ * Clean, production-grade types for direct ONNX WASM embeddings.
+ */
+/**
+ * Special token IDs for BERT-based models
+ */
+export const SPECIAL_TOKENS = {
+    PAD: 0,
+    UNK: 100,
+    CLS: 101,
+    SEP: 102,
+    MASK: 103,
+};
+/**
+ * Model constants for all-MiniLM-L6-v2
+ */
+export const MODEL_CONSTANTS = {
+    HIDDEN_SIZE: 384,
+    MAX_SEQUENCE_LENGTH: 512,
+    VOCAB_SIZE: 30522,
+    MODEL_NAME: 'all-MiniLM-L6-v2',
+};
+//# sourceMappingURL=types.js.map

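These runtime constants make the tokenizer's invariants easy to spot-check. A small sketch, again using a hypothetical toy vocabulary (only the special-token IDs match the real model):

import { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
import { createTokenizer } from './WordPieceTokenizer.js';

// Toy vocab; real usage loads the 30,522-entry vocab.json.
const tok = createTokenizer({ '[PAD]': 0, '[UNK]': 100, '[CLS]': 101, '[SEP]': 102, 'hello': 7592 });
const out = tok.encode('hello hello');

console.assert(out.inputIds[0] === SPECIAL_TOKENS.CLS);                       // starts with [CLS]
console.assert(out.inputIds[out.inputIds.length - 1] === SPECIAL_TOKENS.SEP); // ends with [SEP]
console.assert(out.tokenCount === out.inputIds.length - 2);                   // count excludes the two specials
console.assert(out.inputIds.length <= MODEL_CONSTANTS.MAX_SEQUENCE_LENGTH);   // encode() truncates at 512
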
package/dist/setup.d.ts
CHANGED
@@ -1,17 +1,17 @@
 /**
- *
- * for Node.js compatibility before any other library code runs.
+ * Brainy Setup - Minimal Polyfills
  *
- *
- *
- *
+ * ARCHITECTURE (v7.0.0):
+ * Brainy uses direct ONNX WASM for embeddings.
+ * No transformers.js dependency, no hacks required.
  *
- * This file
- *
- * result in errors like "TextEncoder is not a constructor" when the package
- * is used in Node.js environments.
+ * This file provides minimal polyfills for cross-environment compatibility:
+ * - TextEncoder/TextDecoder for older environments
  *
- *
- *
+ * BENEFITS:
+ * - Clean codebase with no workarounds
+ * - Works everywhere: Node.js, Bun, Bun --compile, browsers, Deno
+ * - No platform-specific binaries
+ * - Model bundled in package (no runtime downloads)
  */
 export {};

package/dist/setup.js
CHANGED
@@ -1,45 +1,31 @@
 /**
- *
- * for Node.js compatibility before any other library code runs.
+ * Brainy Setup - Minimal Polyfills
  *
- *
- *
- *
+ * ARCHITECTURE (v7.0.0):
+ * Brainy uses direct ONNX WASM for embeddings.
+ * No transformers.js dependency, no hacks required.
  *
- * This file
- *
- * result in errors like "TextEncoder is not a constructor" when the package
- * is used in Node.js environments.
+ * This file provides minimal polyfills for cross-environment compatibility:
+ * - TextEncoder/TextDecoder for older environments
  *
- *
- *
+ * BENEFITS:
+ * - Clean codebase with no workarounds
+ * - Works everywhere: Node.js, Bun, Bun --compile, browsers, Deno
+ * - No platform-specific binaries
+ * - Model bundled in package (no runtime downloads)
  */
-//
-
-
-
-    if (typeof global !== 'undefined')
-        return global;
-    if (typeof self !== 'undefined')
-        return self;
-    return null; // No global object available
-})();
-// Define TextEncoder and TextDecoder globally to make sure they're available
-// Now works across all environments: Node.js, serverless, and other server environments
+// ============================================================================
+// TextEncoder/TextDecoder Polyfills
+// ============================================================================
+const globalObj = globalThis ?? global ?? self;
 if (globalObj) {
-    if (!globalObj.TextEncoder)
+    if (!globalObj.TextEncoder)
         globalObj.TextEncoder = TextEncoder;
-
-    if (!globalObj.TextDecoder) {
+    if (!globalObj.TextDecoder)
         globalObj.TextDecoder = TextDecoder;
-    }
-    // Create special global constructors for library compatibility
     globalObj.__TextEncoder__ = TextEncoder;
     globalObj.__TextDecoder__ = TextDecoder;
 }
-// Also import normally for ES modules environments
 import { applyTensorFlowPatch } from './utils/textEncoding.js';
-// Apply the TextEncoder/TextDecoder compatibility patch
 applyTensorFlowPatch();
-console.log('Applied TextEncoder/TextDecoder patch via ES modules in setup.ts');
 //# sourceMappingURL=setup.js.map