@soulcraft/brainy 6.5.0 → 6.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/assets/models/all-MiniLM-L6-v2-q8/config.json +25 -0
  2. package/assets/models/all-MiniLM-L6-v2-q8/model.onnx +0 -0
  3. package/assets/models/all-MiniLM-L6-v2-q8/tokenizer.json +30686 -0
  4. package/assets/models/all-MiniLM-L6-v2-q8/vocab.json +1 -0
  5. package/dist/brainy.js +0 -6
  6. package/dist/config/index.d.ts +1 -3
  7. package/dist/config/index.js +2 -4
  8. package/dist/config/modelAutoConfig.d.ts +10 -17
  9. package/dist/config/modelAutoConfig.js +15 -88
  10. package/dist/config/sharedConfigManager.d.ts +1 -2
  11. package/dist/config/zeroConfig.d.ts +2 -13
  12. package/dist/config/zeroConfig.js +7 -15
  13. package/dist/critical/model-guardian.d.ts +5 -22
  14. package/dist/critical/model-guardian.js +38 -210
  15. package/dist/embeddings/EmbeddingManager.d.ts +7 -17
  16. package/dist/embeddings/EmbeddingManager.js +28 -136
  17. package/dist/embeddings/wasm/AssetLoader.d.ts +67 -0
  18. package/dist/embeddings/wasm/AssetLoader.js +238 -0
  19. package/dist/embeddings/wasm/EmbeddingPostProcessor.d.ts +60 -0
  20. package/dist/embeddings/wasm/EmbeddingPostProcessor.js +123 -0
  21. package/dist/embeddings/wasm/ONNXInferenceEngine.d.ts +55 -0
  22. package/dist/embeddings/wasm/ONNXInferenceEngine.js +154 -0
  23. package/dist/embeddings/wasm/WASMEmbeddingEngine.d.ts +82 -0
  24. package/dist/embeddings/wasm/WASMEmbeddingEngine.js +231 -0
  25. package/dist/embeddings/wasm/WordPieceTokenizer.d.ts +71 -0
  26. package/dist/embeddings/wasm/WordPieceTokenizer.js +264 -0
  27. package/dist/embeddings/wasm/index.d.ts +13 -0
  28. package/dist/embeddings/wasm/index.js +15 -0
  29. package/dist/embeddings/wasm/types.d.ts +114 -0
  30. package/dist/embeddings/wasm/types.js +25 -0
  31. package/dist/setup.d.ts +11 -11
  32. package/dist/setup.js +17 -31
  33. package/dist/types/brainy.types.d.ts +0 -5
  34. package/dist/utils/embedding.d.ts +45 -62
  35. package/dist/utils/embedding.js +61 -440
  36. package/package.json +10 -3
  37. package/scripts/download-model.cjs +175 -0
@@ -0,0 +1,82 @@
1
+ /**
2
+ * WASM Embedding Engine
3
+ *
4
+ * The main embedding engine that combines all components:
5
+ * - WordPieceTokenizer: Text → Token IDs
6
+ * - ONNXInferenceEngine: Token IDs → Hidden States
7
+ * - EmbeddingPostProcessor: Hidden States → Normalized Embedding
8
+ *
9
+ * This replaces transformers.js with a clean, production-grade implementation.
10
+ *
11
+ * Features:
12
+ * - Singleton pattern (one model instance)
13
+ * - Lazy initialization
14
+ * - Batch processing support
15
+ * - Zero runtime dependencies
16
+ */
17
+ import { EmbeddingResult, EngineStats } from './types.js';
18
+ /**
19
+ * WASM-based embedding engine
20
+ */
21
+ export declare class WASMEmbeddingEngine {
22
+ private tokenizer;
23
+ private inference;
24
+ private postProcessor;
25
+ private initialized;
26
+ private embedCount;
27
+ private totalProcessingTimeMs;
28
+ private constructor();
29
+ /**
30
+ * Get the singleton instance
31
+ */
32
+ static getInstance(): WASMEmbeddingEngine;
33
+ /**
34
+ * Initialize all components
35
+ */
36
+ initialize(): Promise<void>;
37
+ /**
38
+ * Perform actual initialization
39
+ */
40
+ private performInit;
41
+ /**
42
+ * Generate embedding for text
43
+ */
44
+ embed(text: string): Promise<number[]>;
45
+ /**
46
+ * Generate embedding with metadata
47
+ */
48
+ embedWithMetadata(text: string): Promise<EmbeddingResult>;
49
+ /**
50
+ * Batch embed multiple texts
51
+ */
52
+ embedBatch(texts: string[]): Promise<number[][]>;
53
+ /**
54
+ * Check if initialized
55
+ */
56
+ isInitialized(): boolean;
57
+ /**
58
+ * Get engine statistics
59
+ */
60
+ getStats(): EngineStats;
61
+ /**
62
+ * Dispose and free resources
63
+ */
64
+ dispose(): Promise<void>;
65
+ /**
66
+ * Reset singleton (for testing)
67
+ */
68
+ static resetInstance(): void;
69
+ }
70
+ export declare const wasmEmbeddingEngine: WASMEmbeddingEngine;
71
+ /**
72
+ * Convenience function to get embeddings
73
+ */
74
+ export declare function embed(text: string): Promise<number[]>;
75
+ /**
76
+ * Convenience function for batch embeddings
77
+ */
78
+ export declare function embedBatch(texts: string[]): Promise<number[][]>;
79
+ /**
80
+ * Get embedding stats
81
+ */
82
+ export declare function getEmbeddingStats(): EngineStats;
@@ -0,0 +1,231 @@
1
+ /**
2
+ * WASM Embedding Engine
3
+ *
4
+ * The main embedding engine that combines all components:
5
+ * - WordPieceTokenizer: Text → Token IDs
6
+ * - ONNXInferenceEngine: Token IDs → Hidden States
7
+ * - EmbeddingPostProcessor: Hidden States → Normalized Embedding
8
+ *
9
+ * This replaces transformers.js with a clean, production-grade implementation.
10
+ *
11
+ * Features:
12
+ * - Singleton pattern (one model instance)
13
+ * - Lazy initialization
14
+ * - Batch processing support
15
+ * - Zero runtime dependencies
16
+ */
17
+ import { WordPieceTokenizer } from './WordPieceTokenizer.js';
18
+ import { ONNXInferenceEngine } from './ONNXInferenceEngine.js';
19
+ import { EmbeddingPostProcessor } from './EmbeddingPostProcessor.js';
20
+ import { getAssetLoader } from './AssetLoader.js';
21
+ import { MODEL_CONSTANTS } from './types.js';
22
+ // Global singleton instance
23
+ let globalInstance = null;
24
+ let globalInitPromise = null;
25
+ /**
26
+ * WASM-based embedding engine
27
+ */
28
+ export class WASMEmbeddingEngine {
29
+ constructor() {
30
+ this.tokenizer = null;
31
+ this.inference = null;
32
+ this.postProcessor = null;
33
+ this.initialized = false;
34
+ this.embedCount = 0;
35
+ this.totalProcessingTimeMs = 0;
36
+ // Private constructor for singleton
37
+ }
38
+ /**
39
+ * Get the singleton instance
40
+ */
41
+ static getInstance() {
42
+ if (!globalInstance) {
43
+ globalInstance = new WASMEmbeddingEngine();
44
+ }
45
+ return globalInstance;
46
+ }
47
+ /**
48
+ * Initialize all components
49
+ */
50
+ async initialize() {
51
+ // Already initialized
52
+ if (this.initialized) {
53
+ return;
54
+ }
55
+ // Initialization in progress
56
+ if (globalInitPromise) {
57
+ await globalInitPromise;
58
+ return;
59
+ }
60
+ // Start initialization
61
+ globalInitPromise = this.performInit();
62
+ try {
63
+ await globalInitPromise;
64
+ }
65
+ finally {
66
+ globalInitPromise = null;
67
+ }
68
+ }
69
+ /**
70
+ * Perform actual initialization
71
+ */
72
+ async performInit() {
73
+ const startTime = Date.now();
74
+ console.log('🚀 Initializing WASM Embedding Engine...');
75
+ try {
76
+ const assetLoader = getAssetLoader();
77
+ // Verify assets exist
78
+ const verification = await assetLoader.verifyAssets();
79
+ if (!verification.valid) {
80
+ throw new Error(`Missing model assets:\n${verification.errors.join('\n')}\n\n` +
81
+ `Expected model at: ${verification.modelPath}\n` +
82
+ `Expected vocab at: ${verification.vocabPath}\n\n` +
83
+ `Run 'npm run download-model' to download the model files.`);
84
+ }
85
+ // Load vocabulary and create tokenizer
86
+ console.log('📖 Loading vocabulary...');
87
+ const vocab = await assetLoader.loadVocab();
88
+ this.tokenizer = new WordPieceTokenizer(vocab);
89
+ console.log(`✅ Vocabulary loaded: ${this.tokenizer.vocabSize} tokens`);
90
+ // Initialize ONNX inference engine
91
+ console.log('🧠 Loading ONNX model...');
92
+ const modelPath = await assetLoader.getModelPath();
93
+ this.inference = new ONNXInferenceEngine({ modelPath });
94
+ await this.inference.initialize(modelPath);
95
+ console.log('✅ ONNX model loaded');
96
+ // Create post-processor
97
+ this.postProcessor = new EmbeddingPostProcessor(MODEL_CONSTANTS.HIDDEN_SIZE);
98
+ this.initialized = true;
99
+ const initTime = Date.now() - startTime;
100
+ console.log(`✅ WASM Embedding Engine ready in ${initTime}ms`);
101
+ }
102
+ catch (error) {
103
+ this.initialized = false;
104
+ this.tokenizer = null;
105
+ this.inference = null;
106
+ this.postProcessor = null;
107
+ throw new Error(`Failed to initialize WASM Embedding Engine: ${error instanceof Error ? error.message : String(error)}`);
108
+ }
109
+ }
110
+ /**
111
+ * Generate embedding for text
112
+ */
113
+ async embed(text) {
114
+ const result = await this.embedWithMetadata(text);
115
+ return result.embedding;
116
+ }
117
+ /**
118
+ * Generate embedding with metadata
119
+ */
120
+ async embedWithMetadata(text) {
121
+ // Ensure initialized
122
+ if (!this.initialized) {
123
+ await this.initialize();
124
+ }
125
+ if (!this.tokenizer || !this.inference || !this.postProcessor) {
126
+ throw new Error('Engine not properly initialized');
127
+ }
128
+ const startTime = Date.now();
129
+ // 1. Tokenize
130
+ const tokenized = this.tokenizer.encode(text);
131
+ // 2. Run inference
132
+ const hiddenStates = await this.inference.inferSingle(tokenized.inputIds, tokenized.attentionMask, tokenized.tokenTypeIds);
133
+ // 3. Post-process (mean pool + normalize)
134
+ const embedding = this.postProcessor.process(hiddenStates, tokenized.attentionMask, tokenized.inputIds.length);
135
+ const processingTimeMs = Date.now() - startTime;
136
+ this.embedCount++;
137
+ this.totalProcessingTimeMs += processingTimeMs;
138
+ return {
139
+ embedding: Array.from(embedding),
140
+ tokenCount: tokenized.tokenCount,
141
+ processingTimeMs,
142
+ };
143
+ }
144
+ /**
145
+ * Batch embed multiple texts
146
+ */
147
+ async embedBatch(texts) {
148
+ // Ensure initialized
149
+ if (!this.initialized) {
150
+ await this.initialize();
151
+ }
152
+ if (!this.tokenizer || !this.inference || !this.postProcessor) {
153
+ throw new Error('Engine not properly initialized');
154
+ }
155
+ if (texts.length === 0) {
156
+ return [];
157
+ }
158
+ // Tokenize all texts
159
+ const batch = this.tokenizer.encodeBatch(texts);
160
+ const seqLen = batch.inputIds[0].length;
161
+ // Run batch inference
162
+ const hiddenStates = await this.inference.infer(batch.inputIds, batch.attentionMask, batch.tokenTypeIds);
163
+ // Post-process each result
164
+ const embeddings = this.postProcessor.processBatch(hiddenStates, batch.attentionMask, texts.length, seqLen);
165
+ this.embedCount += texts.length;
166
+ return embeddings.map(e => Array.from(e));
167
+ }
168
+ /**
169
+ * Check if initialized
170
+ */
171
+ isInitialized() {
172
+ return this.initialized;
173
+ }
174
+ /**
175
+ * Get engine statistics
176
+ */
177
+ getStats() {
178
+ return {
179
+ initialized: this.initialized,
180
+ embedCount: this.embedCount,
181
+ totalProcessingTimeMs: this.totalProcessingTimeMs,
182
+ avgProcessingTimeMs: this.embedCount > 0
183
+ ? this.totalProcessingTimeMs / this.embedCount
184
+ : 0,
185
+ modelName: MODEL_CONSTANTS.MODEL_NAME,
186
+ };
187
+ }
188
+ /**
189
+ * Dispose and free resources
190
+ */
191
+ async dispose() {
192
+ if (this.inference) {
193
+ await this.inference.dispose();
194
+ this.inference = null;
195
+ }
196
+ this.tokenizer = null;
197
+ this.postProcessor = null;
198
+ this.initialized = false;
199
+ }
200
+ /**
201
+ * Reset singleton (for testing)
202
+ */
203
+ static resetInstance() {
204
+ if (globalInstance) {
205
+ globalInstance.dispose();
206
+ }
207
+ globalInstance = null;
208
+ globalInitPromise = null;
209
+ }
210
+ }
211
+ // Export singleton access
212
+ export const wasmEmbeddingEngine = WASMEmbeddingEngine.getInstance();
213
+ /**
214
+ * Convenience function to get embeddings
215
+ */
216
+ export async function embed(text) {
217
+ return wasmEmbeddingEngine.embed(text);
218
+ }
219
+ /**
220
+ * Convenience function for batch embeddings
221
+ */
222
+ export async function embedBatch(texts) {
223
+ return wasmEmbeddingEngine.embedBatch(texts);
224
+ }
225
+ /**
226
+ * Get embedding stats
227
+ */
228
+ export function getEmbeddingStats() {
229
+ return wasmEmbeddingEngine.getStats();
230
+ }
231
+ //# sourceMappingURL=WASMEmbeddingEngine.js.map
@@ -0,0 +1,71 @@
1
+ /**
2
+ * WordPiece Tokenizer for BERT-based models
3
+ *
4
+ * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
5
+ * This is a clean, dependency-free implementation.
6
+ *
7
+ * Algorithm:
8
+ * 1. Normalize text (lowercase for uncased models)
9
+ * 2. Split on whitespace and punctuation
10
+ * 3. Apply WordPiece subword tokenization
11
+ * 4. Add special tokens ([CLS], [SEP])
12
+ * 5. Generate attention mask
13
+ */
14
+ import { TokenizerConfig, TokenizedInput } from './types.js';
15
+ /**
16
+ * WordPiece tokenizer for BERT-based sentence transformers
17
+ */
18
+ export declare class WordPieceTokenizer {
19
+ private vocab;
20
+ private reverseVocab;
21
+ private config;
22
+ constructor(vocab: Map<string, number> | Record<string, number>, config?: Partial<TokenizerConfig>);
23
+ /**
24
+ * Tokenize text into token IDs
25
+ */
26
+ encode(text: string): TokenizedInput;
27
+ /**
28
+ * Encode with padding to fixed length
29
+ */
30
+ encodeWithPadding(text: string, targetLength?: number): TokenizedInput;
31
+ /**
32
+ * Batch encode multiple texts
33
+ */
34
+ encodeBatch(texts: string[]): {
35
+ inputIds: number[][];
36
+ attentionMask: number[][];
37
+ tokenTypeIds: number[][];
38
+ };
39
+ /**
40
+ * Basic tokenization: split on whitespace and punctuation
41
+ */
42
+ private basicTokenize;
43
+ /**
44
+ * WordPiece tokenization for a single word
45
+ */
46
+ private wordPieceTokenize;
47
+ /**
48
+ * Check if character is whitespace
49
+ */
50
+ private isWhitespace;
51
+ /**
52
+ * Check if character is punctuation
53
+ */
54
+ private isPunctuation;
55
+ /**
56
+ * Decode token IDs back to text (for debugging)
57
+ */
58
+ decode(tokenIds: number[]): string;
59
+ /**
60
+ * Get vocabulary size
61
+ */
62
+ get vocabSize(): number;
63
+ /**
64
+ * Get max sequence length
65
+ */
66
+ get maxLength(): number;
67
+ }
68
+ /**
69
+ * Create tokenizer from vocabulary JSON
70
+ */
71
+ export declare function createTokenizer(vocabJson: Record<string, number>): WordPieceTokenizer;
@@ -0,0 +1,264 @@
1
+ /**
2
+ * WordPiece Tokenizer for BERT-based models
3
+ *
4
+ * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
5
+ * This is a clean, dependency-free implementation.
6
+ *
7
+ * Algorithm:
8
+ * 1. Normalize text (lowercase for uncased models)
9
+ * 2. Split on whitespace and punctuation
10
+ * 3. Apply WordPiece subword tokenization
11
+ * 4. Add special tokens ([CLS], [SEP])
12
+ * 5. Generate attention mask
13
+ */
14
+ import { SPECIAL_TOKENS, MODEL_CONSTANTS, } from './types.js';
15
+ /**
16
+ * WordPiece tokenizer for BERT-based sentence transformers
17
+ */
18
+ export class WordPieceTokenizer {
19
+ constructor(vocab, config) {
20
+ // Convert Record to Map if needed
21
+ this.vocab = vocab instanceof Map ? vocab : new Map(Object.entries(vocab));
22
+ // Build reverse vocab for debugging
23
+ this.reverseVocab = new Map();
24
+ for (const [token, id] of this.vocab) {
25
+ this.reverseVocab.set(id, token);
26
+ }
27
+ // Default config for all-MiniLM-L6-v2
28
+ this.config = {
29
+ vocab: this.vocab,
30
+ unkTokenId: config?.unkTokenId ?? SPECIAL_TOKENS.UNK,
31
+ clsTokenId: config?.clsTokenId ?? SPECIAL_TOKENS.CLS,
32
+ sepTokenId: config?.sepTokenId ?? SPECIAL_TOKENS.SEP,
33
+ padTokenId: config?.padTokenId ?? SPECIAL_TOKENS.PAD,
34
+ maxLength: config?.maxLength ?? MODEL_CONSTANTS.MAX_SEQUENCE_LENGTH,
35
+ doLowerCase: config?.doLowerCase ?? true,
36
+ };
37
+ }
38
+ /**
39
+ * Tokenize text into token IDs
40
+ */
41
+ encode(text) {
42
+ // 1. Normalize
43
+ let normalizedText = text;
44
+ if (this.config.doLowerCase) {
45
+ normalizedText = text.toLowerCase();
46
+ }
47
+ // 2. Clean and split into words
48
+ const words = this.basicTokenize(normalizedText);
49
+ // 3. Apply WordPiece to each word
50
+ const tokens = [this.config.clsTokenId];
51
+ for (const word of words) {
52
+ const wordTokens = this.wordPieceTokenize(word);
53
+ // Check if adding these tokens would exceed max length (accounting for [SEP])
54
+ if (tokens.length + wordTokens.length + 1 > this.config.maxLength) {
55
+ break;
56
+ }
57
+ tokens.push(...wordTokens);
58
+ }
59
+ tokens.push(this.config.sepTokenId);
60
+ // 4. Generate attention mask and token type IDs
61
+ const attentionMask = new Array(tokens.length).fill(1);
62
+ const tokenTypeIds = new Array(tokens.length).fill(0);
63
+ return {
64
+ inputIds: tokens,
65
+ attentionMask,
66
+ tokenTypeIds,
67
+ tokenCount: tokens.length - 2, // Exclude [CLS] and [SEP]
68
+ };
69
+ }
70
+ /**
71
+ * Encode with padding to fixed length
72
+ */
73
+ encodeWithPadding(text, targetLength) {
74
+ const result = this.encode(text);
75
+ const padLength = targetLength ?? this.config.maxLength;
76
+ // Pad to target length
77
+ while (result.inputIds.length < padLength) {
78
+ result.inputIds.push(this.config.padTokenId);
79
+ result.attentionMask.push(0);
80
+ result.tokenTypeIds.push(0);
81
+ }
82
+ // Truncate if longer (shouldn't happen with proper encode())
83
+ if (result.inputIds.length > padLength) {
84
+ result.inputIds.length = padLength;
85
+ result.attentionMask.length = padLength;
86
+ result.tokenTypeIds.length = padLength;
87
+ // Ensure [SEP] is at the end
88
+ result.inputIds[padLength - 1] = this.config.sepTokenId;
89
+ result.attentionMask[padLength - 1] = 1;
90
+ }
91
+ return result;
92
+ }
93
+ /**
94
+ * Batch encode multiple texts
95
+ */
96
+ encodeBatch(texts) {
97
+ const results = texts.map((text) => this.encode(text));
98
+ // Find max length in batch
99
+ const maxLen = Math.max(...results.map((r) => r.inputIds.length));
100
+ // Pad all to same length
101
+ const inputIds = [];
102
+ const attentionMask = [];
103
+ const tokenTypeIds = [];
104
+ for (const result of results) {
105
+ const padded = this.encodeWithPadding('', // Not used since we're modifying result
106
+ maxLen);
107
+ // Copy original values
108
+ for (let i = 0; i < result.inputIds.length; i++) {
109
+ padded.inputIds[i] = result.inputIds[i];
110
+ padded.attentionMask[i] = result.attentionMask[i];
111
+ padded.tokenTypeIds[i] = result.tokenTypeIds[i];
112
+ }
113
+ // Pad the rest
114
+ for (let i = result.inputIds.length; i < maxLen; i++) {
115
+ padded.inputIds[i] = this.config.padTokenId;
116
+ padded.attentionMask[i] = 0;
117
+ padded.tokenTypeIds[i] = 0;
118
+ }
119
+ inputIds.push(padded.inputIds.slice(0, maxLen));
120
+ attentionMask.push(padded.attentionMask.slice(0, maxLen));
121
+ tokenTypeIds.push(padded.tokenTypeIds.slice(0, maxLen));
122
+ }
123
+ return { inputIds, attentionMask, tokenTypeIds };
124
+ }
125
+ /**
126
+ * Basic tokenization: split on whitespace and punctuation
127
+ */
128
+ basicTokenize(text) {
129
+ // Clean whitespace
130
+ text = text.trim().replace(/\s+/g, ' ');
131
+ if (!text) {
132
+ return [];
133
+ }
134
+ const words = [];
135
+ let currentWord = '';
136
+ for (const char of text) {
137
+ if (this.isWhitespace(char)) {
138
+ if (currentWord) {
139
+ words.push(currentWord);
140
+ currentWord = '';
141
+ }
142
+ }
143
+ else if (this.isPunctuation(char)) {
144
+ if (currentWord) {
145
+ words.push(currentWord);
146
+ currentWord = '';
147
+ }
148
+ words.push(char);
149
+ }
150
+ else {
151
+ currentWord += char;
152
+ }
153
+ }
154
+ if (currentWord) {
155
+ words.push(currentWord);
156
+ }
157
+ return words;
158
+ }
159
+ /**
160
+ * WordPiece tokenization for a single word
161
+ */
162
+ wordPieceTokenize(word) {
163
+ if (!word) {
164
+ return [];
165
+ }
166
+ // Check if whole word is in vocabulary
167
+ if (this.vocab.has(word)) {
168
+ return [this.vocab.get(word)];
169
+ }
170
+ const tokens = [];
171
+ let start = 0;
172
+ while (start < word.length) {
173
+ let end = word.length;
174
+ let foundToken = false;
175
+ while (start < end) {
176
+ let substr = word.slice(start, end);
177
+ // Add ## prefix for subwords (not at start of word)
178
+ if (start > 0) {
179
+ substr = '##' + substr;
180
+ }
181
+ if (this.vocab.has(substr)) {
182
+ tokens.push(this.vocab.get(substr));
183
+ foundToken = true;
184
+ break;
185
+ }
186
+ end--;
187
+ }
188
+ if (!foundToken) {
189
+ // Unknown character - use [UNK] for single character
190
+ tokens.push(this.config.unkTokenId);
191
+ start++;
192
+ }
193
+ else {
194
+ start = end;
195
+ }
196
+ }
197
+ return tokens;
198
+ }
199
+ /**
200
+ * Check if character is whitespace
201
+ */
202
+ isWhitespace(char) {
203
+ return /\s/.test(char);
204
+ }
205
+ /**
206
+ * Check if character is punctuation
207
+ */
208
+ isPunctuation(char) {
209
+ const code = char.charCodeAt(0);
210
+ // ASCII punctuation ranges
211
+ if ((code >= 33 && code <= 47) || // !"#$%&'()*+,-./
212
+ (code >= 58 && code <= 64) || // :;<=>?@
213
+ (code >= 91 && code <= 96) || // [\]^_`
214
+ (code >= 123 && code <= 126) // {|}~
215
+ ) {
216
+ return true;
217
+ }
218
+ // Unicode punctuation categories
219
+ return /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]/.test(char);
220
+ }
221
+ /**
222
+ * Decode token IDs back to text (for debugging)
223
+ */
224
+ decode(tokenIds) {
225
+ const tokens = [];
226
+ for (const id of tokenIds) {
227
+ const token = this.reverseVocab.get(id);
228
+ if (token && !['[CLS]', '[SEP]', '[PAD]'].includes(token)) {
229
+ if (token.startsWith('##')) {
230
+ // Subword - append without space
231
+ if (tokens.length > 0) {
232
+ tokens[tokens.length - 1] += token.slice(2);
233
+ }
234
+ else {
235
+ tokens.push(token.slice(2));
236
+ }
237
+ }
238
+ else {
239
+ tokens.push(token);
240
+ }
241
+ }
242
+ }
243
+ return tokens.join(' ');
244
+ }
245
+ /**
246
+ * Get vocabulary size
247
+ */
248
+ get vocabSize() {
249
+ return this.vocab.size;
250
+ }
251
+ /**
252
+ * Get max sequence length
253
+ */
254
+ get maxLength() {
255
+ return this.config.maxLength;
256
+ }
257
+ }
258
+ /**
259
+ * Create tokenizer from vocabulary JSON
260
+ */
261
+ export function createTokenizer(vocabJson) {
262
+ return new WordPieceTokenizer(vocabJson);
263
+ }
264
+ //# sourceMappingURL=WordPieceTokenizer.js.map
@@ -0,0 +1,13 @@
1
+ /**
2
+ * WASM Embedding Engine - Public Exports
3
+ *
4
+ * Clean, production-grade embedding engine using direct ONNX WASM.
5
+ * No transformers.js dependency, no runtime downloads, works everywhere.
6
+ */
7
+ export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
8
+ export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
9
+ export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
10
+ export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
11
+ export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
12
+ export type { TokenizerConfig, TokenizedInput, InferenceConfig, EmbeddingResult, EngineStats, ModelConfig, } from './types.js';
13
+ export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
@@ -0,0 +1,15 @@
1
+ /**
2
+ * WASM Embedding Engine - Public Exports
3
+ *
4
+ * Clean, production-grade embedding engine using direct ONNX WASM.
5
+ * No transformers.js dependency, no runtime downloads, works everywhere.
6
+ */
7
+ // Main engine
8
+ export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
9
+ // Components (for advanced use)
10
+ export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
11
+ export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
12
+ export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
13
+ export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
14
+ export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
15
+ //# sourceMappingURL=index.js.map