@soulcraft/brainy 6.5.0 → 6.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/models/all-MiniLM-L6-v2-q8/config.json +25 -0
- package/assets/models/all-MiniLM-L6-v2-q8/model.onnx +0 -0
- package/assets/models/all-MiniLM-L6-v2-q8/tokenizer.json +30686 -0
- package/assets/models/all-MiniLM-L6-v2-q8/vocab.json +1 -0
- package/dist/brainy.js +0 -6
- package/dist/config/index.d.ts +1 -3
- package/dist/config/index.js +2 -4
- package/dist/config/modelAutoConfig.d.ts +10 -17
- package/dist/config/modelAutoConfig.js +15 -88
- package/dist/config/sharedConfigManager.d.ts +1 -2
- package/dist/config/zeroConfig.d.ts +2 -13
- package/dist/config/zeroConfig.js +7 -15
- package/dist/critical/model-guardian.d.ts +5 -22
- package/dist/critical/model-guardian.js +38 -210
- package/dist/embeddings/EmbeddingManager.d.ts +7 -17
- package/dist/embeddings/EmbeddingManager.js +28 -136
- package/dist/embeddings/wasm/AssetLoader.d.ts +67 -0
- package/dist/embeddings/wasm/AssetLoader.js +238 -0
- package/dist/embeddings/wasm/EmbeddingPostProcessor.d.ts +60 -0
- package/dist/embeddings/wasm/EmbeddingPostProcessor.js +123 -0
- package/dist/embeddings/wasm/ONNXInferenceEngine.d.ts +55 -0
- package/dist/embeddings/wasm/ONNXInferenceEngine.js +154 -0
- package/dist/embeddings/wasm/WASMEmbeddingEngine.d.ts +82 -0
- package/dist/embeddings/wasm/WASMEmbeddingEngine.js +231 -0
- package/dist/embeddings/wasm/WordPieceTokenizer.d.ts +71 -0
- package/dist/embeddings/wasm/WordPieceTokenizer.js +264 -0
- package/dist/embeddings/wasm/index.d.ts +13 -0
- package/dist/embeddings/wasm/index.js +15 -0
- package/dist/embeddings/wasm/types.d.ts +114 -0
- package/dist/embeddings/wasm/types.js +25 -0
- package/dist/setup.d.ts +11 -11
- package/dist/setup.js +17 -31
- package/dist/types/brainy.types.d.ts +0 -5
- package/dist/utils/embedding.d.ts +45 -62
- package/dist/utils/embedding.js +61 -440
- package/package.json +10 -3
- package/scripts/download-model.cjs +175 -0
|
/**
 * WASM Embedding Engine
 *
 * The main embedding engine that combines all components:
 * - WordPieceTokenizer: Text → Token IDs
 * - ONNXInferenceEngine: Token IDs → Hidden States
 * - EmbeddingPostProcessor: Hidden States → Normalized Embedding
 *
 * This replaces transformers.js with a clean, production-grade implementation.
 *
 * Features:
 * - Singleton pattern (one model instance)
 * - Lazy initialization
 * - Batch processing support
 * - Zero runtime dependencies
 */
import { EmbeddingResult, EngineStats } from './types.js';
/**
 * WASM-based embedding engine.
 *
 * Construction is private — obtain the shared instance via
 * {@link WASMEmbeddingEngine.getInstance} or the exported
 * `wasmEmbeddingEngine` constant.
 */
export declare class WASMEmbeddingEngine {
    /** Tokenizer component (text → token IDs); set during initialization. */
    private tokenizer;
    /** ONNX inference component (token IDs → hidden states); set during initialization. */
    private inference;
    /** Post-processing component (hidden states → normalized embedding); set during initialization. */
    private postProcessor;
    /** True once all components are loaded and ready. */
    private initialized;
    /** Number of embeddings produced so far (single and batch calls). */
    private embedCount;
    /** Cumulative processing time; used to derive the average in getStats(). */
    private totalProcessingTimeMs;
    private constructor();
    /**
     * Get the singleton instance (created lazily on first call).
     */
    static getInstance(): WASMEmbeddingEngine;
    /**
     * Initialize all components. Idempotent; concurrent callers share a
     * single in-flight initialization.
     */
    initialize(): Promise<void>;
    /**
     * Perform actual initialization (asset verification, vocab/model loading).
     */
    private performInit;
    /**
     * Generate a normalized embedding vector for `text`.
     * Initializes the engine on first use.
     */
    embed(text: string): Promise<number[]>;
    /**
     * Generate an embedding plus token-count and timing metadata.
     */
    embedWithMetadata(text: string): Promise<EmbeddingResult>;
    /**
     * Batch embed multiple texts; returns one embedding per input, in order.
     */
    embedBatch(texts: string[]): Promise<number[][]>;
    /**
     * Check whether initialization has completed successfully.
     */
    isInitialized(): boolean;
    /**
     * Get engine statistics (counts, timings, model name).
     */
    getStats(): EngineStats;
    /**
     * Dispose and free resources; the engine may be re-initialized afterwards.
     */
    dispose(): Promise<void>;
    /**
     * Reset singleton (for testing). Disposes any existing instance.
     */
    static resetInstance(): void;
}
/** The shared singleton engine instance. */
export declare const wasmEmbeddingEngine: WASMEmbeddingEngine;
/**
 * Convenience function to get embeddings from the shared engine.
 */
export declare function embed(text: string): Promise<number[]>;
/**
 * Convenience function for batch embeddings via the shared engine.
 */
export declare function embedBatch(texts: string[]): Promise<number[][]>;
/**
 * Get embedding stats from the shared engine.
 */
export declare function getEmbeddingStats(): EngineStats;
|
/**
 * WASM Embedding Engine
 *
 * The main embedding engine that combines all components:
 * - WordPieceTokenizer: Text → Token IDs
 * - ONNXInferenceEngine: Token IDs → Hidden States
 * - EmbeddingPostProcessor: Hidden States → Normalized Embedding
 *
 * This replaces transformers.js with a clean, production-grade implementation.
 *
 * Features:
 * - Singleton pattern (one model instance)
 * - Lazy initialization
 * - Batch processing support
 * - Zero runtime dependencies
 */
import { WordPieceTokenizer } from './WordPieceTokenizer.js';
import { ONNXInferenceEngine } from './ONNXInferenceEngine.js';
import { EmbeddingPostProcessor } from './EmbeddingPostProcessor.js';
import { getAssetLoader } from './AssetLoader.js';
import { MODEL_CONSTANTS } from './types.js';
// Module-level singleton state: the one engine instance, and — while an
// initialization is in flight — the shared promise all callers await.
let globalInstance = null;
let globalInitPromise = null;
/**
 * WASM-based embedding engine
 */
export class WASMEmbeddingEngine {
    constructor() {
        // Pipeline components — populated by performInit(), cleared by dispose().
        this.tokenizer = null;
        this.inference = null;
        this.postProcessor = null;
        this.initialized = false;
        // Running statistics reported by getStats().
        this.embedCount = 0;
        this.totalProcessingTimeMs = 0;
        // Private constructor for singleton
    }
    /**
     * Get the singleton instance
     */
    static getInstance() {
        if (!globalInstance) {
            globalInstance = new WASMEmbeddingEngine();
        }
        return globalInstance;
    }
    /**
     * Initialize all components.
     *
     * Safe to call concurrently: callers arriving while an initialization is
     * in flight await the same shared promise instead of starting a new one.
     */
    async initialize() {
        // Already initialized
        if (this.initialized) {
            return;
        }
        // Initialization in progress — join it
        if (globalInitPromise) {
            await globalInitPromise;
            return;
        }
        // Start initialization
        globalInitPromise = this.performInit();
        try {
            await globalInitPromise;
        }
        finally {
            // Clear the latch so a failed init can be retried by the next caller.
            globalInitPromise = null;
        }
    }
    /**
     * Perform actual initialization: verify the on-disk assets, load the
     * vocabulary/tokenizer, load the ONNX model, and build the post-processor.
     * On any failure, partially-built state is rolled back before rethrowing.
     */
    async performInit() {
        const startTime = Date.now();
        console.log('🚀 Initializing WASM Embedding Engine...');
        try {
            const assetLoader = getAssetLoader();
            // Verify assets exist
            const verification = await assetLoader.verifyAssets();
            if (!verification.valid) {
                throw new Error(`Missing model assets:\n${verification.errors.join('\n')}\n\n` +
                    `Expected model at: ${verification.modelPath}\n` +
                    `Expected vocab at: ${verification.vocabPath}\n\n` +
                    `Run 'npm run download-model' to download the model files.`);
            }
            // Load vocabulary and create tokenizer
            console.log('📖 Loading vocabulary...');
            const vocab = await assetLoader.loadVocab();
            this.tokenizer = new WordPieceTokenizer(vocab);
            console.log(`✅ Vocabulary loaded: ${this.tokenizer.vocabSize} tokens`);
            // Initialize ONNX inference engine
            console.log('🧠 Loading ONNX model...');
            const modelPath = await assetLoader.getModelPath();
            this.inference = new ONNXInferenceEngine({ modelPath });
            await this.inference.initialize(modelPath);
            console.log('✅ ONNX model loaded');
            // Create post-processor
            this.postProcessor = new EmbeddingPostProcessor(MODEL_CONSTANTS.HIDDEN_SIZE);
            this.initialized = true;
            const initTime = Date.now() - startTime;
            console.log(`✅ WASM Embedding Engine ready in ${initTime}ms`);
        }
        catch (error) {
            // Roll back partial state so a retry starts from a clean slate.
            this.initialized = false;
            this.tokenizer = null;
            this.inference = null;
            this.postProcessor = null;
            throw new Error(`Failed to initialize WASM Embedding Engine: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Generate embedding for text
     */
    async embed(text) {
        const result = await this.embedWithMetadata(text);
        return result.embedding;
    }
    /**
     * Generate embedding with metadata (token count, processing time)
     */
    async embedWithMetadata(text) {
        // Ensure initialized
        if (!this.initialized) {
            await this.initialize();
        }
        if (!this.tokenizer || !this.inference || !this.postProcessor) {
            throw new Error('Engine not properly initialized');
        }
        const startTime = Date.now();
        // 1. Tokenize
        const tokenized = this.tokenizer.encode(text);
        // 2. Run inference
        const hiddenStates = await this.inference.inferSingle(tokenized.inputIds, tokenized.attentionMask, tokenized.tokenTypeIds);
        // 3. Post-process (mean pool + normalize)
        const embedding = this.postProcessor.process(hiddenStates, tokenized.attentionMask, tokenized.inputIds.length);
        const processingTimeMs = Date.now() - startTime;
        this.embedCount++;
        this.totalProcessingTimeMs += processingTimeMs;
        return {
            embedding: Array.from(embedding),
            tokenCount: tokenized.tokenCount,
            processingTimeMs,
        };
    }
    /**
     * Batch embed multiple texts
     */
    async embedBatch(texts) {
        // Ensure initialized
        if (!this.initialized) {
            await this.initialize();
        }
        if (!this.tokenizer || !this.inference || !this.postProcessor) {
            throw new Error('Engine not properly initialized');
        }
        if (texts.length === 0) {
            return [];
        }
        const startTime = Date.now();
        // Tokenize all texts (padded to the longest sequence in the batch)
        const batch = this.tokenizer.encodeBatch(texts);
        const seqLen = batch.inputIds[0].length;
        // Run batch inference
        const hiddenStates = await this.inference.infer(batch.inputIds, batch.attentionMask, batch.tokenTypeIds);
        // Post-process each result
        const embeddings = this.postProcessor.processBatch(hiddenStates, batch.attentionMask, texts.length, seqLen);
        this.embedCount += texts.length;
        // FIX: also accumulate batch time. Previously only embedCount was
        // updated here, which skewed getStats().avgProcessingTimeMs toward
        // zero for callers using embedBatch().
        this.totalProcessingTimeMs += Date.now() - startTime;
        return embeddings.map(e => Array.from(e));
    }
    /**
     * Check if initialized
     */
    isInitialized() {
        return this.initialized;
    }
    /**
     * Get engine statistics
     */
    getStats() {
        return {
            initialized: this.initialized,
            embedCount: this.embedCount,
            totalProcessingTimeMs: this.totalProcessingTimeMs,
            avgProcessingTimeMs: this.embedCount > 0
                ? this.totalProcessingTimeMs / this.embedCount
                : 0,
            modelName: MODEL_CONSTANTS.MODEL_NAME,
        };
    }
    /**
     * Dispose and free resources
     */
    async dispose() {
        if (this.inference) {
            await this.inference.dispose();
            this.inference = null;
        }
        this.tokenizer = null;
        this.postProcessor = null;
        this.initialized = false;
    }
    /**
     * Reset singleton (for testing)
     */
    static resetInstance() {
        if (globalInstance) {
            // FIX: dispose() is async; the original dropped the promise on the
            // floor, which can surface as an unhandled rejection. Detach it
            // explicitly and swallow failures — reset must always succeed.
            globalInstance.dispose().catch(() => { });
        }
        globalInstance = null;
        globalInitPromise = null;
    }
}
// Export singleton access
export const wasmEmbeddingEngine = WASMEmbeddingEngine.getInstance();
/**
 * Convenience function to get embeddings
 */
export async function embed(text) {
    return wasmEmbeddingEngine.embed(text);
}
/**
 * Convenience function for batch embeddings
 */
export async function embedBatch(texts) {
    return wasmEmbeddingEngine.embedBatch(texts);
}
/**
 * Get embedding stats
 */
export function getEmbeddingStats() {
    return wasmEmbeddingEngine.getStats();
}
//# sourceMappingURL=WASMEmbeddingEngine.js.map
|
/**
 * WordPiece Tokenizer for BERT-based models
 *
 * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
 * This is a clean, dependency-free implementation.
 *
 * Algorithm:
 * 1. Normalize text (lowercase for uncased models)
 * 2. Split on whitespace and punctuation
 * 3. Apply WordPiece subword tokenization
 * 4. Add special tokens ([CLS], [SEP])
 * 5. Generate attention mask
 */
import { TokenizerConfig, TokenizedInput } from './types.js';
/**
 * WordPiece tokenizer for BERT-based sentence transformers.
 */
export declare class WordPieceTokenizer {
    /** Token string → token ID lookup. */
    private vocab;
    /** Token ID → token string lookup (used by decode()). */
    private reverseVocab;
    /** Effective configuration (special-token IDs, max length, casing). */
    private config;
    /**
     * @param vocab  Vocabulary as a Map or a plain token → ID object.
     * @param config Optional overrides; unspecified fields fall back to the
     *               all-MiniLM-L6-v2 defaults.
     */
    constructor(vocab: Map<string, number> | Record<string, number>, config?: Partial<TokenizerConfig>);
    /**
     * Tokenize text into token IDs, attention mask and token-type IDs,
     * truncated to the configured max length.
     */
    encode(text: string): TokenizedInput;
    /**
     * Encode, then pad (or truncate) to a fixed length.
     * Defaults to the configured max length when targetLength is omitted.
     */
    encodeWithPadding(text: string, targetLength?: number): TokenizedInput;
    /**
     * Batch encode multiple texts, padded to the longest sequence in the batch.
     */
    encodeBatch(texts: string[]): {
        inputIds: number[][];
        attentionMask: number[][];
        tokenTypeIds: number[][];
    };
    /**
     * Basic tokenization: split on whitespace and punctuation.
     */
    private basicTokenize;
    /**
     * WordPiece (greedy longest-match) tokenization for a single word.
     */
    private wordPieceTokenize;
    /**
     * Check if character is whitespace.
     */
    private isWhitespace;
    /**
     * Check if character is punctuation.
     */
    private isPunctuation;
    /**
     * Decode token IDs back to text (for debugging); special tokens are dropped.
     */
    decode(tokenIds: number[]): string;
    /**
     * Get vocabulary size.
     */
    get vocabSize(): number;
    /**
     * Get max sequence length.
     */
    get maxLength(): number;
}
/**
 * Create a tokenizer from a vocabulary JSON object with default configuration.
 */
export declare function createTokenizer(vocabJson: Record<string, number>): WordPieceTokenizer;
|
/**
 * WordPiece Tokenizer for BERT-based models
 *
 * Implements the WordPiece tokenization algorithm used by all-MiniLM-L6-v2.
 * This is a clean, dependency-free implementation.
 *
 * Algorithm:
 * 1. Normalize text (lowercase for uncased models)
 * 2. Split on whitespace and punctuation
 * 3. Apply WordPiece subword tokenization
 * 4. Add special tokens ([CLS], [SEP])
 * 5. Generate attention mask
 */
import { SPECIAL_TOKENS, MODEL_CONSTANTS, } from './types.js';
// Reference BERT WordPiece maps any word longer than this straight to [UNK]
// instead of attempting the greedy longest-match scan (which is O(n^2) in
// word length on pathological input). Matches max_input_chars_per_word=100
// in the original BERT tokenizer.
const MAX_INPUT_CHARS_PER_WORD = 100;
/**
 * WordPiece tokenizer for BERT-based sentence transformers
 */
export class WordPieceTokenizer {
    /**
     * @param vocab  Vocabulary as a Map or a plain token → ID object.
     * @param config Optional overrides for special-token IDs, max length, casing.
     */
    constructor(vocab, config) {
        // Convert Record to Map if needed
        this.vocab = vocab instanceof Map ? vocab : new Map(Object.entries(vocab));
        // Build reverse vocab for debugging (decode())
        this.reverseVocab = new Map();
        for (const [token, id] of this.vocab) {
            this.reverseVocab.set(id, token);
        }
        // Default config for all-MiniLM-L6-v2
        this.config = {
            vocab: this.vocab,
            unkTokenId: config?.unkTokenId ?? SPECIAL_TOKENS.UNK,
            clsTokenId: config?.clsTokenId ?? SPECIAL_TOKENS.CLS,
            sepTokenId: config?.sepTokenId ?? SPECIAL_TOKENS.SEP,
            padTokenId: config?.padTokenId ?? SPECIAL_TOKENS.PAD,
            maxLength: config?.maxLength ?? MODEL_CONSTANTS.MAX_SEQUENCE_LENGTH,
            doLowerCase: config?.doLowerCase ?? true,
        };
    }
    /**
     * Tokenize text into token IDs, attention mask and token-type IDs,
     * truncated to the configured max length.
     */
    encode(text) {
        // 1. Normalize
        let normalizedText = text;
        if (this.config.doLowerCase) {
            normalizedText = text.toLowerCase();
        }
        // 2. Clean and split into words
        const words = this.basicTokenize(normalizedText);
        // 3. Apply WordPiece to each word
        const tokens = [this.config.clsTokenId];
        for (const word of words) {
            const wordTokens = this.wordPieceTokenize(word);
            // Check if adding these tokens would exceed max length (accounting for [SEP])
            if (tokens.length + wordTokens.length + 1 > this.config.maxLength) {
                break;
            }
            tokens.push(...wordTokens);
        }
        tokens.push(this.config.sepTokenId);
        // 4. Generate attention mask and token type IDs (single segment → all 0s)
        const attentionMask = new Array(tokens.length).fill(1);
        const tokenTypeIds = new Array(tokens.length).fill(0);
        return {
            inputIds: tokens,
            attentionMask,
            tokenTypeIds,
            tokenCount: tokens.length - 2, // Exclude [CLS] and [SEP]
        };
    }
    /**
     * Encode, then pad (or truncate) to a fixed length.
     * Defaults to the configured max length when targetLength is omitted.
     */
    encodeWithPadding(text, targetLength) {
        const result = this.encode(text);
        const padLength = targetLength ?? this.config.maxLength;
        // Pad to target length ([PAD] tokens carry attention 0)
        while (result.inputIds.length < padLength) {
            result.inputIds.push(this.config.padTokenId);
            result.attentionMask.push(0);
            result.tokenTypeIds.push(0);
        }
        // Truncate if longer (shouldn't happen with proper encode())
        if (result.inputIds.length > padLength) {
            result.inputIds.length = padLength;
            result.attentionMask.length = padLength;
            result.tokenTypeIds.length = padLength;
            // Ensure [SEP] is at the end
            result.inputIds[padLength - 1] = this.config.sepTokenId;
            result.attentionMask[padLength - 1] = 1;
        }
        return result;
    }
    /**
     * Batch encode multiple texts, padded to the longest sequence in the batch.
     */
    encodeBatch(texts) {
        const results = texts.map((text) => this.encode(text));
        // Find max length in batch
        const maxLen = Math.max(...results.map((r) => r.inputIds.length));
        // FIX: pad each encoding directly. The previous implementation encoded
        // an empty string via encodeWithPadding('') per item just to obtain a
        // padded template, then overwrote it field-by-field — same output,
        // wasted work and misleading code.
        const inputIds = [];
        const attentionMask = [];
        const tokenTypeIds = [];
        for (const result of results) {
            const pad = maxLen - result.inputIds.length;
            inputIds.push([...result.inputIds, ...new Array(pad).fill(this.config.padTokenId)]);
            attentionMask.push([...result.attentionMask, ...new Array(pad).fill(0)]);
            tokenTypeIds.push([...result.tokenTypeIds, ...new Array(pad).fill(0)]);
        }
        return { inputIds, attentionMask, tokenTypeIds };
    }
    /**
     * Basic tokenization: split on whitespace and punctuation.
     * Punctuation characters become standalone tokens.
     */
    basicTokenize(text) {
        // Clean whitespace
        text = text.trim().replace(/\s+/g, ' ');
        if (!text) {
            return [];
        }
        const words = [];
        let currentWord = '';
        for (const char of text) {
            if (this.isWhitespace(char)) {
                if (currentWord) {
                    words.push(currentWord);
                    currentWord = '';
                }
            }
            else if (this.isPunctuation(char)) {
                if (currentWord) {
                    words.push(currentWord);
                    currentWord = '';
                }
                words.push(char);
            }
            else {
                currentWord += char;
            }
        }
        if (currentWord) {
            words.push(currentWord);
        }
        return words;
    }
    /**
     * WordPiece (greedy longest-match-first) tokenization for a single word.
     */
    wordPieceTokenize(word) {
        if (!word) {
            return [];
        }
        // FIX: match the reference BERT algorithm — over-long words map to
        // [UNK] rather than being subword-split, avoiding an O(n^2) scan.
        if (word.length > MAX_INPUT_CHARS_PER_WORD) {
            return [this.config.unkTokenId];
        }
        // Check if whole word is in vocabulary
        if (this.vocab.has(word)) {
            return [this.vocab.get(word)];
        }
        const tokens = [];
        let start = 0;
        while (start < word.length) {
            let end = word.length;
            let foundToken = false;
            // Greedily try the longest substring first, shrinking from the right
            while (start < end) {
                let substr = word.slice(start, end);
                // Add ## prefix for subwords (not at start of word)
                if (start > 0) {
                    substr = '##' + substr;
                }
                if (this.vocab.has(substr)) {
                    tokens.push(this.vocab.get(substr));
                    foundToken = true;
                    break;
                }
                end--;
            }
            if (!foundToken) {
                // Unknown character - use [UNK] for single character
                tokens.push(this.config.unkTokenId);
                start++;
            }
            else {
                start = end;
            }
        }
        return tokens;
    }
    /**
     * Check if character is whitespace
     */
    isWhitespace(char) {
        return /\s/.test(char);
    }
    /**
     * Check if character is punctuation
     */
    isPunctuation(char) {
        const code = char.charCodeAt(0);
        // ASCII punctuation ranges
        if ((code >= 33 && code <= 47) || // !"#$%&'()*+,-./
            (code >= 58 && code <= 64) || // :;<=>?@
            (code >= 91 && code <= 96) || // [\]^_`
            (code >= 123 && code <= 126) // {|}~
        ) {
            return true;
        }
        // Unicode punctuation categories
        return /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]/.test(char);
    }
    /**
     * Decode token IDs back to text (for debugging).
     * Special tokens are dropped; ## subwords are re-joined to their stem.
     */
    decode(tokenIds) {
        const tokens = [];
        for (const id of tokenIds) {
            const token = this.reverseVocab.get(id);
            if (token && !['[CLS]', '[SEP]', '[PAD]'].includes(token)) {
                if (token.startsWith('##')) {
                    // Subword - append without space
                    if (tokens.length > 0) {
                        tokens[tokens.length - 1] += token.slice(2);
                    }
                    else {
                        tokens.push(token.slice(2));
                    }
                }
                else {
                    tokens.push(token);
                }
            }
        }
        return tokens.join(' ');
    }
    /**
     * Get vocabulary size
     */
    get vocabSize() {
        return this.vocab.size;
    }
    /**
     * Get max sequence length
     */
    get maxLength() {
        return this.config.maxLength;
    }
}
/**
 * Create tokenizer from vocabulary JSON
 */
export function createTokenizer(vocabJson) {
    return new WordPieceTokenizer(vocabJson);
}
//# sourceMappingURL=WordPieceTokenizer.js.map
|
/**
 * WASM Embedding Engine - Public Exports
 *
 * Clean, production-grade embedding engine using direct ONNX WASM.
 * No transformers.js dependency, no runtime downloads, works everywhere.
 */
// Main engine and module-level convenience wrappers
export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
// Pipeline components (for advanced use: custom pipelines, stage-level testing)
export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
// Public type surface
export type { TokenizerConfig, TokenizedInput, InferenceConfig, EmbeddingResult, EngineStats, ModelConfig, } from './types.js';
// Shared runtime constants (special-token IDs, model dimensions)
export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
|
/**
 * WASM Embedding Engine - Public Exports
 *
 * Clean, production-grade embedding engine using direct ONNX WASM.
 * No transformers.js dependency, no runtime downloads, works everywhere.
 */
// Main engine and module-level convenience wrappers
export { WASMEmbeddingEngine, wasmEmbeddingEngine, embed, embedBatch, getEmbeddingStats, } from './WASMEmbeddingEngine.js';
// Components (for advanced use: custom pipelines, stage-level testing)
export { WordPieceTokenizer, createTokenizer } from './WordPieceTokenizer.js';
export { ONNXInferenceEngine, createInferenceEngine } from './ONNXInferenceEngine.js';
export { EmbeddingPostProcessor, createPostProcessor } from './EmbeddingPostProcessor.js';
export { AssetLoader, getAssetLoader, createAssetLoader } from './AssetLoader.js';
// Shared runtime constants (special-token IDs, model dimensions)
export { SPECIAL_TOKENS, MODEL_CONSTANTS } from './types.js';
//# sourceMappingURL=index.js.map