edgeflowjs 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. package/README.md +200 -66
  2. package/dist/backends/index.d.ts +9 -2
  3. package/dist/backends/index.d.ts.map +1 -1
  4. package/dist/backends/index.js +13 -13
  5. package/dist/backends/index.js.map +1 -1
  6. package/dist/backends/onnx.d.ts +49 -4
  7. package/dist/backends/onnx.d.ts.map +1 -1
  8. package/dist/backends/onnx.js +165 -76
  9. package/dist/backends/onnx.js.map +1 -1
  10. package/dist/backends/transformers-adapter.d.ts +99 -0
  11. package/dist/backends/transformers-adapter.d.ts.map +1 -0
  12. package/dist/backends/transformers-adapter.js +171 -0
  13. package/dist/backends/transformers-adapter.js.map +1 -0
  14. package/dist/backends/webgpu.d.ts +7 -5
  15. package/dist/backends/webgpu.d.ts.map +1 -1
  16. package/dist/backends/webgpu.js +7 -5
  17. package/dist/backends/webgpu.js.map +1 -1
  18. package/dist/backends/webnn.d.ts +6 -5
  19. package/dist/backends/webnn.d.ts.map +1 -1
  20. package/dist/backends/webnn.js +6 -5
  21. package/dist/backends/webnn.js.map +1 -1
  22. package/dist/core/composer.d.ts +118 -0
  23. package/dist/core/composer.d.ts.map +1 -0
  24. package/dist/core/composer.js +163 -0
  25. package/dist/core/composer.js.map +1 -0
  26. package/dist/core/device-profiler.d.ts +75 -0
  27. package/dist/core/device-profiler.d.ts.map +1 -0
  28. package/dist/core/device-profiler.js +131 -0
  29. package/dist/core/device-profiler.js.map +1 -0
  30. package/dist/core/index.d.ts +4 -0
  31. package/dist/core/index.d.ts.map +1 -1
  32. package/dist/core/index.js +8 -0
  33. package/dist/core/index.js.map +1 -1
  34. package/dist/core/memory.d.ts +22 -2
  35. package/dist/core/memory.d.ts.map +1 -1
  36. package/dist/core/memory.js +49 -13
  37. package/dist/core/memory.js.map +1 -1
  38. package/dist/core/plugin.d.ts +100 -0
  39. package/dist/core/plugin.d.ts.map +1 -0
  40. package/dist/core/plugin.js +106 -0
  41. package/dist/core/plugin.js.map +1 -0
  42. package/dist/core/runtime.d.ts +4 -0
  43. package/dist/core/runtime.d.ts.map +1 -1
  44. package/dist/core/runtime.js +18 -0
  45. package/dist/core/runtime.js.map +1 -1
  46. package/dist/core/scheduler.d.ts +17 -0
  47. package/dist/core/scheduler.d.ts.map +1 -1
  48. package/dist/core/scheduler.js +101 -3
  49. package/dist/core/scheduler.js.map +1 -1
  50. package/dist/core/types.d.ts +14 -0
  51. package/dist/core/types.d.ts.map +1 -1
  52. package/dist/core/types.js.map +1 -1
  53. package/dist/core/worker.d.ts +202 -0
  54. package/dist/core/worker.d.ts.map +1 -0
  55. package/dist/core/worker.js +477 -0
  56. package/dist/core/worker.js.map +1 -0
  57. package/dist/edgeflow.browser.js +9789 -4379
  58. package/dist/edgeflow.browser.js.map +4 -4
  59. package/dist/edgeflow.browser.min.js +435 -5
  60. package/dist/edgeflow.browser.min.js.map +4 -4
  61. package/dist/index.d.ts +9 -6
  62. package/dist/index.d.ts.map +1 -1
  63. package/dist/index.js +32 -12
  64. package/dist/index.js.map +1 -1
  65. package/dist/pipelines/automatic-speech-recognition.d.ts +63 -0
  66. package/dist/pipelines/automatic-speech-recognition.d.ts.map +1 -0
  67. package/dist/pipelines/automatic-speech-recognition.js +269 -0
  68. package/dist/pipelines/automatic-speech-recognition.js.map +1 -0
  69. package/dist/pipelines/base.d.ts +6 -1
  70. package/dist/pipelines/base.d.ts.map +1 -1
  71. package/dist/pipelines/base.js +12 -2
  72. package/dist/pipelines/base.js.map +1 -1
  73. package/dist/pipelines/feature-extraction.d.ts +5 -40
  74. package/dist/pipelines/feature-extraction.d.ts.map +1 -1
  75. package/dist/pipelines/feature-extraction.js +44 -63
  76. package/dist/pipelines/feature-extraction.js.map +1 -1
  77. package/dist/pipelines/image-classification.d.ts +4 -36
  78. package/dist/pipelines/image-classification.d.ts.map +1 -1
  79. package/dist/pipelines/image-classification.js +22 -60
  80. package/dist/pipelines/image-classification.js.map +1 -1
  81. package/dist/pipelines/image-segmentation.d.ts +221 -0
  82. package/dist/pipelines/image-segmentation.d.ts.map +1 -0
  83. package/dist/pipelines/image-segmentation.js +535 -0
  84. package/dist/pipelines/image-segmentation.js.map +1 -0
  85. package/dist/pipelines/index.d.ts +18 -0
  86. package/dist/pipelines/index.d.ts.map +1 -1
  87. package/dist/pipelines/index.js +51 -2
  88. package/dist/pipelines/index.js.map +1 -1
  89. package/dist/pipelines/object-detection.d.ts +44 -0
  90. package/dist/pipelines/object-detection.d.ts.map +1 -0
  91. package/dist/pipelines/object-detection.js +218 -0
  92. package/dist/pipelines/object-detection.js.map +1 -0
  93. package/dist/pipelines/question-answering.d.ts +41 -0
  94. package/dist/pipelines/question-answering.d.ts.map +1 -0
  95. package/dist/pipelines/question-answering.js +164 -0
  96. package/dist/pipelines/question-answering.js.map +1 -0
  97. package/dist/pipelines/text-classification.d.ts +3 -39
  98. package/dist/pipelines/text-classification.d.ts.map +1 -1
  99. package/dist/pipelines/text-classification.js +29 -67
  100. package/dist/pipelines/text-classification.js.map +1 -1
  101. package/dist/pipelines/text-generation.d.ts +281 -0
  102. package/dist/pipelines/text-generation.d.ts.map +1 -0
  103. package/dist/pipelines/text-generation.js +766 -0
  104. package/dist/pipelines/text-generation.js.map +1 -0
  105. package/dist/pipelines/zero-shot-classification.d.ts +45 -0
  106. package/dist/pipelines/zero-shot-classification.d.ts.map +1 -0
  107. package/dist/pipelines/zero-shot-classification.js +140 -0
  108. package/dist/pipelines/zero-shot-classification.js.map +1 -0
  109. package/dist/tools/benchmark.d.ts +92 -0
  110. package/dist/tools/benchmark.d.ts.map +1 -0
  111. package/dist/tools/benchmark.js +213 -0
  112. package/dist/tools/benchmark.js.map +1 -0
  113. package/dist/tools/debugger.d.ts +258 -0
  114. package/dist/tools/debugger.d.ts.map +1 -0
  115. package/dist/tools/debugger.js +624 -0
  116. package/dist/tools/debugger.js.map +1 -0
  117. package/dist/tools/index.d.ts +8 -0
  118. package/dist/tools/index.d.ts.map +1 -1
  119. package/dist/tools/index.js +16 -0
  120. package/dist/tools/index.js.map +1 -1
  121. package/dist/tools/monitor.d.ts +284 -0
  122. package/dist/tools/monitor.d.ts.map +1 -0
  123. package/dist/tools/monitor.js +921 -0
  124. package/dist/tools/monitor.js.map +1 -0
  125. package/dist/tools/quantization.d.ts +235 -0
  126. package/dist/tools/quantization.d.ts.map +1 -0
  127. package/dist/tools/quantization.js +830 -0
  128. package/dist/tools/quantization.js.map +1 -0
  129. package/dist/utils/hub.d.ts +162 -0
  130. package/dist/utils/hub.d.ts.map +1 -0
  131. package/dist/utils/hub.js +311 -0
  132. package/dist/utils/hub.js.map +1 -0
  133. package/dist/utils/index.d.ts +3 -1
  134. package/dist/utils/index.d.ts.map +1 -1
  135. package/dist/utils/index.js +5 -1
  136. package/dist/utils/index.js.map +1 -1
  137. package/dist/utils/model-loader.d.ts.map +1 -1
  138. package/dist/utils/model-loader.js +106 -30
  139. package/dist/utils/model-loader.js.map +1 -1
  140. package/dist/utils/offline.d.ts +147 -0
  141. package/dist/utils/offline.d.ts.map +1 -0
  142. package/dist/utils/offline.js +405 -0
  143. package/dist/utils/offline.js.map +1 -0
  144. package/dist/utils/preprocessor.d.ts +82 -6
  145. package/dist/utils/preprocessor.d.ts.map +1 -1
  146. package/dist/utils/preprocessor.js +278 -21
  147. package/dist/utils/preprocessor.js.map +1 -1
  148. package/dist/utils/tokenizer.d.ts +197 -72
  149. package/dist/utils/tokenizer.d.ts.map +1 -1
  150. package/dist/utils/tokenizer.js +558 -274
  151. package/dist/utils/tokenizer.js.map +1 -1
  152. package/package.json +26 -11
@@ -1,185 +1,283 @@
1
1
  /**
2
2
  * edgeFlow.js - Tokenizer
3
3
  *
4
- * Lightweight tokenizer implementation for text processing.
5
- * Supports BPE, WordPiece, and basic tokenization.
4
+ * Full-featured tokenizer supporting HuggingFace tokenizer.json format.
5
+ * Supports BPE, WordPiece, and Unigram tokenization.
6
6
  */
7
7
  import { EdgeFlowError, ErrorCodes, } from '../core/types.js';
8
8
  // ============================================================================
9
- // Base Tokenizer
9
+ // Tokenizer Implementation
10
10
  // ============================================================================
11
11
  /**
12
- * Tokenizer - Base class for all tokenizers
12
+ * Tokenizer - Full-featured tokenizer supporting HuggingFace format
13
13
  */
14
14
  export class Tokenizer {
15
- vocab;
16
- reverseVocab;
17
- config;
18
- model;
15
+ vocab = new Map();
16
+ reverseVocab = new Map();
19
17
  merges = new Map();
20
- constructor(config, options = {}) {
21
- this.config = {
22
- vocabSize: config.vocabSize ?? 30522,
23
- maxLength: config.maxLength ?? 512,
24
- padTokenId: config.padTokenId ?? 0,
25
- unkTokenId: config.unkTokenId ?? 100,
26
- bosTokenId: config.bosTokenId,
27
- eosTokenId: config.eosTokenId,
28
- sepTokenId: config.sepTokenId ?? 102,
29
- clsTokenId: config.clsTokenId ?? 101,
30
- maskTokenId: config.maskTokenId ?? 103,
31
- };
32
- this.model = options.model ?? 'basic';
33
- this.vocab = new Map();
34
- this.reverseVocab = new Map();
35
- // Load vocabulary
36
- if (options.vocab) {
37
- this.loadVocab(options.vocab);
38
- }
39
- // Load merges for BPE
40
- if (options.merges) {
41
- this.loadMerges(options.merges);
42
- }
18
+ addedTokens = new Map();
19
+ specialTokens = new Set();
20
+ modelType = 'BPE';
21
+ unkToken = '[UNK]';
22
+ continuingSubwordPrefix = '##';
23
+ // Special token IDs
24
+ padTokenId = 0;
25
+ unkTokenId = 0;
26
+ clsTokenId;
27
+ sepTokenId;
28
+ maskTokenId;
29
+ bosTokenId;
30
+ eosTokenId;
31
+ // Config
32
+ maxLength = 512;
33
+ doLowerCase = false;
34
+ stripAccents = false;
35
+ // Post-processor config
36
+ postProcessor;
37
+ // Byte encoder for BPE
38
+ byteEncoder = new Map();
39
+ byteDecoder = new Map();
40
+ constructor() {
41
+ this.initByteEncoder();
43
42
  }
44
43
  /**
45
- * Load vocabulary
44
+ * Initialize byte encoder/decoder for BPE
46
45
  */
47
- loadVocab(vocab) {
48
- if (vocab instanceof Map) {
49
- this.vocab = new Map(vocab);
50
- }
51
- else {
52
- this.vocab = new Map(Object.entries(vocab));
46
+ initByteEncoder() {
47
+ const bytes = [];
48
+ // Printable ASCII
49
+ for (let i = 33; i <= 126; i++)
50
+ bytes.push(i);
51
+ for (let i = 161; i <= 172; i++)
52
+ bytes.push(i);
53
+ for (let i = 174; i <= 255; i++)
54
+ bytes.push(i);
55
+ const chars = [...bytes];
56
+ let n = 0;
57
+ for (let i = 0; i < 256; i++) {
58
+ if (!bytes.includes(i)) {
59
+ bytes.push(i);
60
+ chars.push(256 + n);
61
+ n++;
62
+ }
53
63
  }
54
- // Build reverse vocab
55
- for (const [token, id] of this.vocab) {
56
- this.reverseVocab.set(id, token);
64
+ for (let i = 0; i < bytes.length; i++) {
65
+ const byte = bytes[i];
66
+ const char = String.fromCharCode(chars[i]);
67
+ this.byteEncoder.set(byte, char);
68
+ this.byteDecoder.set(char, byte);
57
69
  }
58
70
  }
59
71
  /**
60
- * Load BPE merges
72
+ * Load from HuggingFace tokenizer.json
61
73
  */
62
- loadMerges(merges) {
63
- for (const merge of merges) {
64
- const [a, b] = merge.split(' ');
65
- if (a && b) {
66
- this.merges.set(`${a} ${b}`, `${a}${b}`);
74
+ static async fromJSON(json) {
75
+ const tokenizer = new Tokenizer();
76
+ const data = typeof json === 'string' ? JSON.parse(json) : json;
77
+ // Load model config
78
+ if (data.model) {
79
+ tokenizer.modelType = data.model.type;
80
+ // Load vocabulary.
81
+ // BPE/WordPiece: vocab is an object { token: id }.
82
+ // Unigram (SentencePiece): vocab is an array of [token, score] pairs
83
+ // where the array *index* is the token ID.
84
+ if (data.model.vocab) {
85
+ if (Array.isArray(data.model.vocab)) {
86
+ // Unigram format
87
+ const unigramVocab = data.model.vocab;
88
+ for (let i = 0; i < unigramVocab.length; i++) {
89
+ const entry = unigramVocab[i];
90
+ const token = Array.isArray(entry) ? entry[0] : entry;
91
+ tokenizer.vocab.set(token, i);
92
+ tokenizer.reverseVocab.set(i, token);
93
+ }
94
+ }
95
+ else {
96
+ for (const [token, id] of Object.entries(data.model.vocab)) {
97
+ tokenizer.vocab.set(token, id);
98
+ tokenizer.reverseVocab.set(id, token);
99
+ }
100
+ }
67
101
  }
68
- }
69
- }
70
- /**
71
- * Tokenize text
72
- */
73
- encode(text, options = {}) {
74
- const { addSpecialTokens = true, maxLength = this.config.maxLength, padding = 'max_length', truncation = true, returnAttentionMask = true, returnTokenTypeIds = false, } = options;
75
- // Tokenize
76
- let tokens = this.tokenize(text);
77
- // Add special tokens
78
- if (addSpecialTokens) {
79
- tokens = this.addSpecialTokens(tokens);
80
- }
81
- // Convert to IDs
82
- let inputIds = this.convertTokensToIds(tokens);
83
- // Truncate if needed
84
- if (truncation && inputIds.length > maxLength) {
85
- inputIds = inputIds.slice(0, maxLength);
86
- // Ensure EOS token if present
87
- if (addSpecialTokens && this.config.sepTokenId !== undefined) {
88
- inputIds[inputIds.length - 1] = this.config.sepTokenId;
102
+ // Load merges for BPE
103
+ if (data.model.merges) {
104
+ for (let i = 0; i < data.model.merges.length; i++) {
105
+ tokenizer.merges.set(data.model.merges[i], i);
106
+ }
89
107
  }
108
+ // Model-specific config
109
+ tokenizer.unkToken = data.model.unk_token ?? '[UNK]';
110
+ tokenizer.continuingSubwordPrefix = data.model.continuing_subword_prefix ?? '##';
90
111
  }
91
- // Create attention mask
92
- const attentionMask = returnAttentionMask
93
- ? inputIds.map(() => 1)
94
- : [];
95
- // Pad if needed
96
- if (padding === 'max_length' && inputIds.length < maxLength) {
97
- const padLength = maxLength - inputIds.length;
98
- inputIds = [...inputIds, ...new Array(padLength).fill(this.config.padTokenId)];
99
- if (returnAttentionMask) {
100
- attentionMask.push(...new Array(padLength).fill(0));
112
+ // Load added tokens
113
+ if (data.added_tokens) {
114
+ for (const token of data.added_tokens) {
115
+ tokenizer.addedTokens.set(token.content, token.id);
116
+ tokenizer.reverseVocab.set(token.id, token.content);
117
+ if (token.special) {
118
+ tokenizer.specialTokens.add(token.content);
119
+ }
120
+ // Detect special token types
121
+ const content = token.content.toLowerCase();
122
+ if (content.includes('pad'))
123
+ tokenizer.padTokenId = token.id;
124
+ if (content.includes('unk'))
125
+ tokenizer.unkTokenId = token.id;
126
+ if (content.includes('cls') || content === '[cls]')
127
+ tokenizer.clsTokenId = token.id;
128
+ if (content.includes('sep') || content === '[sep]')
129
+ tokenizer.sepTokenId = token.id;
130
+ if (content.includes('mask'))
131
+ tokenizer.maskTokenId = token.id;
132
+ if (content.includes('bos') || content === '<s>')
133
+ tokenizer.bosTokenId = token.id;
134
+ if (content.includes('eos') || content === '</s>')
135
+ tokenizer.eosTokenId = token.id;
101
136
  }
102
137
  }
103
- const result = {
104
- inputIds,
105
- attentionMask,
106
- };
107
- // Token type IDs (for segment embeddings)
108
- if (returnTokenTypeIds) {
109
- result.tokenTypeIds = inputIds.map(() => 0);
138
+ // Load normalizer config
139
+ if (data.normalizer) {
140
+ tokenizer.doLowerCase = data.normalizer.lowercase ?? false;
141
+ tokenizer.stripAccents = data.normalizer.strip_accents ?? false;
110
142
  }
111
- return result;
143
+ // Load truncation config
144
+ if (data.truncation) {
145
+ tokenizer.maxLength = data.truncation.max_length;
146
+ }
147
+ // Load post-processor
148
+ if (data.post_processor) {
149
+ tokenizer.postProcessor = data.post_processor;
150
+ }
151
+ return tokenizer;
112
152
  }
113
153
  /**
114
- * Batch encode
154
+ * Load from URL (tokenizer.json)
115
155
  */
116
- encodeBatch(texts, options = {}) {
117
- // Determine max length for 'longest' padding
118
- let maxLen = options.maxLength ?? this.config.maxLength;
119
- if (options.padding === 'longest') {
120
- const encodings = texts.map(text => this.encode(text, { ...options, padding: 'do_not_pad' }));
121
- maxLen = Math.max(...encodings.map(e => e.inputIds.length));
156
+ static async fromUrl(url) {
157
+ const response = await fetch(url);
158
+ if (!response.ok) {
159
+ throw new EdgeFlowError(`Failed to load tokenizer from ${url}: ${response.status}`, ErrorCodes.MODEL_NOT_FOUND);
122
160
  }
123
- return texts.map(text => this.encode(text, { ...options, maxLength: maxLen }));
161
+ const json = await response.json();
162
+ return Tokenizer.fromJSON(json);
124
163
  }
125
164
  /**
126
- * Decode token IDs back to text
165
+ * Load from HuggingFace Hub
127
166
  */
128
- decode(ids, skipSpecialTokens = true) {
129
- const tokens = this.convertIdsToTokens(ids);
130
- // Filter special tokens if requested
131
- const filteredTokens = skipSpecialTokens
132
- ? tokens.filter(token => !this.isSpecialToken(token))
133
- : tokens;
134
- return this.detokenize(filteredTokens);
167
+ static async fromHuggingFace(modelId, options) {
168
+ const revision = options?.revision ?? 'main';
169
+ const url = `https://huggingface.co/${modelId}/resolve/${revision}/tokenizer.json`;
170
+ return Tokenizer.fromUrl(url);
135
171
  }
136
172
  /**
137
- * Basic tokenization (split by whitespace and punctuation)
173
+ * Normalize text
138
174
  */
139
- tokenize(text) {
140
- // Normalize text
141
- const normalized = this.normalize(text);
142
- switch (this.model) {
143
- case 'bpe':
144
- return this.tokenizeBPE(normalized);
145
- case 'wordpiece':
146
- return this.tokenizeWordPiece(normalized);
147
- default:
148
- return this.tokenizeBasic(normalized);
175
+ normalize(text) {
176
+ let result = text;
177
+ if (this.doLowerCase) {
178
+ result = result.toLowerCase();
149
179
  }
180
+ if (this.stripAccents) {
181
+ result = result.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
182
+ }
183
+ // Normalize whitespace
184
+ result = result.replace(/\s+/g, ' ').trim();
185
+ return result;
150
186
  }
151
187
  /**
152
- * Normalize text
188
+ * Pre-tokenize text (split into words)
153
189
  */
154
- normalize(text) {
155
- return text
156
- .toLowerCase()
157
- .replace(/[^\w\s'-]/g, ' $& ')
158
- .replace(/\s+/g, ' ')
159
- .trim();
190
+ preTokenize(text) {
191
+ // GPT-2 style: split on whitespace and punctuation, keeping them
192
+ const pattern = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu;
193
+ const matches = text.match(pattern);
194
+ return matches ?? [text];
160
195
  }
161
196
  /**
162
- * Basic tokenization
197
+ * Encode text to bytes (for BPE)
163
198
  */
164
- tokenizeBasic(text) {
165
- return text.split(/\s+/).filter(t => t.length > 0);
199
+ textToBytes(text) {
200
+ const encoder = new TextEncoder();
201
+ const bytes = encoder.encode(text);
202
+ return Array.from(bytes).map(b => this.byteEncoder.get(b) ?? '').join('');
166
203
  }
167
204
  /**
168
- * WordPiece tokenization
205
+ * Decode bytes to text (for BPE)
169
206
  */
170
- tokenizeWordPiece(text) {
171
- const words = text.split(/\s+/).filter(w => w.length > 0);
172
- const tokens = [];
173
- for (const word of words) {
174
- const wordTokens = this.tokenizeWord(word);
175
- tokens.push(...wordTokens);
207
+ bytesToText(text) {
208
+ const bytes = new Uint8Array(text.split('').map(c => this.byteDecoder.get(c) ?? 0));
209
+ const decoder = new TextDecoder('utf-8', { fatal: false });
210
+ return decoder.decode(bytes);
211
+ }
212
+ /**
213
+ * Get BPE pairs from word
214
+ */
215
+ getPairs(word) {
216
+ const pairs = new Set();
217
+ for (let i = 0; i < word.length - 1; i++) {
218
+ pairs.add(`${word[i]} ${word[i + 1]}`);
176
219
  }
177
- return tokens;
220
+ return pairs;
178
221
  }
179
222
  /**
180
- * Tokenize a single word using WordPiece
223
+ * Apply BPE to a word
181
224
  */
182
- tokenizeWord(word) {
225
+ bpe(token) {
226
+ if (this.vocab.has(token)) {
227
+ return [token];
228
+ }
229
+ let word = token.split('');
230
+ let pairs = this.getPairs(word);
231
+ if (pairs.size === 0) {
232
+ return [token];
233
+ }
234
+ while (true) {
235
+ // Find the pair with lowest merge rank
236
+ let minPair = null;
237
+ let minRank = Infinity;
238
+ for (const pair of pairs) {
239
+ const rank = this.merges.get(pair);
240
+ if (rank !== undefined && rank < minRank) {
241
+ minRank = rank;
242
+ minPair = pair;
243
+ }
244
+ }
245
+ if (minPair === null)
246
+ break;
247
+ const parts = minPair.split(' ');
248
+ const first = parts[0];
249
+ const second = parts[1];
250
+ if (!first || !second)
251
+ break;
252
+ const newWord = [];
253
+ let i = 0;
254
+ while (i < word.length) {
255
+ const j = word.indexOf(first, i);
256
+ if (j === -1) {
257
+ newWord.push(...word.slice(i));
258
+ break;
259
+ }
260
+ newWord.push(...word.slice(i, j));
261
+ if (word[j] === first && j < word.length - 1 && word[j + 1] === second) {
262
+ newWord.push(first + second);
263
+ i = j + 2;
264
+ }
265
+ else {
266
+ newWord.push(word[j]);
267
+ i = j + 1;
268
+ }
269
+ }
270
+ word = newWord;
271
+ if (word.length === 1)
272
+ break;
273
+ pairs = this.getPairs(word);
274
+ }
275
+ return word;
276
+ }
277
+ /**
278
+ * WordPiece tokenization
279
+ */
280
+ wordPiece(word) {
183
281
  if (this.vocab.has(word)) {
184
282
  return [word];
185
283
  }
@@ -187,211 +285,397 @@ export class Tokenizer {
187
285
  let start = 0;
188
286
  while (start < word.length) {
189
287
  let end = word.length;
190
- let found = false;
288
+ let curSubstr = null;
191
289
  while (start < end) {
192
- const substr = start === 0 ? word.slice(start, end) : `##${word.slice(start, end)}`;
290
+ let substr = word.slice(start, end);
291
+ if (start > 0) {
292
+ substr = this.continuingSubwordPrefix + substr;
293
+ }
193
294
  if (this.vocab.has(substr)) {
194
- tokens.push(substr);
195
- found = true;
295
+ curSubstr = substr;
196
296
  break;
197
297
  }
198
298
  end--;
199
299
  }
200
- if (!found) {
201
- // Unknown character
202
- tokens.push('[UNK]');
300
+ if (curSubstr === null) {
301
+ tokens.push(this.unkToken);
203
302
  start++;
204
303
  }
205
304
  else {
305
+ tokens.push(curSubstr);
206
306
  start = end;
207
307
  }
208
308
  }
209
309
  return tokens;
210
310
  }
211
311
  /**
212
- * BPE tokenization
312
+ * Tokenize a single word
313
+ */
314
+ tokenizeWord(word) {
315
+ // Check added tokens first
316
+ if (this.addedTokens.has(word)) {
317
+ return [word];
318
+ }
319
+ switch (this.modelType) {
320
+ case 'BPE': {
321
+ // Convert to byte representation
322
+ const byteStr = this.textToBytes(word);
323
+ return this.bpe(byteStr);
324
+ }
325
+ case 'WordPiece':
326
+ return this.wordPiece(word);
327
+ case 'Unigram':
328
+ return this.unigramTokenize(word);
329
+ default:
330
+ return this.vocab.has(word) ? [word] : [this.unkToken];
331
+ }
332
+ }
333
+ /**
334
+ * Greedy longest-match tokenizer for SentencePiece Unigram models.
335
+ * Adds the U+2581 (▁) word-start prefix expected by SPM-based models.
213
336
  */
214
- tokenizeBPE(text) {
215
- const words = text.split(/\s+/).filter(w => w.length > 0);
337
+ unigramTokenize(word) {
338
+ // SentencePiece prepends ▁ to words that follow a space (i.e. the
339
+ // tokenizer receives individual words, so all of them get the prefix).
340
+ const prefixedWord = '\u2581' + word;
216
341
  const tokens = [];
217
- for (const word of words) {
218
- // Split word into characters
219
- let chars = word.split('').map((c, i) => i === word.length - 1 ? c + '</w>' : c);
220
- // Apply merges iteratively
221
- while (chars.length > 1) {
222
- let minPair = null;
223
- let minScore = Infinity;
224
- for (let i = 0; i < chars.length - 1; i++) {
225
- const pair = `${chars[i]} ${chars[i + 1]}`;
226
- if (this.merges.has(pair)) {
227
- const score = Array.from(this.merges.keys()).indexOf(pair);
228
- if (score < minScore) {
229
- minScore = score;
230
- minPair = [i, pair];
231
- }
232
- }
233
- }
234
- if (!minPair)
342
+ let start = 0;
343
+ const text = prefixedWord;
344
+ while (start < text.length) {
345
+ let end = text.length;
346
+ let found = false;
347
+ // Greedy longest-match scan
348
+ while (end > start) {
349
+ const sub = text.slice(start, end);
350
+ if (this.vocab.has(sub)) {
351
+ tokens.push(sub);
352
+ start = end;
353
+ found = true;
235
354
  break;
236
- const [idx, pair] = minPair;
237
- const merged = this.merges.get(pair);
238
- chars = [
239
- ...chars.slice(0, idx),
240
- merged,
241
- ...chars.slice(idx + 2),
242
- ];
355
+ }
356
+ end--;
357
+ }
358
+ if (!found) {
359
+ // Emit the single character (or unk if it's not in vocab either)
360
+ const ch = text[start];
361
+ tokens.push(this.vocab.has(ch) ? ch : this.unkToken);
362
+ start++;
243
363
  }
244
- tokens.push(...chars);
245
364
  }
246
- return tokens;
365
+ return tokens.length > 0 ? tokens : [this.unkToken];
247
366
  }
248
367
  /**
249
- * Add special tokens
368
+ * Main tokenization
250
369
  */
251
- addSpecialTokens(tokens) {
252
- const result = [];
253
- // Add CLS token
254
- if (this.config.clsTokenId !== undefined) {
255
- result.push('[CLS]');
370
+ tokenize(text) {
371
+ // Normalize
372
+ const normalized = this.normalize(text);
373
+ // Check for added tokens (special tokens)
374
+ const tokens = [];
375
+ let remaining = normalized;
376
+ // Sort added tokens by length (longest first) for greedy matching
377
+ const sortedAddedTokens = Array.from(this.addedTokens.keys())
378
+ .sort((a, b) => b.length - a.length);
379
+ // Split by added tokens
380
+ for (const addedToken of sortedAddedTokens) {
381
+ if (remaining.includes(addedToken)) {
382
+ const parts = remaining.split(addedToken);
383
+ const newRemaining = [];
384
+ for (let i = 0; i < parts.length; i++) {
385
+ if (parts[i]) {
386
+ newRemaining.push(parts[i]);
387
+ }
388
+ if (i < parts.length - 1) {
389
+ tokens.push(addedToken);
390
+ }
391
+ }
392
+ remaining = newRemaining.join(' ');
393
+ }
256
394
  }
257
- result.push(...tokens);
258
- // Add SEP token
259
- if (this.config.sepTokenId !== undefined) {
260
- result.push('[SEP]');
395
+ // Pre-tokenize remaining text
396
+ if (remaining.trim()) {
397
+ const words = this.preTokenize(remaining);
398
+ for (const word of words) {
399
+ if (!word)
400
+ continue;
401
+ const wordTokens = this.tokenizeWord(word);
402
+ tokens.push(...wordTokens);
403
+ }
261
404
  }
262
- return result;
405
+ return tokens;
263
406
  }
264
407
  /**
265
408
  * Convert tokens to IDs
266
409
  */
267
410
  convertTokensToIds(tokens) {
268
411
  return tokens.map(token => {
269
- const id = this.vocab.get(token);
270
- if (id !== undefined)
271
- return id;
272
- // Handle special tokens
273
- if (token === '[CLS]')
274
- return this.config.clsTokenId ?? this.config.unkTokenId;
275
- if (token === '[SEP]')
276
- return this.config.sepTokenId ?? this.config.unkTokenId;
277
- if (token === '[PAD]')
278
- return this.config.padTokenId;
279
- if (token === '[MASK]')
280
- return this.config.maskTokenId ?? this.config.unkTokenId;
281
- if (token === '[UNK]')
282
- return this.config.unkTokenId;
283
- return this.config.unkTokenId;
412
+ // Check added tokens first
413
+ const addedId = this.addedTokens.get(token);
414
+ if (addedId !== undefined)
415
+ return addedId;
416
+ // Check vocabulary
417
+ const vocabId = this.vocab.get(token);
418
+ if (vocabId !== undefined)
419
+ return vocabId;
420
+ // Return UNK
421
+ return this.unkTokenId;
284
422
  });
285
423
  }
286
424
  /**
287
425
  * Convert IDs to tokens
288
426
  */
289
427
  convertIdsToTokens(ids) {
290
- return ids.map(id => {
291
- const token = this.reverseVocab.get(id);
292
- if (token !== undefined)
293
- return token;
294
- // Handle special token IDs
295
- if (id === this.config.clsTokenId)
296
- return '[CLS]';
297
- if (id === this.config.sepTokenId)
298
- return '[SEP]';
299
- if (id === this.config.padTokenId)
300
- return '[PAD]';
301
- if (id === this.config.maskTokenId)
302
- return '[MASK]';
303
- if (id === this.config.unkTokenId)
304
- return '[UNK]';
305
- return '[UNK]';
306
- });
428
+ return ids.map(id => this.reverseVocab.get(id) ?? this.unkToken);
307
429
  }
308
430
  /**
309
- * Check if token is a special token
431
+ * Apply post-processing (add special tokens)
310
432
  */
311
- isSpecialToken(token) {
312
- return ['[CLS]', '[SEP]', '[PAD]', '[MASK]', '[UNK]'].includes(token);
433
+ postProcess(ids, pairIds) {
434
+ if (!this.postProcessor) {
435
+ // Default: [CLS] tokens [SEP] or [CLS] tokens [SEP] pair [SEP]
436
+ const result = [];
437
+ const typeIds = [];
438
+ if (this.clsTokenId !== undefined) {
439
+ result.push(this.clsTokenId);
440
+ typeIds.push(0);
441
+ }
442
+ result.push(...ids);
443
+ typeIds.push(...ids.map(() => 0));
444
+ if (this.sepTokenId !== undefined) {
445
+ result.push(this.sepTokenId);
446
+ typeIds.push(0);
447
+ }
448
+ if (pairIds) {
449
+ result.push(...pairIds);
450
+ typeIds.push(...pairIds.map(() => 1));
451
+ if (this.sepTokenId !== undefined) {
452
+ result.push(this.sepTokenId);
453
+ typeIds.push(1);
454
+ }
455
+ }
456
+ return { ids: result, typeIds };
457
+ }
458
+ // Use post-processor config
459
+ const template = pairIds ? this.postProcessor.pair : this.postProcessor.single;
460
+ if (!template) {
461
+ return { ids, typeIds: ids.map(() => 0) };
462
+ }
463
+ const result = [];
464
+ const typeIds = [];
465
+ for (const item of template) {
466
+ if ('SpecialToken' in item) {
467
+ const specialToken = this.postProcessor.special_tokens?.[item.SpecialToken.id];
468
+ if (specialToken) {
469
+ result.push(...specialToken.ids);
470
+ typeIds.push(...specialToken.ids.map(() => item.SpecialToken.type_id));
471
+ }
472
+ }
473
+ else if ('Sequence' in item) {
474
+ const seqIds = item.Sequence.id === 'A' ? ids : pairIds ?? [];
475
+ result.push(...seqIds);
476
+ typeIds.push(...seqIds.map(() => item.Sequence.type_id));
477
+ }
478
+ }
479
+ return { ids: result, typeIds };
480
+ }
481
+ /**
482
+ * Encode text
483
+ */
484
+ encode(text, options = {}) {
485
+ const { addSpecialTokens = true, maxLength = this.maxLength, padding = 'max_length', truncation = true, returnAttentionMask = true, returnTokenTypeIds = false, textPair, } = options;
486
+ // Tokenize
487
+ const tokens = this.tokenize(text);
488
+ let inputIds = this.convertTokensToIds(tokens);
489
+ // Tokenize pair if provided
490
+ let pairIds;
491
+ if (textPair) {
492
+ const pairTokens = this.tokenize(textPair);
493
+ pairIds = this.convertTokensToIds(pairTokens);
494
+ }
495
+ // Post-process (add special tokens)
496
+ let tokenTypeIds;
497
+ if (addSpecialTokens) {
498
+ const processed = this.postProcess(inputIds, pairIds);
499
+ inputIds = processed.ids;
500
+ if (returnTokenTypeIds) {
501
+ tokenTypeIds = processed.typeIds;
502
+ }
503
+ }
504
+ else if (pairIds) {
505
+ inputIds = [...inputIds, ...pairIds];
506
+ if (returnTokenTypeIds) {
507
+ tokenTypeIds = [...inputIds.map(() => 0), ...pairIds.map(() => 1)];
508
+ }
509
+ }
510
+ // Truncate
511
+ if (truncation && inputIds.length > maxLength) {
512
+ inputIds = inputIds.slice(0, maxLength);
513
+ if (tokenTypeIds) {
514
+ tokenTypeIds = tokenTypeIds.slice(0, maxLength);
515
+ }
516
+ }
517
+ // Create attention mask
518
+ let attentionMask = [];
519
+ if (returnAttentionMask) {
520
+ attentionMask = inputIds.map(() => 1);
521
+ }
522
+ // Padding
523
+ if (padding === 'max_length' && inputIds.length < maxLength) {
524
+ const padLength = maxLength - inputIds.length;
525
+ inputIds = [...inputIds, ...new Array(padLength).fill(this.padTokenId)];
526
+ if (returnAttentionMask) {
527
+ attentionMask = [...attentionMask, ...new Array(padLength).fill(0)];
528
+ }
529
+ if (tokenTypeIds) {
530
+ tokenTypeIds = [...tokenTypeIds, ...new Array(padLength).fill(0)];
531
+ }
532
+ }
533
+ const result = {
534
+ inputIds,
535
+ attentionMask,
536
+ };
537
+ if (returnTokenTypeIds && tokenTypeIds) {
538
+ result.tokenTypeIds = tokenTypeIds;
539
+ }
540
+ return result;
541
+ }
542
+ /**
543
+ * Batch encode
544
+ */
545
+ encodeBatch(texts, options = {}) {
546
+ // For 'longest' padding, first encode all without padding
547
+ if (options.padding === 'longest') {
548
+ const encodings = texts.map(t => this.encode(t, { ...options, padding: 'do_not_pad' }));
549
+ const maxLen = Math.max(...encodings.map(e => e.inputIds.length));
550
+ return texts.map(t => this.encode(t, { ...options, maxLength: maxLen, padding: 'max_length' }));
551
+ }
552
+ return texts.map(t => this.encode(t, options));
553
+ }
554
+ /**
555
+ * Decode IDs to text
556
+ */
557
+ decode(ids, skipSpecialTokens = true) {
558
+ let tokens = this.convertIdsToTokens(ids);
559
+ if (skipSpecialTokens) {
560
+ tokens = tokens.filter(t => !this.specialTokens.has(t));
561
+ }
562
+ if (this.modelType === 'BPE') {
563
+ // BPE: byte-level encoding, join raw and decode bytes
564
+ return this.bytesToText(tokens.join('')).replace(/\s+/g, ' ').trim();
565
+ }
566
+ if (this.modelType === 'WordPiece') {
567
+ // WordPiece: tokens starting with continuingSubwordPrefix (##) are
568
+ // subword continuations and must be appended to the previous word
569
+ // WITHOUT a space. All other tokens are word-starts and get a space.
570
+ const prefix = this.continuingSubwordPrefix; // '##'
571
+ const words = [];
572
+ for (const token of tokens) {
573
+ if (token.startsWith(prefix)) {
574
+ if (words.length > 0) {
575
+ words[words.length - 1] += token.slice(prefix.length);
576
+ }
577
+ else {
578
+ words.push(token.slice(prefix.length));
579
+ }
580
+ }
581
+ else {
582
+ words.push(token);
583
+ }
584
+ }
585
+ return words.join(' ').replace(/\s+/g, ' ').trim();
586
+ }
587
+ if (this.modelType === 'Unigram') {
588
+ // SentencePiece: ▁ marks word boundaries (replaces the leading space)
589
+ return tokens
590
+ .join('')
591
+ .replace(/\u2581/g, ' ')
592
+ .replace(/\s+/g, ' ')
593
+ .trim();
594
+ }
595
+ // Default: space-join
596
+ return tokens.join(' ').replace(/\s+/g, ' ').trim();
313
597
  }
314
598
  /**
315
- * Detokenize (convert tokens back to text)
599
+ * Decode batch
316
600
  */
317
- detokenize(tokens) {
318
- // Handle WordPiece
319
- const text = tokens
320
- .join(' ')
321
- .replace(/ ##/g, '')
322
- .replace(/<\/w>/g, ' ')
323
- .trim();
324
- return text;
601
+ decodeBatch(batchIds, skipSpecialTokens = true) {
602
+ return batchIds.map(ids => this.decode(ids, skipSpecialTokens));
325
603
  }
326
604
  /**
327
605
  * Get vocabulary size
328
606
  */
329
607
  get vocabSize() {
330
- return this.vocab.size;
608
+ return this.vocab.size + this.addedTokens.size;
609
+ }
610
+ /**
611
+ * Get special token IDs
612
+ */
613
+ getSpecialTokenIds() {
614
+ return {
615
+ padTokenId: this.padTokenId,
616
+ unkTokenId: this.unkTokenId,
617
+ clsTokenId: this.clsTokenId,
618
+ sepTokenId: this.sepTokenId,
619
+ maskTokenId: this.maskTokenId,
620
+ bosTokenId: this.bosTokenId,
621
+ eosTokenId: this.eosTokenId,
622
+ };
331
623
  }
332
624
  /**
333
625
  * Get config
334
626
  */
335
627
  getConfig() {
336
- return { ...this.config };
628
+ return {
629
+ vocabSize: this.vocabSize,
630
+ maxLength: this.maxLength,
631
+ padTokenId: this.padTokenId,
632
+ unkTokenId: this.unkTokenId,
633
+ clsTokenId: this.clsTokenId,
634
+ sepTokenId: this.sepTokenId,
635
+ maskTokenId: this.maskTokenId,
636
+ bosTokenId: this.bosTokenId,
637
+ eosTokenId: this.eosTokenId,
638
+ };
639
+ }
640
+ /**
641
+ * Check if token is special
642
+ */
643
+ isSpecialToken(token) {
644
+ return this.specialTokens.has(token);
645
+ }
646
+ /**
647
+ * Get token ID
648
+ */
649
+ getTokenId(token) {
650
+ return this.addedTokens.get(token) ?? this.vocab.get(token);
651
+ }
652
+ /**
653
+ * Get token from ID
654
+ */
655
+ getToken(id) {
656
+ return this.reverseVocab.get(id);
337
657
  }
338
658
  }
339
659
  // ============================================================================
340
- // Pre-trained Tokenizers
660
+ // Factory Functions
341
661
  // ============================================================================
342
662
  /**
343
- * Create a basic English tokenizer
663
+ * Create a basic English tokenizer (for testing)
344
664
  */
345
665
  export function createBasicTokenizer() {
346
- // Create basic vocabulary
347
- const vocab = {
348
- '[PAD]': 0,
349
- '[UNK]': 1,
350
- '[CLS]': 2,
351
- '[SEP]': 3,
352
- '[MASK]': 4,
353
- };
354
- // Add common words
355
- const commonWords = [
356
- 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
357
- 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
358
- 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used',
359
- 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
360
- 'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
361
- 'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'whose',
362
- 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', 'as', 'if', 'when', 'while',
363
- 'not', 'no', 'yes', 'all', 'any', 'both', 'each', 'every', 'few', 'more', 'most',
364
- 'other', 'some', 'such', 'only', 'own', 'same', 'than', 'too', 'very',
365
- 'good', 'bad', 'great', 'new', 'old', 'high', 'low', 'big', 'small', 'long', 'short',
366
- 'love', 'like', 'hate', 'want', 'need', 'think', 'know', 'feel', 'see', 'hear',
367
- ];
368
- let id = 5;
369
- for (const word of commonWords) {
370
- vocab[word] = id++;
371
- }
372
- return new Tokenizer({
373
- vocabSize: id,
374
- maxLength: 128,
375
- padTokenId: 0,
376
- unkTokenId: 1,
377
- clsTokenId: 2,
378
- sepTokenId: 3,
379
- maskTokenId: 4,
380
- }, { vocab, model: 'basic' });
666
+ const tokenizer = new Tokenizer();
667
+ return tokenizer;
381
668
  }
382
669
  /**
383
670
  * Load tokenizer from URL
384
671
  */
385
672
  export async function loadTokenizer(url) {
386
- const response = await fetch(url);
387
- if (!response.ok) {
388
- throw new EdgeFlowError(`Failed to load tokenizer from ${url}`, ErrorCodes.MODEL_NOT_FOUND);
389
- }
390
- const data = await response.json();
391
- return new Tokenizer(data.config ?? {}, {
392
- vocab: data.vocab,
393
- merges: data.merges,
394
- model: data.model,
395
- });
673
+ return Tokenizer.fromUrl(url);
674
+ }
675
/**
 * Load a tokenizer from the HuggingFace Hub.
 *
 * Thin convenience wrapper that delegates to Tokenizer.fromHuggingFace.
 *
 * @param {string} modelId - Model identifier passed through unchanged.
 * @param {object} [options] - Passed through unchanged to Tokenizer.fromHuggingFace.
 * @returns {Promise<Tokenizer>} Resolves with the loaded tokenizer; rejects
 *   with whatever Tokenizer.fromHuggingFace rejects with.
 */
export async function loadTokenizerFromHub(modelId, options) {
    const tokenizer = await Tokenizer.fromHuggingFace(modelId, options);
    return tokenizer;
}
397
681
  //# sourceMappingURL=tokenizer.js.map