edgeflowjs 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +200 -66
- package/dist/backends/index.d.ts +9 -2
- package/dist/backends/index.d.ts.map +1 -1
- package/dist/backends/index.js +13 -13
- package/dist/backends/index.js.map +1 -1
- package/dist/backends/onnx.d.ts +11 -4
- package/dist/backends/onnx.d.ts.map +1 -1
- package/dist/backends/onnx.js +97 -78
- package/dist/backends/onnx.js.map +1 -1
- package/dist/backends/transformers-adapter.d.ts +99 -0
- package/dist/backends/transformers-adapter.d.ts.map +1 -0
- package/dist/backends/transformers-adapter.js +171 -0
- package/dist/backends/transformers-adapter.js.map +1 -0
- package/dist/backends/webgpu.d.ts +7 -5
- package/dist/backends/webgpu.d.ts.map +1 -1
- package/dist/backends/webgpu.js +7 -5
- package/dist/backends/webgpu.js.map +1 -1
- package/dist/backends/webnn.d.ts +6 -5
- package/dist/backends/webnn.d.ts.map +1 -1
- package/dist/backends/webnn.js +6 -5
- package/dist/backends/webnn.js.map +1 -1
- package/dist/core/composer.d.ts +118 -0
- package/dist/core/composer.d.ts.map +1 -0
- package/dist/core/composer.js +163 -0
- package/dist/core/composer.js.map +1 -0
- package/dist/core/device-profiler.d.ts +75 -0
- package/dist/core/device-profiler.d.ts.map +1 -0
- package/dist/core/device-profiler.js +131 -0
- package/dist/core/device-profiler.js.map +1 -0
- package/dist/core/index.d.ts +4 -0
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +8 -0
- package/dist/core/index.js.map +1 -1
- package/dist/core/memory.d.ts +22 -2
- package/dist/core/memory.d.ts.map +1 -1
- package/dist/core/memory.js +49 -13
- package/dist/core/memory.js.map +1 -1
- package/dist/core/plugin.d.ts +100 -0
- package/dist/core/plugin.d.ts.map +1 -0
- package/dist/core/plugin.js +106 -0
- package/dist/core/plugin.js.map +1 -0
- package/dist/core/runtime.d.ts +4 -0
- package/dist/core/runtime.d.ts.map +1 -1
- package/dist/core/runtime.js +18 -0
- package/dist/core/runtime.js.map +1 -1
- package/dist/core/scheduler.d.ts +17 -0
- package/dist/core/scheduler.d.ts.map +1 -1
- package/dist/core/scheduler.js +101 -3
- package/dist/core/scheduler.js.map +1 -1
- package/dist/core/types.d.ts +14 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/core/types.js.map +1 -1
- package/dist/core/worker.d.ts +202 -0
- package/dist/core/worker.d.ts.map +1 -0
- package/dist/core/worker.js +477 -0
- package/dist/core/worker.js.map +1 -0
- package/dist/edgeflow.browser.js +9770 -4383
- package/dist/edgeflow.browser.js.map +4 -4
- package/dist/edgeflow.browser.min.js +435 -5
- package/dist/edgeflow.browser.min.js.map +4 -4
- package/dist/index.d.ts +7 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +28 -10
- package/dist/index.js.map +1 -1
- package/dist/pipelines/automatic-speech-recognition.d.ts +63 -0
- package/dist/pipelines/automatic-speech-recognition.d.ts.map +1 -0
- package/dist/pipelines/automatic-speech-recognition.js +269 -0
- package/dist/pipelines/automatic-speech-recognition.js.map +1 -0
- package/dist/pipelines/base.d.ts +6 -1
- package/dist/pipelines/base.d.ts.map +1 -1
- package/dist/pipelines/base.js +12 -2
- package/dist/pipelines/base.js.map +1 -1
- package/dist/pipelines/feature-extraction.d.ts +5 -40
- package/dist/pipelines/feature-extraction.d.ts.map +1 -1
- package/dist/pipelines/feature-extraction.js +44 -63
- package/dist/pipelines/feature-extraction.js.map +1 -1
- package/dist/pipelines/image-classification.d.ts +4 -36
- package/dist/pipelines/image-classification.d.ts.map +1 -1
- package/dist/pipelines/image-classification.js +22 -60
- package/dist/pipelines/image-classification.js.map +1 -1
- package/dist/pipelines/image-segmentation.d.ts +221 -0
- package/dist/pipelines/image-segmentation.d.ts.map +1 -0
- package/dist/pipelines/image-segmentation.js +535 -0
- package/dist/pipelines/image-segmentation.js.map +1 -0
- package/dist/pipelines/index.d.ts +18 -0
- package/dist/pipelines/index.d.ts.map +1 -1
- package/dist/pipelines/index.js +51 -2
- package/dist/pipelines/index.js.map +1 -1
- package/dist/pipelines/object-detection.d.ts +44 -0
- package/dist/pipelines/object-detection.d.ts.map +1 -0
- package/dist/pipelines/object-detection.js +218 -0
- package/dist/pipelines/object-detection.js.map +1 -0
- package/dist/pipelines/question-answering.d.ts +41 -0
- package/dist/pipelines/question-answering.d.ts.map +1 -0
- package/dist/pipelines/question-answering.js +164 -0
- package/dist/pipelines/question-answering.js.map +1 -0
- package/dist/pipelines/text-classification.d.ts +3 -39
- package/dist/pipelines/text-classification.d.ts.map +1 -1
- package/dist/pipelines/text-classification.js +29 -67
- package/dist/pipelines/text-classification.js.map +1 -1
- package/dist/pipelines/text-generation.d.ts +281 -0
- package/dist/pipelines/text-generation.d.ts.map +1 -0
- package/dist/pipelines/text-generation.js +766 -0
- package/dist/pipelines/text-generation.js.map +1 -0
- package/dist/pipelines/zero-shot-classification.d.ts +45 -0
- package/dist/pipelines/zero-shot-classification.d.ts.map +1 -0
- package/dist/pipelines/zero-shot-classification.js +140 -0
- package/dist/pipelines/zero-shot-classification.js.map +1 -0
- package/dist/tools/benchmark.d.ts +92 -0
- package/dist/tools/benchmark.d.ts.map +1 -0
- package/dist/tools/benchmark.js +213 -0
- package/dist/tools/benchmark.js.map +1 -0
- package/dist/tools/debugger.d.ts +258 -0
- package/dist/tools/debugger.d.ts.map +1 -0
- package/dist/tools/debugger.js +624 -0
- package/dist/tools/debugger.js.map +1 -0
- package/dist/tools/index.d.ts +8 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +16 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/monitor.d.ts +284 -0
- package/dist/tools/monitor.d.ts.map +1 -0
- package/dist/tools/monitor.js +921 -0
- package/dist/tools/monitor.js.map +1 -0
- package/dist/tools/quantization.d.ts +235 -0
- package/dist/tools/quantization.d.ts.map +1 -0
- package/dist/tools/quantization.js +830 -0
- package/dist/tools/quantization.js.map +1 -0
- package/dist/utils/hub.d.ts +162 -0
- package/dist/utils/hub.d.ts.map +1 -0
- package/dist/utils/hub.js +311 -0
- package/dist/utils/hub.js.map +1 -0
- package/dist/utils/index.d.ts +3 -1
- package/dist/utils/index.d.ts.map +1 -1
- package/dist/utils/index.js +5 -1
- package/dist/utils/index.js.map +1 -1
- package/dist/utils/model-loader.d.ts.map +1 -1
- package/dist/utils/model-loader.js +106 -30
- package/dist/utils/model-loader.js.map +1 -1
- package/dist/utils/offline.d.ts +147 -0
- package/dist/utils/offline.d.ts.map +1 -0
- package/dist/utils/offline.js +405 -0
- package/dist/utils/offline.js.map +1 -0
- package/dist/utils/preprocessor.d.ts +82 -6
- package/dist/utils/preprocessor.d.ts.map +1 -1
- package/dist/utils/preprocessor.js +278 -21
- package/dist/utils/preprocessor.js.map +1 -1
- package/dist/utils/tokenizer.d.ts +197 -72
- package/dist/utils/tokenizer.d.ts.map +1 -1
- package/dist/utils/tokenizer.js +558 -274
- package/dist/utils/tokenizer.js.map +1 -1
- package/package.json +26 -11
package/dist/utils/tokenizer.js
CHANGED
|
@@ -1,185 +1,283 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* edgeFlow.js - Tokenizer
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* Supports BPE, WordPiece, and
|
|
4
|
+
* Full-featured tokenizer supporting HuggingFace tokenizer.json format.
|
|
5
|
+
* Supports BPE, WordPiece, and Unigram tokenization.
|
|
6
6
|
*/
|
|
7
7
|
import { EdgeFlowError, ErrorCodes, } from '../core/types.js';
|
|
8
8
|
// ============================================================================
|
|
9
|
-
//
|
|
9
|
+
// Tokenizer Implementation
|
|
10
10
|
// ============================================================================
|
|
11
11
|
/**
|
|
12
|
-
* Tokenizer -
|
|
12
|
+
* Tokenizer - Full-featured tokenizer supporting HuggingFace format
|
|
13
13
|
*/
|
|
14
14
|
export class Tokenizer {
|
|
15
|
-
vocab;
|
|
16
|
-
reverseVocab;
|
|
17
|
-
config;
|
|
18
|
-
model;
|
|
15
|
+
vocab = new Map();
|
|
16
|
+
reverseVocab = new Map();
|
|
19
17
|
merges = new Map();
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
18
|
+
addedTokens = new Map();
|
|
19
|
+
specialTokens = new Set();
|
|
20
|
+
modelType = 'BPE';
|
|
21
|
+
unkToken = '[UNK]';
|
|
22
|
+
continuingSubwordPrefix = '##';
|
|
23
|
+
// Special token IDs
|
|
24
|
+
padTokenId = 0;
|
|
25
|
+
unkTokenId = 0;
|
|
26
|
+
clsTokenId;
|
|
27
|
+
sepTokenId;
|
|
28
|
+
maskTokenId;
|
|
29
|
+
bosTokenId;
|
|
30
|
+
eosTokenId;
|
|
31
|
+
// Config
|
|
32
|
+
maxLength = 512;
|
|
33
|
+
doLowerCase = false;
|
|
34
|
+
stripAccents = false;
|
|
35
|
+
// Post-processor config
|
|
36
|
+
postProcessor;
|
|
37
|
+
// Byte encoder for BPE
|
|
38
|
+
byteEncoder = new Map();
|
|
39
|
+
byteDecoder = new Map();
|
|
40
|
+
constructor() {
|
|
41
|
+
this.initByteEncoder();
|
|
43
42
|
}
|
|
44
43
|
/**
|
|
45
|
-
*
|
|
44
|
+
* Initialize byte encoder/decoder for BPE
|
|
46
45
|
*/
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
46
|
+
initByteEncoder() {
|
|
47
|
+
const bytes = [];
|
|
48
|
+
// Printable ASCII
|
|
49
|
+
for (let i = 33; i <= 126; i++)
|
|
50
|
+
bytes.push(i);
|
|
51
|
+
for (let i = 161; i <= 172; i++)
|
|
52
|
+
bytes.push(i);
|
|
53
|
+
for (let i = 174; i <= 255; i++)
|
|
54
|
+
bytes.push(i);
|
|
55
|
+
const chars = [...bytes];
|
|
56
|
+
let n = 0;
|
|
57
|
+
for (let i = 0; i < 256; i++) {
|
|
58
|
+
if (!bytes.includes(i)) {
|
|
59
|
+
bytes.push(i);
|
|
60
|
+
chars.push(256 + n);
|
|
61
|
+
n++;
|
|
62
|
+
}
|
|
53
63
|
}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
64
|
+
for (let i = 0; i < bytes.length; i++) {
|
|
65
|
+
const byte = bytes[i];
|
|
66
|
+
const char = String.fromCharCode(chars[i]);
|
|
67
|
+
this.byteEncoder.set(byte, char);
|
|
68
|
+
this.byteDecoder.set(char, byte);
|
|
57
69
|
}
|
|
58
70
|
}
|
|
59
71
|
/**
|
|
60
|
-
* Load
|
|
72
|
+
* Load from HuggingFace tokenizer.json
|
|
61
73
|
*/
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
74
|
+
static async fromJSON(json) {
|
|
75
|
+
const tokenizer = new Tokenizer();
|
|
76
|
+
const data = typeof json === 'string' ? JSON.parse(json) : json;
|
|
77
|
+
// Load model config
|
|
78
|
+
if (data.model) {
|
|
79
|
+
tokenizer.modelType = data.model.type;
|
|
80
|
+
// Load vocabulary.
|
|
81
|
+
// BPE/WordPiece: vocab is an object { token: id }.
|
|
82
|
+
// Unigram (SentencePiece): vocab is an array of [token, score] pairs
|
|
83
|
+
// where the array *index* is the token ID.
|
|
84
|
+
if (data.model.vocab) {
|
|
85
|
+
if (Array.isArray(data.model.vocab)) {
|
|
86
|
+
// Unigram format
|
|
87
|
+
const unigramVocab = data.model.vocab;
|
|
88
|
+
for (let i = 0; i < unigramVocab.length; i++) {
|
|
89
|
+
const entry = unigramVocab[i];
|
|
90
|
+
const token = Array.isArray(entry) ? entry[0] : entry;
|
|
91
|
+
tokenizer.vocab.set(token, i);
|
|
92
|
+
tokenizer.reverseVocab.set(i, token);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
else {
|
|
96
|
+
for (const [token, id] of Object.entries(data.model.vocab)) {
|
|
97
|
+
tokenizer.vocab.set(token, id);
|
|
98
|
+
tokenizer.reverseVocab.set(id, token);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
67
101
|
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
encode(text, options = {}) {
|
|
74
|
-
const { addSpecialTokens = true, maxLength = this.config.maxLength, padding = 'max_length', truncation = true, returnAttentionMask = true, returnTokenTypeIds = false, } = options;
|
|
75
|
-
// Tokenize
|
|
76
|
-
let tokens = this.tokenize(text);
|
|
77
|
-
// Add special tokens
|
|
78
|
-
if (addSpecialTokens) {
|
|
79
|
-
tokens = this.addSpecialTokens(tokens);
|
|
80
|
-
}
|
|
81
|
-
// Convert to IDs
|
|
82
|
-
let inputIds = this.convertTokensToIds(tokens);
|
|
83
|
-
// Truncate if needed
|
|
84
|
-
if (truncation && inputIds.length > maxLength) {
|
|
85
|
-
inputIds = inputIds.slice(0, maxLength);
|
|
86
|
-
// Ensure EOS token if present
|
|
87
|
-
if (addSpecialTokens && this.config.sepTokenId !== undefined) {
|
|
88
|
-
inputIds[inputIds.length - 1] = this.config.sepTokenId;
|
|
102
|
+
// Load merges for BPE
|
|
103
|
+
if (data.model.merges) {
|
|
104
|
+
for (let i = 0; i < data.model.merges.length; i++) {
|
|
105
|
+
tokenizer.merges.set(data.model.merges[i], i);
|
|
106
|
+
}
|
|
89
107
|
}
|
|
108
|
+
// Model-specific config
|
|
109
|
+
tokenizer.unkToken = data.model.unk_token ?? '[UNK]';
|
|
110
|
+
tokenizer.continuingSubwordPrefix = data.model.continuing_subword_prefix ?? '##';
|
|
90
111
|
}
|
|
91
|
-
//
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
112
|
+
// Load added tokens
|
|
113
|
+
if (data.added_tokens) {
|
|
114
|
+
for (const token of data.added_tokens) {
|
|
115
|
+
tokenizer.addedTokens.set(token.content, token.id);
|
|
116
|
+
tokenizer.reverseVocab.set(token.id, token.content);
|
|
117
|
+
if (token.special) {
|
|
118
|
+
tokenizer.specialTokens.add(token.content);
|
|
119
|
+
}
|
|
120
|
+
// Detect special token types
|
|
121
|
+
const content = token.content.toLowerCase();
|
|
122
|
+
if (content.includes('pad'))
|
|
123
|
+
tokenizer.padTokenId = token.id;
|
|
124
|
+
if (content.includes('unk'))
|
|
125
|
+
tokenizer.unkTokenId = token.id;
|
|
126
|
+
if (content.includes('cls') || content === '[cls]')
|
|
127
|
+
tokenizer.clsTokenId = token.id;
|
|
128
|
+
if (content.includes('sep') || content === '[sep]')
|
|
129
|
+
tokenizer.sepTokenId = token.id;
|
|
130
|
+
if (content.includes('mask'))
|
|
131
|
+
tokenizer.maskTokenId = token.id;
|
|
132
|
+
if (content.includes('bos') || content === '<s>')
|
|
133
|
+
tokenizer.bosTokenId = token.id;
|
|
134
|
+
if (content.includes('eos') || content === '</s>')
|
|
135
|
+
tokenizer.eosTokenId = token.id;
|
|
101
136
|
}
|
|
102
137
|
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
// Token type IDs (for segment embeddings)
|
|
108
|
-
if (returnTokenTypeIds) {
|
|
109
|
-
result.tokenTypeIds = inputIds.map(() => 0);
|
|
138
|
+
// Load normalizer config
|
|
139
|
+
if (data.normalizer) {
|
|
140
|
+
tokenizer.doLowerCase = data.normalizer.lowercase ?? false;
|
|
141
|
+
tokenizer.stripAccents = data.normalizer.strip_accents ?? false;
|
|
110
142
|
}
|
|
111
|
-
|
|
143
|
+
// Load truncation config
|
|
144
|
+
if (data.truncation) {
|
|
145
|
+
tokenizer.maxLength = data.truncation.max_length;
|
|
146
|
+
}
|
|
147
|
+
// Load post-processor
|
|
148
|
+
if (data.post_processor) {
|
|
149
|
+
tokenizer.postProcessor = data.post_processor;
|
|
150
|
+
}
|
|
151
|
+
return tokenizer;
|
|
112
152
|
}
|
|
113
153
|
/**
|
|
114
|
-
*
|
|
154
|
+
* Load from URL (tokenizer.json)
|
|
115
155
|
*/
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
const encodings = texts.map(text => this.encode(text, { ...options, padding: 'do_not_pad' }));
|
|
121
|
-
maxLen = Math.max(...encodings.map(e => e.inputIds.length));
|
|
156
|
+
static async fromUrl(url) {
|
|
157
|
+
const response = await fetch(url);
|
|
158
|
+
if (!response.ok) {
|
|
159
|
+
throw new EdgeFlowError(`Failed to load tokenizer from ${url}: ${response.status}`, ErrorCodes.MODEL_NOT_FOUND);
|
|
122
160
|
}
|
|
123
|
-
|
|
161
|
+
const json = await response.json();
|
|
162
|
+
return Tokenizer.fromJSON(json);
|
|
124
163
|
}
|
|
125
164
|
/**
|
|
126
|
-
*
|
|
165
|
+
* Load from HuggingFace Hub
|
|
127
166
|
*/
|
|
128
|
-
|
|
129
|
-
const
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
? tokens.filter(token => !this.isSpecialToken(token))
|
|
133
|
-
: tokens;
|
|
134
|
-
return this.detokenize(filteredTokens);
|
|
167
|
+
static async fromHuggingFace(modelId, options) {
|
|
168
|
+
const revision = options?.revision ?? 'main';
|
|
169
|
+
const url = `https://huggingface.co/${modelId}/resolve/${revision}/tokenizer.json`;
|
|
170
|
+
return Tokenizer.fromUrl(url);
|
|
135
171
|
}
|
|
136
172
|
/**
|
|
137
|
-
*
|
|
173
|
+
* Normalize text
|
|
138
174
|
*/
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
case 'bpe':
|
|
144
|
-
return this.tokenizeBPE(normalized);
|
|
145
|
-
case 'wordpiece':
|
|
146
|
-
return this.tokenizeWordPiece(normalized);
|
|
147
|
-
default:
|
|
148
|
-
return this.tokenizeBasic(normalized);
|
|
175
|
+
normalize(text) {
|
|
176
|
+
let result = text;
|
|
177
|
+
if (this.doLowerCase) {
|
|
178
|
+
result = result.toLowerCase();
|
|
149
179
|
}
|
|
180
|
+
if (this.stripAccents) {
|
|
181
|
+
result = result.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
|
|
182
|
+
}
|
|
183
|
+
// Normalize whitespace
|
|
184
|
+
result = result.replace(/\s+/g, ' ').trim();
|
|
185
|
+
return result;
|
|
150
186
|
}
|
|
151
187
|
/**
|
|
152
|
-
*
|
|
188
|
+
* Pre-tokenize text (split into words)
|
|
153
189
|
*/
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
.trim();
|
|
190
|
+
preTokenize(text) {
|
|
191
|
+
// GPT-2 style: split on whitespace and punctuation, keeping them
|
|
192
|
+
const pattern = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu;
|
|
193
|
+
const matches = text.match(pattern);
|
|
194
|
+
return matches ?? [text];
|
|
160
195
|
}
|
|
161
196
|
/**
|
|
162
|
-
*
|
|
197
|
+
* Encode text to bytes (for BPE)
|
|
163
198
|
*/
|
|
164
|
-
|
|
165
|
-
|
|
199
|
+
textToBytes(text) {
|
|
200
|
+
const encoder = new TextEncoder();
|
|
201
|
+
const bytes = encoder.encode(text);
|
|
202
|
+
return Array.from(bytes).map(b => this.byteEncoder.get(b) ?? '').join('');
|
|
166
203
|
}
|
|
167
204
|
/**
|
|
168
|
-
*
|
|
205
|
+
* Decode bytes to text (for BPE)
|
|
169
206
|
*/
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
const
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
207
|
+
bytesToText(text) {
|
|
208
|
+
const bytes = new Uint8Array(text.split('').map(c => this.byteDecoder.get(c) ?? 0));
|
|
209
|
+
const decoder = new TextDecoder('utf-8', { fatal: false });
|
|
210
|
+
return decoder.decode(bytes);
|
|
211
|
+
}
|
|
212
|
+
/**
|
|
213
|
+
* Get BPE pairs from word
|
|
214
|
+
*/
|
|
215
|
+
getPairs(word) {
|
|
216
|
+
const pairs = new Set();
|
|
217
|
+
for (let i = 0; i < word.length - 1; i++) {
|
|
218
|
+
pairs.add(`${word[i]} ${word[i + 1]}`);
|
|
176
219
|
}
|
|
177
|
-
return
|
|
220
|
+
return pairs;
|
|
178
221
|
}
|
|
179
222
|
/**
|
|
180
|
-
*
|
|
223
|
+
* Apply BPE to a word
|
|
181
224
|
*/
|
|
182
|
-
|
|
225
|
+
bpe(token) {
|
|
226
|
+
if (this.vocab.has(token)) {
|
|
227
|
+
return [token];
|
|
228
|
+
}
|
|
229
|
+
let word = token.split('');
|
|
230
|
+
let pairs = this.getPairs(word);
|
|
231
|
+
if (pairs.size === 0) {
|
|
232
|
+
return [token];
|
|
233
|
+
}
|
|
234
|
+
while (true) {
|
|
235
|
+
// Find the pair with lowest merge rank
|
|
236
|
+
let minPair = null;
|
|
237
|
+
let minRank = Infinity;
|
|
238
|
+
for (const pair of pairs) {
|
|
239
|
+
const rank = this.merges.get(pair);
|
|
240
|
+
if (rank !== undefined && rank < minRank) {
|
|
241
|
+
minRank = rank;
|
|
242
|
+
minPair = pair;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
if (minPair === null)
|
|
246
|
+
break;
|
|
247
|
+
const parts = minPair.split(' ');
|
|
248
|
+
const first = parts[0];
|
|
249
|
+
const second = parts[1];
|
|
250
|
+
if (!first || !second)
|
|
251
|
+
break;
|
|
252
|
+
const newWord = [];
|
|
253
|
+
let i = 0;
|
|
254
|
+
while (i < word.length) {
|
|
255
|
+
const j = word.indexOf(first, i);
|
|
256
|
+
if (j === -1) {
|
|
257
|
+
newWord.push(...word.slice(i));
|
|
258
|
+
break;
|
|
259
|
+
}
|
|
260
|
+
newWord.push(...word.slice(i, j));
|
|
261
|
+
if (word[j] === first && j < word.length - 1 && word[j + 1] === second) {
|
|
262
|
+
newWord.push(first + second);
|
|
263
|
+
i = j + 2;
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
266
|
+
newWord.push(word[j]);
|
|
267
|
+
i = j + 1;
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
word = newWord;
|
|
271
|
+
if (word.length === 1)
|
|
272
|
+
break;
|
|
273
|
+
pairs = this.getPairs(word);
|
|
274
|
+
}
|
|
275
|
+
return word;
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* WordPiece tokenization
|
|
279
|
+
*/
|
|
280
|
+
wordPiece(word) {
|
|
183
281
|
if (this.vocab.has(word)) {
|
|
184
282
|
return [word];
|
|
185
283
|
}
|
|
@@ -187,211 +285,397 @@ export class Tokenizer {
|
|
|
187
285
|
let start = 0;
|
|
188
286
|
while (start < word.length) {
|
|
189
287
|
let end = word.length;
|
|
190
|
-
let
|
|
288
|
+
let curSubstr = null;
|
|
191
289
|
while (start < end) {
|
|
192
|
-
|
|
290
|
+
let substr = word.slice(start, end);
|
|
291
|
+
if (start > 0) {
|
|
292
|
+
substr = this.continuingSubwordPrefix + substr;
|
|
293
|
+
}
|
|
193
294
|
if (this.vocab.has(substr)) {
|
|
194
|
-
|
|
195
|
-
found = true;
|
|
295
|
+
curSubstr = substr;
|
|
196
296
|
break;
|
|
197
297
|
}
|
|
198
298
|
end--;
|
|
199
299
|
}
|
|
200
|
-
if (
|
|
201
|
-
|
|
202
|
-
tokens.push('[UNK]');
|
|
300
|
+
if (curSubstr === null) {
|
|
301
|
+
tokens.push(this.unkToken);
|
|
203
302
|
start++;
|
|
204
303
|
}
|
|
205
304
|
else {
|
|
305
|
+
tokens.push(curSubstr);
|
|
206
306
|
start = end;
|
|
207
307
|
}
|
|
208
308
|
}
|
|
209
309
|
return tokens;
|
|
210
310
|
}
|
|
211
311
|
/**
|
|
212
|
-
*
|
|
312
|
+
* Tokenize a single word
|
|
313
|
+
*/
|
|
314
|
+
tokenizeWord(word) {
|
|
315
|
+
// Check added tokens first
|
|
316
|
+
if (this.addedTokens.has(word)) {
|
|
317
|
+
return [word];
|
|
318
|
+
}
|
|
319
|
+
switch (this.modelType) {
|
|
320
|
+
case 'BPE': {
|
|
321
|
+
// Convert to byte representation
|
|
322
|
+
const byteStr = this.textToBytes(word);
|
|
323
|
+
return this.bpe(byteStr);
|
|
324
|
+
}
|
|
325
|
+
case 'WordPiece':
|
|
326
|
+
return this.wordPiece(word);
|
|
327
|
+
case 'Unigram':
|
|
328
|
+
return this.unigramTokenize(word);
|
|
329
|
+
default:
|
|
330
|
+
return this.vocab.has(word) ? [word] : [this.unkToken];
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Greedy longest-match tokenizer for SentencePiece Unigram models.
|
|
335
|
+
* Adds the U+2581 (▁) word-start prefix expected by SPM-based models.
|
|
213
336
|
*/
|
|
214
|
-
|
|
215
|
-
|
|
337
|
+
unigramTokenize(word) {
|
|
338
|
+
// SentencePiece prepends ▁ to words that follow a space (i.e. the
|
|
339
|
+
// tokenizer receives individual words, so all of them get the prefix).
|
|
340
|
+
const prefixedWord = '\u2581' + word;
|
|
216
341
|
const tokens = [];
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
minScore = score;
|
|
230
|
-
minPair = [i, pair];
|
|
231
|
-
}
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
|
-
if (!minPair)
|
|
342
|
+
let start = 0;
|
|
343
|
+
const text = prefixedWord;
|
|
344
|
+
while (start < text.length) {
|
|
345
|
+
let end = text.length;
|
|
346
|
+
let found = false;
|
|
347
|
+
// Greedy longest-match scan
|
|
348
|
+
while (end > start) {
|
|
349
|
+
const sub = text.slice(start, end);
|
|
350
|
+
if (this.vocab.has(sub)) {
|
|
351
|
+
tokens.push(sub);
|
|
352
|
+
start = end;
|
|
353
|
+
found = true;
|
|
235
354
|
break;
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
355
|
+
}
|
|
356
|
+
end--;
|
|
357
|
+
}
|
|
358
|
+
if (!found) {
|
|
359
|
+
// Emit the single character (or unk if it's not in vocab either)
|
|
360
|
+
const ch = text[start];
|
|
361
|
+
tokens.push(this.vocab.has(ch) ? ch : this.unkToken);
|
|
362
|
+
start++;
|
|
243
363
|
}
|
|
244
|
-
tokens.push(...chars);
|
|
245
364
|
}
|
|
246
|
-
return tokens;
|
|
365
|
+
return tokens.length > 0 ? tokens : [this.unkToken];
|
|
247
366
|
}
|
|
248
367
|
/**
|
|
249
|
-
*
|
|
368
|
+
* Main tokenization
|
|
250
369
|
*/
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
370
|
+
tokenize(text) {
|
|
371
|
+
// Normalize
|
|
372
|
+
const normalized = this.normalize(text);
|
|
373
|
+
// Check for added tokens (special tokens)
|
|
374
|
+
const tokens = [];
|
|
375
|
+
let remaining = normalized;
|
|
376
|
+
// Sort added tokens by length (longest first) for greedy matching
|
|
377
|
+
const sortedAddedTokens = Array.from(this.addedTokens.keys())
|
|
378
|
+
.sort((a, b) => b.length - a.length);
|
|
379
|
+
// Split by added tokens
|
|
380
|
+
for (const addedToken of sortedAddedTokens) {
|
|
381
|
+
if (remaining.includes(addedToken)) {
|
|
382
|
+
const parts = remaining.split(addedToken);
|
|
383
|
+
const newRemaining = [];
|
|
384
|
+
for (let i = 0; i < parts.length; i++) {
|
|
385
|
+
if (parts[i]) {
|
|
386
|
+
newRemaining.push(parts[i]);
|
|
387
|
+
}
|
|
388
|
+
if (i < parts.length - 1) {
|
|
389
|
+
tokens.push(addedToken);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
remaining = newRemaining.join(' ');
|
|
393
|
+
}
|
|
256
394
|
}
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
395
|
+
// Pre-tokenize remaining text
|
|
396
|
+
if (remaining.trim()) {
|
|
397
|
+
const words = this.preTokenize(remaining);
|
|
398
|
+
for (const word of words) {
|
|
399
|
+
if (!word)
|
|
400
|
+
continue;
|
|
401
|
+
const wordTokens = this.tokenizeWord(word);
|
|
402
|
+
tokens.push(...wordTokens);
|
|
403
|
+
}
|
|
261
404
|
}
|
|
262
|
-
return
|
|
405
|
+
return tokens;
|
|
263
406
|
}
|
|
264
407
|
/**
|
|
265
408
|
* Convert tokens to IDs
|
|
266
409
|
*/
|
|
267
410
|
convertTokensToIds(tokens) {
|
|
268
411
|
return tokens.map(token => {
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
if (
|
|
276
|
-
return
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
if (token === '[MASK]')
|
|
280
|
-
return this.config.maskTokenId ?? this.config.unkTokenId;
|
|
281
|
-
if (token === '[UNK]')
|
|
282
|
-
return this.config.unkTokenId;
|
|
283
|
-
return this.config.unkTokenId;
|
|
412
|
+
// Check added tokens first
|
|
413
|
+
const addedId = this.addedTokens.get(token);
|
|
414
|
+
if (addedId !== undefined)
|
|
415
|
+
return addedId;
|
|
416
|
+
// Check vocabulary
|
|
417
|
+
const vocabId = this.vocab.get(token);
|
|
418
|
+
if (vocabId !== undefined)
|
|
419
|
+
return vocabId;
|
|
420
|
+
// Return UNK
|
|
421
|
+
return this.unkTokenId;
|
|
284
422
|
});
|
|
285
423
|
}
|
|
286
424
|
/**
|
|
287
425
|
* Convert IDs to tokens
|
|
288
426
|
*/
|
|
289
427
|
convertIdsToTokens(ids) {
|
|
290
|
-
return ids.map(id =>
|
|
291
|
-
const token = this.reverseVocab.get(id);
|
|
292
|
-
if (token !== undefined)
|
|
293
|
-
return token;
|
|
294
|
-
// Handle special token IDs
|
|
295
|
-
if (id === this.config.clsTokenId)
|
|
296
|
-
return '[CLS]';
|
|
297
|
-
if (id === this.config.sepTokenId)
|
|
298
|
-
return '[SEP]';
|
|
299
|
-
if (id === this.config.padTokenId)
|
|
300
|
-
return '[PAD]';
|
|
301
|
-
if (id === this.config.maskTokenId)
|
|
302
|
-
return '[MASK]';
|
|
303
|
-
if (id === this.config.unkTokenId)
|
|
304
|
-
return '[UNK]';
|
|
305
|
-
return '[UNK]';
|
|
306
|
-
});
|
|
428
|
+
return ids.map(id => this.reverseVocab.get(id) ?? this.unkToken);
|
|
307
429
|
}
|
|
308
430
|
/**
|
|
309
|
-
*
|
|
431
|
+
* Apply post-processing (add special tokens)
|
|
310
432
|
*/
|
|
311
|
-
|
|
312
|
-
|
|
433
|
+
postProcess(ids, pairIds) {
|
|
434
|
+
if (!this.postProcessor) {
|
|
435
|
+
// Default: [CLS] tokens [SEP] or [CLS] tokens [SEP] pair [SEP]
|
|
436
|
+
const result = [];
|
|
437
|
+
const typeIds = [];
|
|
438
|
+
if (this.clsTokenId !== undefined) {
|
|
439
|
+
result.push(this.clsTokenId);
|
|
440
|
+
typeIds.push(0);
|
|
441
|
+
}
|
|
442
|
+
result.push(...ids);
|
|
443
|
+
typeIds.push(...ids.map(() => 0));
|
|
444
|
+
if (this.sepTokenId !== undefined) {
|
|
445
|
+
result.push(this.sepTokenId);
|
|
446
|
+
typeIds.push(0);
|
|
447
|
+
}
|
|
448
|
+
if (pairIds) {
|
|
449
|
+
result.push(...pairIds);
|
|
450
|
+
typeIds.push(...pairIds.map(() => 1));
|
|
451
|
+
if (this.sepTokenId !== undefined) {
|
|
452
|
+
result.push(this.sepTokenId);
|
|
453
|
+
typeIds.push(1);
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
return { ids: result, typeIds };
|
|
457
|
+
}
|
|
458
|
+
// Use post-processor config
|
|
459
|
+
const template = pairIds ? this.postProcessor.pair : this.postProcessor.single;
|
|
460
|
+
if (!template) {
|
|
461
|
+
return { ids, typeIds: ids.map(() => 0) };
|
|
462
|
+
}
|
|
463
|
+
const result = [];
|
|
464
|
+
const typeIds = [];
|
|
465
|
+
for (const item of template) {
|
|
466
|
+
if ('SpecialToken' in item) {
|
|
467
|
+
const specialToken = this.postProcessor.special_tokens?.[item.SpecialToken.id];
|
|
468
|
+
if (specialToken) {
|
|
469
|
+
result.push(...specialToken.ids);
|
|
470
|
+
typeIds.push(...specialToken.ids.map(() => item.SpecialToken.type_id));
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
else if ('Sequence' in item) {
|
|
474
|
+
const seqIds = item.Sequence.id === 'A' ? ids : pairIds ?? [];
|
|
475
|
+
result.push(...seqIds);
|
|
476
|
+
typeIds.push(...seqIds.map(() => item.Sequence.type_id));
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
return { ids: result, typeIds };
|
|
480
|
+
}
|
|
481
|
+
/**
|
|
482
|
+
* Encode text
|
|
483
|
+
*/
|
|
484
|
+
encode(text, options = {}) {
|
|
485
|
+
const { addSpecialTokens = true, maxLength = this.maxLength, padding = 'max_length', truncation = true, returnAttentionMask = true, returnTokenTypeIds = false, textPair, } = options;
|
|
486
|
+
// Tokenize
|
|
487
|
+
const tokens = this.tokenize(text);
|
|
488
|
+
let inputIds = this.convertTokensToIds(tokens);
|
|
489
|
+
// Tokenize pair if provided
|
|
490
|
+
let pairIds;
|
|
491
|
+
if (textPair) {
|
|
492
|
+
const pairTokens = this.tokenize(textPair);
|
|
493
|
+
pairIds = this.convertTokensToIds(pairTokens);
|
|
494
|
+
}
|
|
495
|
+
// Post-process (add special tokens)
|
|
496
|
+
let tokenTypeIds;
|
|
497
|
+
if (addSpecialTokens) {
|
|
498
|
+
const processed = this.postProcess(inputIds, pairIds);
|
|
499
|
+
inputIds = processed.ids;
|
|
500
|
+
if (returnTokenTypeIds) {
|
|
501
|
+
tokenTypeIds = processed.typeIds;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
else if (pairIds) {
|
|
505
|
+
inputIds = [...inputIds, ...pairIds];
|
|
506
|
+
if (returnTokenTypeIds) {
|
|
507
|
+
tokenTypeIds = [...inputIds.map(() => 0), ...pairIds.map(() => 1)];
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
// Truncate
|
|
511
|
+
if (truncation && inputIds.length > maxLength) {
|
|
512
|
+
inputIds = inputIds.slice(0, maxLength);
|
|
513
|
+
if (tokenTypeIds) {
|
|
514
|
+
tokenTypeIds = tokenTypeIds.slice(0, maxLength);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
// Create attention mask
|
|
518
|
+
let attentionMask = [];
|
|
519
|
+
if (returnAttentionMask) {
|
|
520
|
+
attentionMask = inputIds.map(() => 1);
|
|
521
|
+
}
|
|
522
|
+
// Padding
|
|
523
|
+
if (padding === 'max_length' && inputIds.length < maxLength) {
|
|
524
|
+
const padLength = maxLength - inputIds.length;
|
|
525
|
+
inputIds = [...inputIds, ...new Array(padLength).fill(this.padTokenId)];
|
|
526
|
+
if (returnAttentionMask) {
|
|
527
|
+
attentionMask = [...attentionMask, ...new Array(padLength).fill(0)];
|
|
528
|
+
}
|
|
529
|
+
if (tokenTypeIds) {
|
|
530
|
+
tokenTypeIds = [...tokenTypeIds, ...new Array(padLength).fill(0)];
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
const result = {
|
|
534
|
+
inputIds,
|
|
535
|
+
attentionMask,
|
|
536
|
+
};
|
|
537
|
+
if (returnTokenTypeIds && tokenTypeIds) {
|
|
538
|
+
result.tokenTypeIds = tokenTypeIds;
|
|
539
|
+
}
|
|
540
|
+
return result;
|
|
541
|
+
}
|
|
542
|
+
/**
|
|
543
|
+
* Batch encode
|
|
544
|
+
*/
|
|
545
|
+
encodeBatch(texts, options = {}) {
|
|
546
|
+
// For 'longest' padding, first encode all without padding
|
|
547
|
+
if (options.padding === 'longest') {
|
|
548
|
+
const encodings = texts.map(t => this.encode(t, { ...options, padding: 'do_not_pad' }));
|
|
549
|
+
const maxLen = Math.max(...encodings.map(e => e.inputIds.length));
|
|
550
|
+
return texts.map(t => this.encode(t, { ...options, maxLength: maxLen, padding: 'max_length' }));
|
|
551
|
+
}
|
|
552
|
+
return texts.map(t => this.encode(t, options));
|
|
553
|
+
}
|
|
554
|
+
/**
|
|
555
|
+
* Decode IDs to text
|
|
556
|
+
*/
|
|
557
|
+
decode(ids, skipSpecialTokens = true) {
|
|
558
|
+
let tokens = this.convertIdsToTokens(ids);
|
|
559
|
+
if (skipSpecialTokens) {
|
|
560
|
+
tokens = tokens.filter(t => !this.specialTokens.has(t));
|
|
561
|
+
}
|
|
562
|
+
if (this.modelType === 'BPE') {
|
|
563
|
+
// BPE: byte-level encoding, join raw and decode bytes
|
|
564
|
+
return this.bytesToText(tokens.join('')).replace(/\s+/g, ' ').trim();
|
|
565
|
+
}
|
|
566
|
+
if (this.modelType === 'WordPiece') {
|
|
567
|
+
// WordPiece: tokens starting with continuingSubwordPrefix (##) are
|
|
568
|
+
// subword continuations and must be appended to the previous word
|
|
569
|
+
// WITHOUT a space. All other tokens are word-starts and get a space.
|
|
570
|
+
const prefix = this.continuingSubwordPrefix; // '##'
|
|
571
|
+
const words = [];
|
|
572
|
+
for (const token of tokens) {
|
|
573
|
+
if (token.startsWith(prefix)) {
|
|
574
|
+
if (words.length > 0) {
|
|
575
|
+
words[words.length - 1] += token.slice(prefix.length);
|
|
576
|
+
}
|
|
577
|
+
else {
|
|
578
|
+
words.push(token.slice(prefix.length));
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
else {
|
|
582
|
+
words.push(token);
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
return words.join(' ').replace(/\s+/g, ' ').trim();
|
|
586
|
+
}
|
|
587
|
+
if (this.modelType === 'Unigram') {
|
|
588
|
+
// SentencePiece: ▁ marks word boundaries (replaces the leading space)
|
|
589
|
+
return tokens
|
|
590
|
+
.join('')
|
|
591
|
+
.replace(/\u2581/g, ' ')
|
|
592
|
+
.replace(/\s+/g, ' ')
|
|
593
|
+
.trim();
|
|
594
|
+
}
|
|
595
|
+
// Default: space-join
|
|
596
|
+
return tokens.join(' ').replace(/\s+/g, ' ').trim();
|
|
313
597
|
}
|
|
314
598
|
/**
|
|
315
|
-
*
|
|
599
|
+
* Decode batch
|
|
316
600
|
*/
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
const text = tokens
|
|
320
|
-
.join(' ')
|
|
321
|
-
.replace(/ ##/g, '')
|
|
322
|
-
.replace(/<\/w>/g, ' ')
|
|
323
|
-
.trim();
|
|
324
|
-
return text;
|
|
601
|
+
decodeBatch(batchIds, skipSpecialTokens = true) {
|
|
602
|
+
return batchIds.map(ids => this.decode(ids, skipSpecialTokens));
|
|
325
603
|
}
|
|
326
604
|
/**
|
|
327
605
|
* Get vocabulary size
|
|
328
606
|
*/
|
|
329
607
|
get vocabSize() {
|
|
330
|
-
return this.vocab.size;
|
|
608
|
+
return this.vocab.size + this.addedTokens.size;
|
|
609
|
+
}
|
|
610
|
+
/**
|
|
611
|
+
* Get special token IDs
|
|
612
|
+
*/
|
|
613
|
+
getSpecialTokenIds() {
|
|
614
|
+
return {
|
|
615
|
+
padTokenId: this.padTokenId,
|
|
616
|
+
unkTokenId: this.unkTokenId,
|
|
617
|
+
clsTokenId: this.clsTokenId,
|
|
618
|
+
sepTokenId: this.sepTokenId,
|
|
619
|
+
maskTokenId: this.maskTokenId,
|
|
620
|
+
bosTokenId: this.bosTokenId,
|
|
621
|
+
eosTokenId: this.eosTokenId,
|
|
622
|
+
};
|
|
331
623
|
}
|
|
332
624
|
/**
|
|
333
625
|
* Get config
|
|
334
626
|
*/
|
|
335
627
|
getConfig() {
|
|
336
|
-
return {
|
|
628
|
+
return {
|
|
629
|
+
vocabSize: this.vocabSize,
|
|
630
|
+
maxLength: this.maxLength,
|
|
631
|
+
padTokenId: this.padTokenId,
|
|
632
|
+
unkTokenId: this.unkTokenId,
|
|
633
|
+
clsTokenId: this.clsTokenId,
|
|
634
|
+
sepTokenId: this.sepTokenId,
|
|
635
|
+
maskTokenId: this.maskTokenId,
|
|
636
|
+
bosTokenId: this.bosTokenId,
|
|
637
|
+
eosTokenId: this.eosTokenId,
|
|
638
|
+
};
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Check if token is special
|
|
642
|
+
*/
|
|
643
|
+
isSpecialToken(token) {
|
|
644
|
+
return this.specialTokens.has(token);
|
|
645
|
+
}
|
|
646
|
+
/**
|
|
647
|
+
* Get token ID
|
|
648
|
+
*/
|
|
649
|
+
getTokenId(token) {
|
|
650
|
+
return this.addedTokens.get(token) ?? this.vocab.get(token);
|
|
651
|
+
}
|
|
652
|
+
/**
|
|
653
|
+
* Get token from ID
|
|
654
|
+
*/
|
|
655
|
+
getToken(id) {
|
|
656
|
+
return this.reverseVocab.get(id);
|
|
337
657
|
}
|
|
338
658
|
}
|
|
339
659
|
// ============================================================================
|
|
340
|
-
//
|
|
660
|
+
// Factory Functions
|
|
341
661
|
// ============================================================================
|
|
342
662
|
/**
|
|
343
|
-
* Create a basic English tokenizer
|
|
663
|
+
* Create a basic English tokenizer (for testing)
|
|
344
664
|
*/
|
|
345
665
|
export function createBasicTokenizer() {
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
'[PAD]': 0,
|
|
349
|
-
'[UNK]': 1,
|
|
350
|
-
'[CLS]': 2,
|
|
351
|
-
'[SEP]': 3,
|
|
352
|
-
'[MASK]': 4,
|
|
353
|
-
};
|
|
354
|
-
// Add common words
|
|
355
|
-
const commonWords = [
|
|
356
|
-
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
|
357
|
-
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
|
|
358
|
-
'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used',
|
|
359
|
-
'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
|
|
360
|
-
'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
|
|
361
|
-
'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'whose',
|
|
362
|
-
'and', 'but', 'or', 'nor', 'for', 'yet', 'so', 'as', 'if', 'when', 'while',
|
|
363
|
-
'not', 'no', 'yes', 'all', 'any', 'both', 'each', 'every', 'few', 'more', 'most',
|
|
364
|
-
'other', 'some', 'such', 'only', 'own', 'same', 'than', 'too', 'very',
|
|
365
|
-
'good', 'bad', 'great', 'new', 'old', 'high', 'low', 'big', 'small', 'long', 'short',
|
|
366
|
-
'love', 'like', 'hate', 'want', 'need', 'think', 'know', 'feel', 'see', 'hear',
|
|
367
|
-
];
|
|
368
|
-
let id = 5;
|
|
369
|
-
for (const word of commonWords) {
|
|
370
|
-
vocab[word] = id++;
|
|
371
|
-
}
|
|
372
|
-
return new Tokenizer({
|
|
373
|
-
vocabSize: id,
|
|
374
|
-
maxLength: 128,
|
|
375
|
-
padTokenId: 0,
|
|
376
|
-
unkTokenId: 1,
|
|
377
|
-
clsTokenId: 2,
|
|
378
|
-
sepTokenId: 3,
|
|
379
|
-
maskTokenId: 4,
|
|
380
|
-
}, { vocab, model: 'basic' });
|
|
666
|
+
const tokenizer = new Tokenizer();
|
|
667
|
+
return tokenizer;
|
|
381
668
|
}
|
|
382
669
|
/**
|
|
383
670
|
* Load tokenizer from URL
|
|
384
671
|
*/
|
|
385
672
|
export async function loadTokenizer(url) {
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
merges: data.merges,
|
|
394
|
-
model: data.model,
|
|
395
|
-
});
|
|
673
|
+
return Tokenizer.fromUrl(url);
|
|
674
|
+
}
|
|
675
|
+
/**
|
|
676
|
+
* Load tokenizer from HuggingFace Hub
|
|
677
|
+
*/
|
|
678
|
+
export async function loadTokenizerFromHub(modelId, options) {
|
|
679
|
+
return Tokenizer.fromHuggingFace(modelId, options);
|
|
396
680
|
}
|
|
397
681
|
//# sourceMappingURL=tokenizer.js.map
|