bpe-lite 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -2
- package/package.json +17 -3
- package/src/bpe.js +389 -150
- package/src/spm.js +173 -55
- package/src/tokenizer.js +22 -7
- package/vocabs/anthropic.json.gz +0 -0
- package/vocabs/openai.json.gz +0 -0
- package/vocabs/anthropic.json +0 -1
- package/vocabs/openai.json +0 -1
package/src/bpe.js
CHANGED
|
@@ -6,80 +6,264 @@
|
|
|
6
6
|
* Lower rank = higher priority in merges.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
// Byte →
|
|
10
|
-
const
|
|
9
|
+
// Byte → one-character "binary string" lookup, built once at module load.
// Entry i is the string whose single char code is i (0–255).
const BYTE_STRS = Array.from({ length: 256 }, (_, byte) => String.fromCharCode(byte));
|
|
17
15
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
* @param {Object} vocabData — { engine, pattern, vocab, specialTokens }
|
|
22
|
-
* @returns {number[]} token ids
|
|
23
|
-
*/
|
|
24
|
-
function encodeTiktoken(text, vocabData) {
|
|
25
|
-
if (!text) return [];
|
|
16
|
+
// opt D — shared UTF-8 encode buffer, reused by every call to writeChunk().
// All encode paths in this module are synchronous, so a single module-level
// buffer is never written by two encodes at the same time.
const _sb = { buf: Buffer.allocUnsafe(4096), cap: 4096 };

/**
 * Encode `chunk` as UTF-8 into the shared buffer `_sb.buf`, starting at offset 0.
 *
 * The buffer is grown (never shrunk) when `chunk.length * 4` exceeds the
 * current capacity; 4 bytes per JS string unit is a generous upper bound, so
 * the write below can never be truncated for lack of room.
 *
 * @param {string} chunk text to encode
 * @returns {number} number of bytes written; they occupy `_sb.buf[0..n-1]`
 */
function writeChunk(chunk) {
  const worstCase = chunk.length * 4;
  if (worstCase > _sb.cap) {
    const grownCap = worstCase * 2;
    // Allocate first and publish the new capacity only afterwards: if the
    // allocation throws, `cap` must keep describing the buffer we really have,
    // otherwise later calls would skip the growth and write a truncated chunk.
    _sb.buf = Buffer.allocUnsafe(grownCap);
    _sb.cap = grownCap;
  }
  return _sb.buf.write(chunk, 0, 'utf8');
}
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
/**
 * Binary min-heap of merge candidates, stored as five parallel arrays
 * (structure-of-arrays) instead of one array of objects.
 *
 * Each entry is (rank, left, right, verL, verR). Entries are ordered by
 * `rank` ascending; entries with equal rank are ordered by `left` ascending,
 * so that among equally ranked candidates the leftmost one is popped first.
 * Reference BPE implementations pick the first (leftmost) minimum-rank pair,
 * and ordering on rank alone would return ties in arbitrary heap order.
 */
class MinHeap {
  constructor() {
    this.ranks = [];
    this.left = [];
    this.right = [];
    this.verL = [];
    this.verR = [];
  }

  // opt B — reset to empty without de-allocating internal arrays
  reset() {
    this.ranks.length = 0;
    this.left.length = 0;
    this.right.length = 0;
    this.verL.length = 0;
    this.verR.length = 0;
  }

  /** Number of entries currently stored. */
  get size() {
    return this.ranks.length;
  }

  /**
   * Insert one entry.
   * @param {number} rank  ordering key (lower pops first)
   * @param {number} left  tie-break key and payload
   * @param {number} right payload
   * @param {number} verL  payload
   * @param {number} verR  payload
   */
  push(rank, left, right, verL, verR) {
    const slot = this.ranks.length;
    this.ranks.push(rank);
    this.left.push(left);
    this.right.push(right);
    this.verL.push(verL);
    this.verR.push(verR);
    this._siftUp(slot);
  }

  /**
   * Remove and return the smallest entry.
   * @returns {{rank:number,left:number,right:number,verL:number,verR:number}|null}
   *   the entry, or null when the heap is empty
   */
  pop() {
    const count = this.ranks.length;
    if (count === 0) return null;

    const top = {
      rank: this.ranks[0],
      left: this.left[0],
      right: this.right[0],
      verL: this.verL[0],
      verR: this.verR[0],
    };

    // Detach the last entry; unless it was the root itself, move it to the
    // root and restore the heap property downwards.
    const tailRank = this.ranks.pop();
    const tailLeft = this.left.pop();
    const tailRight = this.right.pop();
    const tailVerL = this.verL.pop();
    const tailVerR = this.verR.pop();
    if (count > 1) {
      this.ranks[0] = tailRank;
      this.left[0] = tailLeft;
      this.right[0] = tailRight;
      this.verL[0] = tailVerL;
      this.verR[0] = tailVerR;
      this._siftDown(0);
    }
    return top;
  }

  /** True when entry `a` must be popped before entry `b`. */
  _less(a, b) {
    const ra = this.ranks[a];
    const rb = this.ranks[b];
    if (ra !== rb) return ra < rb;
    return this.left[a] < this.left[b];
  }

  _siftUp(i) {
    while (i > 0) {
      const parent = (i - 1) >> 1;
      if (!this._less(i, parent)) break;
      this._swap(i, parent);
      i = parent;
    }
  }

  _siftDown(i) {
    const count = this.ranks.length;
    for (;;) {
      const leftChild = i * 2 + 1;
      if (leftChild >= count) break;
      const rightChild = leftChild + 1;
      let smallest = leftChild;
      if (rightChild < count && this._less(rightChild, leftChild)) smallest = rightChild;
      if (!this._less(smallest, i)) break;
      this._swap(i, smallest);
      i = smallest;
    }
  }

  _swap(i, j) {
    [this.ranks[i], this.ranks[j]] = [this.ranks[j], this.ranks[i]];
    [this.left[i], this.left[j]] = [this.left[j], this.left[i]];
    [this.right[i], this.right[j]] = [this.right[j], this.right[i]];
    [this.verL[i], this.verL[j]] = [this.verL[j], this.verL[i]];
    [this.verR[i], this.verR[j]] = [this.verR[j], this.verR[i]];
  }
}
|
|
55
111
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
* @param {number[]} ids
|
|
59
|
-
* @param {Object} vocabData
|
|
60
|
-
* @returns {string}
|
|
61
|
-
*/
|
|
62
|
-
function decodeTiktoken(ids, vocabData) {
|
|
63
|
-
if (!ids || ids.length === 0) return '';
|
|
112
|
+
/**
 * Compile a vocab's pre-tokenization pattern into a descriptor.
 *
 * Inline case-insensitive groups `(?i:` are rewritten to plain non-capturing
 * groups `(?:` before compiling. Compilation is then attempted twice:
 *   1. flags 'guy' (global, unicode, sticky) — no `i` flag, so the rewritten
 *      groups match case-sensitively;
 *   2. flags 'gi' (global, ignore-case) — without the unicode flag.
 * If both attempts throw, the caller is told to use its whitespace fallback.
 *
 * @param {string} [patternStr] regex source from the vocab data
 * @returns {{type:'none'}|{type:'regex', re:RegExp}|{type:'fallback'}}
 */
function compilePretokenizer(patternStr) {
  if (!patternStr) return { type: 'none' };

  const regexStr = patternStr.replace(/\(\?i:/g, '(?:');

  for (const flags of ['guy', 'gi']) {
    try {
      return { type: 'regex', re: new RegExp(regexStr, flags) };
    } catch {
      // Deliberate best-effort: this flag set rejected the pattern, try the next.
    }
  }
  return { type: 'fallback' };
}
|
|
130
|
+
|
|
131
|
+
/**
 * Split `text` into chunks according to a compiled pre-tokenizer descriptor.
 *
 * @param {string} text
 * @param {{type:string, re?:RegExp}|null|undefined} compiled
 * @returns {string[]} empty for empty text; `[text]` when there is no pattern
 *   or when the pattern matches nothing
 */
function pretokenize(text, compiled) {
  if (!text) return [];

  const kind = compiled ? compiled.type : 'none';
  if (kind === 'none') return [text];

  // 'fallback' splits into runs of non-whitespace / whitespace.
  const splitter = kind === 'fallback' ? /\S+|\s+/g : compiled.re;
  const found = text.match(splitter);
  return found || [text];
}
|
|
137
|
+
|
|
138
|
+
/**
 * Build the reusable "prepared" tokenizer state from raw vocab data.
 *
 * @param {Object} vocabData
 * @param {Object<string, number>} vocabData.vocab base64-encoded token bytes → id
 * @param {Object<string, number>} [vocabData.specialTokens] special token text → id
 * @param {string} [vocabData.pattern] pre-tokenization regex source
 * @returns {Object} prepared state:
 *   - vocabBin: Map from token bytes (as a latin1 "binary string") to id
 *   - idToBytes: array indexed by id holding each token's bytes as a Buffer
 *     (special tokens are stored as their UTF-8 bytes)
 *   - specials: [text, id] pairs of the special tokens
 *   - patternCompiled: result of compilePretokenizer(pattern)
 *   - cache: chunk string → ids[] (starts empty)
 *   - scratch: grow-only working arrays plus a MinHeap, reused across chunks
 */
function buildPreparedTiktoken(vocabData) {
  const { vocab, specialTokens = {}, pattern } = vocabData;

  const vocabEntries = Object.entries(vocab);
  const specials = Object.entries(specialTokens);

  // Decode every vocab entry exactly once; the Buffer is used both for the
  // forward map key and for the reverse (id → bytes) table.
  const vocabBin = new Map();
  const decoded = new Array(vocabEntries.length);
  let maxId = -1;
  for (let i = 0; i < vocabEntries.length; i++) {
    const [b64, id] = vocabEntries[i];
    const bytes = Buffer.from(b64, 'base64');
    decoded[i] = bytes;
    vocabBin.set(bytes.toString('latin1'), id);
    if (id > maxId) maxId = id;
  }
  for (const [, id] of specials) {
    if (id > maxId) maxId = id;
  }

  // Sized up front so the table covers every regular and special id.
  const idToBytes = new Array(maxId + 1);
  for (let i = 0; i < vocabEntries.length; i++) {
    idToBytes[vocabEntries[i][1]] = decoded[i];
  }
  // Specials are written last, so they win if an id is shared with the vocab.
  for (const [str, id] of specials) {
    idToBytes[id] = Buffer.from(str, 'utf8');
  }

  return {
    vocabBin,
    idToBytes,
    specials,
    patternCompiled: compilePretokenizer(pattern),
    // opt A — per-instance chunk cache: chunk string → ids[]
    cache: new Map(),
    // opt B — per-instance grow-only scratch (reused across chunks)
    scratch: { str: null, prev: null, next: null, ver: null, alive: null, cap: 0, heap: new MinHeap() },
  };
}
|
|
173
|
+
|
|
174
|
+
// opt B — grow scratch arrays only when needed
/**
 * Make sure `scratch` can hold at least `n` entries.
 *
 * When the current capacity is too small, all five working arrays are replaced
 * with fresh ones of capacity `2 * n` (previous contents are not copied);
 * otherwise nothing is touched.
 *
 * @param {Object} scratch object with str/prev/next/ver/alive/cap fields
 * @param {number} n required number of entries
 */
function ensureScratch(scratch, n) {
  if (scratch.cap >= n) return;

  const size = n * 2;
  Object.assign(scratch, {
    str: new Array(size),
    prev: new Int32Array(size),
    next: new Int32Array(size),
    ver: new Int32Array(size),
    alive: new Uint8Array(size),
    cap: size,
  });
}
|
|
185
|
+
|
|
186
|
+
// opt E — unified encode for one pre-tokenized chunk (replaces bpeEncode + bpeCount)
/**
 * Byte-pair-encode the chunk whose UTF-8 bytes are in `_sb.buf[0..n-1]`.
 *
 * Precondition: writeChunk(chunk) was just called and returned `n`.
 *
 * The bytes form a doubly linked list of tokens (prev/next indices, with -1 as
 * the end marker). Every adjacent pair whose concatenation is in `vocabBin` is
 * pushed on the heap with its rank; candidates are popped in heap order and
 * applied if still valid. A candidate is stale — and skipped — when either
 * side is dead, the two sides are no longer adjacent, or either side's version
 * counter has changed since the candidate was pushed.
 *
 * Tokens whose bytes are not in `vocabBin` are dropped from the output.
 *
 * @param {number} n number of valid bytes in `_sb.buf`
 * @param {Map<string, number>} vocabBin token bytes (binary string) → id/rank
 * @param {Object} scratch reusable working arrays and heap (see ensureScratch)
 * @returns {number[]} token ids for the chunk
 */
function bpeChunk(n, vocabBin, scratch) {
  // Nothing to encode. Without this guard the general path would index
  // position n - 1 === -1 and then walk whatever the previous chunk left
  // behind in the scratch arrays.
  if (n <= 0) return [];

  const buf = _sb.buf;

  // opt C — fast path: single byte
  if (n === 1) {
    const id = vocabBin.get(BYTE_STRS[buf[0]]);
    return id === undefined ? [] : [id];
  }

  // opt C — fast path: two bytes (one merge at most)
  if (n === 2) {
    const first = BYTE_STRS[buf[0]];
    const second = BYTE_STRS[buf[1]];
    const pairId = vocabBin.get(first + second);
    if (pairId !== undefined) return [pairId];

    const out = [];
    const firstId = vocabBin.get(first);
    if (firstId !== undefined) out.push(firstId);
    const secondId = vocabBin.get(second);
    if (secondId !== undefined) out.push(secondId);
    return out;
  }

  // General path — reuse scratch arrays (opt B), reuse heap (opt B)
  ensureScratch(scratch, n);
  const { str, prev, next, ver, alive, heap } = scratch;
  heap.reset();

  // One live node per byte, linked to its neighbours.
  for (let i = 0; i < n; i++) {
    str[i] = BYTE_STRS[buf[i]];
    prev[i] = i - 1; // node 0 gets -1: no predecessor
    next[i] = i + 1;
    ver[i] = 0;
    alive[i] = 1;
  }
  next[n - 1] = -1;

  // Seed the heap with every mergeable adjacent pair.
  for (let i = 0; i < n - 1; i++) {
    const rank = vocabBin.get(str[i] + str[i + 1]);
    if (rank !== undefined) heap.push(rank, i, i + 1, 0, 0);
  }

  while (heap.size > 0) {
    const { left, right, verL, verR } = heap.pop();

    // Skip stale candidates.
    if (!alive[left] || !alive[right]) continue;
    if (next[left] !== right) continue;
    if (ver[left] !== verL || ver[right] !== verR) continue;

    // Merge `right` into `left`; bumping both versions invalidates every
    // candidate still on the heap that refers to either node's old state.
    str[left] = str[left] + str[right];
    ver[left]++;
    alive[right] = 0;
    ver[right]++;

    // Unlink `right` from the list.
    const afterRight = next[right];
    next[left] = afterRight;
    if (afterRight !== -1) prev[afterRight] = left;

    // The merged node has new neighbours-pairs on both sides; queue them.
    const before = prev[left];
    if (before !== -1 && alive[before]) {
      const rank = vocabBin.get(str[before] + str[left]);
      if (rank !== undefined) heap.push(rank, before, left, ver[before], ver[left]);
    }
    const after = next[left];
    if (after !== -1 && alive[after]) {
      const rank = vocabBin.get(str[left] + str[after]);
      if (rank !== undefined) heap.push(rank, left, after, ver[left], ver[after]);
    }
  }

  // Walk the surviving nodes left to right and translate them to ids.
  const ids = [];
  for (let i = 0; i !== -1; i = next[i]) {
    if (!alive[i]) continue;
    const id = vocabBin.get(str[i]);
    if (id !== undefined) ids.push(id);
  }
  return ids;
}
|
|
84
268
|
|
|
85
269
|
// ─── Internal helpers ─────────────────────────────────────────────────────────
|
|
@@ -87,142 +271,197 @@ function decodeTiktoken(ids, vocabData) {
|
|
|
87
271
|
/**
 * Split `text` into ordinary-text pieces and special-token pieces.
 *
 * At each step the special token occurring earliest in the remaining text is
 * taken; when several start at the same position the longest one wins.
 * Empty special-token strings are ignored: they would match at every position
 * without consuming any text, so the scan could never advance.
 *
 * @param {string} text
 * @param {Array<[string, number]>} specials [tokenText, id] pairs
 * @returns {Array<{text:string,isSpecial:false}|{isSpecial:true,id:number}>}
 *   pieces in document order; text pieces are never empty unless `specials`
 *   is empty, in which case the whole text is returned as one piece
 */
function splitOnSpecials(text, specials) {
  if (specials.length === 0) return [{ text, isSpecial: false }];

  const result = [];
  let pos = 0;

  while (pos < text.length) {
    let bestIdx = -1;
    let bestStr = '';
    let bestId = null;

    for (const [str, id] of specials) {
      if (str.length === 0) continue;
      const idx = text.indexOf(str, pos);
      if (idx === -1) continue;
      const isEarlier = bestIdx === -1 || idx < bestIdx;
      const isLongerAtSameSpot = idx === bestIdx && str.length > bestStr.length;
      if (isEarlier || isLongerAtSameSpot) {
        bestIdx = idx;
        bestStr = str;
        bestId = id;
      }
    }

    if (bestIdx === -1) {
      // No special token left: the rest is plain text.
      result.push({ text: text.slice(pos), isSpecial: false });
      break;
    }
    if (bestIdx > pos) {
      result.push({ text: text.slice(pos, bestIdx), isSpecial: false });
    }
    result.push({ isSpecial: true, id: bestId });
    pos = bestIdx + bestStr.length;
  }

  return result;
}
|
|
116
300
|
|
|
117
|
-
|
|
118
|
-
if (!patternStr || !text) return text ? [text] : [];
|
|
301
|
+
// ─── Prepared-object API ──────────────────────────────────────────────────────
|
|
119
302
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
// The pattern from cl100k_base uses (?i:...) syntax which is not standard JS
|
|
123
|
-
// We convert it to use the 'i' flag selectively via alternation
|
|
124
|
-
let regexStr = patternStr
|
|
125
|
-
.replace(/\(\?i:/g, '(?:') // (?i:...) → (?:...) and we use 'i' flag on the whole regex
|
|
126
|
-
.replace(/\\p\{L\}/g, '\\p{L}')
|
|
127
|
-
.replace(/\\p\{N\}/g, '\\p{N}');
|
|
303
|
+
/**
 * Encode `text` to token ids using a prepared tokenizer state.
 *
 * The text is first split on special tokens; each ordinary piece is split into
 * chunks by the compiled pattern (or by the whitespace fallback, or kept whole
 * when there is no pattern) and each chunk is BPE-encoded. Per-chunk results
 * are memoised in `prepared.cache`.
 *
 * @param {string} text
 * @param {Object} prepared state from buildPreparedTiktoken()
 * @returns {number[]} token ids
 */
function encodeTiktokenPrepared(text, prepared) {
  if (!text) return [];

  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
  const ids = [];

  // Encode one chunk (through the cache) and append its ids.
  const appendChunk = (chunk) => {
    let chunkIds = cache.get(chunk);
    if (chunkIds === undefined) {
      chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
      cache.set(chunk, chunkIds);
    }
    for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
  };

  for (const piece of splitOnSpecials(text, specials)) {
    if (piece.isSpecial) {
      ids.push(piece.id);
      continue;
    }
    const t = piece.text;

    if (patternCompiled.type === 'regex') {
      // opt — exec loop avoids materialising the full matches array.
      // The regex object is shared, so its cursor is rewound before each piece.
      const re = patternCompiled.re;
      re.lastIndex = 0;
      let m;
      while ((m = re.exec(t)) !== null) {
        const chunk = m[0];
        if (chunk.length === 0) {
          // A zero-length match leaves lastIndex where it was; step over one
          // code point by hand, otherwise exec() would match here forever.
          re.lastIndex += re.unicode && t.codePointAt(re.lastIndex) > 0xffff ? 2 : 1;
          continue;
        }
        appendChunk(chunk);
      }
    } else {
      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
      for (let ci = 0; ci < chunks.length; ci++) appendChunk(chunks[ci]);
    }
  }

  return ids;
}
|
|
143
344
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
// Convert chunk to UTF-8 bytes, get initial token ids (one per byte)
|
|
152
|
-
const bytes = Buffer.from(chunk, 'utf8');
|
|
153
|
-
if (bytes.length === 0) return [];
|
|
154
|
-
|
|
155
|
-
// Start with individual bytes as tokens
|
|
156
|
-
let tokens = [];
|
|
157
|
-
for (let i = 0; i < bytes.length; i++) {
|
|
158
|
-
const b64 = BYTE_TOKENS[bytes[i]];
|
|
159
|
-
const rank = vocab[b64];
|
|
160
|
-
if (rank === undefined) {
|
|
161
|
-
// Unknown byte — use a fallback (should not happen with byte-level vocab)
|
|
162
|
-
tokens.push({ b64, rank: Infinity });
|
|
163
|
-
} else {
|
|
164
|
-
tokens.push({ b64, rank });
|
|
165
|
-
}
|
|
345
|
+
/**
 * Decode token ids back to text using a prepared tokenizer state.
 *
 * Each id is looked up in `prepared.idToBytes`; ids with no entry are skipped.
 * The collected bytes are concatenated and decoded as UTF-8 in one step.
 *
 * @param {number[]} ids
 * @param {Object} prepared state holding the `idToBytes` table
 * @returns {string} decoded text ('' for missing or empty `ids`)
 */
function decodeTiktokenPrepared(ids, prepared) {
  const total = ids ? ids.length : 0;
  if (total === 0) return '';

  const parts = [];
  let at = 0;
  while (at < ids.length) {
    const tokenBytes = prepared.idToBytes[ids[at]];
    if (tokenBytes) parts.push(tokenBytes);
    at += 1;
  }

  const joined = Buffer.concat(parts);
  return joined.toString('utf8');
}
|
|
167
355
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
let bestRank = Infinity;
|
|
171
|
-
let bestIdx = -1;
|
|
356
|
+
/**
 * Count the tokens `text` encodes to, using a prepared tokenizer state.
 *
 * Follows the same splitting and chunk caching as encoding, but only adds up
 * the per-chunk id counts; each special token counts as one.
 *
 * @param {string} text
 * @param {Object} prepared state from buildPreparedTiktoken()
 * @returns {number} token count (0 for empty text)
 */
function countTiktokenPrepared(text, prepared) {
  if (!text) return 0;

  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;

  // Number of ids a chunk encodes to, going through the cache.
  const chunkLength = (chunk) => {
    let chunkIds = cache.get(chunk);
    if (chunkIds === undefined) {
      chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
      cache.set(chunk, chunkIds);
    }
    return chunkIds.length;
  };

  let count = 0;
  for (const piece of splitOnSpecials(text, specials)) {
    if (piece.isSpecial) {
      count++;
      continue;
    }
    const t = piece.text;

    if (patternCompiled.type === 'regex') {
      // The regex object is shared, so its cursor is rewound before each piece.
      const re = patternCompiled.re;
      re.lastIndex = 0;
      let m;
      while ((m = re.exec(t)) !== null) {
        const chunk = m[0];
        if (chunk.length === 0) {
          // A zero-length match leaves lastIndex where it was; step over one
          // code point by hand, otherwise exec() would match here forever.
          re.lastIndex += re.unicode && t.codePointAt(re.lastIndex) > 0xffff ? 2 : 1;
          continue;
        }
        count += chunkLength(chunk);
      }
    } else {
      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
      for (let ci = 0; ci < chunks.length; ci++) count += chunkLength(chunks[ci]);
    }
  }

  return count;
}
|
|
196
396
|
|
|
197
|
-
|
|
198
|
-
* Count tokens up to a limit, short-circuiting once exceeded.
|
|
199
|
-
* @param {string} text
|
|
200
|
-
* @param {Object} vocabData
|
|
201
|
-
* @param {number} limit
|
|
202
|
-
* @returns {number} token count (may exceed limit by up to one chunk)
|
|
203
|
-
*/
|
|
204
|
-
function countTiktokenUpTo(text, vocabData, limit) {
|
|
397
|
+
/**
 * Count tokens, stopping as soon as the running count exceeds `limit`.
 *
 * The check runs after each special token and after each whole chunk, so the
 * returned value may exceed `limit` by up to one chunk's worth of tokens.
 *
 * @param {string} text
 * @param {Object} prepared state from buildPreparedTiktoken()
 * @param {number} limit threshold at which counting stops
 * @returns {number} the full count when it is <= limit, otherwise the first
 *   running count that exceeded it
 */
function countTiktokenUpToPrepared(text, prepared, limit) {
  if (!text) return 0;

  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;

  // Number of ids a chunk encodes to, going through the cache.
  const chunkLength = (chunk) => {
    let chunkIds = cache.get(chunk);
    if (chunkIds === undefined) {
      chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
      cache.set(chunk, chunkIds);
    }
    return chunkIds.length;
  };

  let count = 0;
  for (const piece of splitOnSpecials(text, specials)) {
    if (piece.isSpecial) {
      count++;
      if (count > limit) return count;
      continue;
    }
    const t = piece.text;

    if (patternCompiled.type === 'regex') {
      // The regex object is shared, so its cursor is rewound before each piece.
      const re = patternCompiled.re;
      re.lastIndex = 0;
      let m;
      while ((m = re.exec(t)) !== null) {
        const chunk = m[0];
        if (chunk.length === 0) {
          // A zero-length match leaves lastIndex where it was; step over one
          // code point by hand, otherwise exec() would match here forever.
          re.lastIndex += re.unicode && t.codePointAt(re.lastIndex) > 0xffff ? 2 : 1;
          continue;
        }
        count += chunkLength(chunk);
        if (count > limit) return count;
      }
    } else {
      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
      for (let ci = 0; ci < chunks.length; ci++) {
        count += chunkLength(chunks[ci]);
        if (count > limit) return count;
      }
    }
  }

  return count;
}
|
|
227
442
|
|
|
228
|
-
|
|
443
|
+
// ─── Standalone wrappers (build prepared fresh each call — used by tests / direct API) ──
|
|
444
|
+
|
|
445
|
+
/**
 * Encode `text` with a prepared state built from `vocabData` for this call only.
 * @param {string} text
 * @param {Object} vocabData raw vocab data accepted by buildPreparedTiktoken()
 * @returns {number[]} token ids
 */
function encodeTiktoken(text, vocabData) {
  const prepared = buildPreparedTiktoken(vocabData);
  return encodeTiktokenPrepared(text, prepared);
}
|
|
448
|
+
|
|
449
|
+
/**
 * Decode `ids` with a prepared state built from `vocabData` for this call only.
 * @param {number[]} ids
 * @param {Object} vocabData raw vocab data accepted by buildPreparedTiktoken()
 * @returns {string} decoded text
 */
function decodeTiktoken(ids, vocabData) {
  const prepared = buildPreparedTiktoken(vocabData);
  return decodeTiktokenPrepared(ids, prepared);
}
|
|
452
|
+
|
|
453
|
+
/**
 * Count tokens up to `limit` with a prepared state built from `vocabData`
 * for this call only.
 * @param {string} text
 * @param {Object} vocabData raw vocab data accepted by buildPreparedTiktoken()
 * @param {number} limit threshold at which counting stops
 * @returns {number} token count (see countTiktokenUpToPrepared)
 */
function countTiktokenUpTo(text, vocabData, limit) {
  const prepared = buildPreparedTiktoken(vocabData);
  return countTiktokenUpToPrepared(text, prepared, limit);
}
|
|
456
|
+
|
|
457
|
+
module.exports = {
|
|
458
|
+
MinHeap,
|
|
459
|
+
encodeTiktoken,
|
|
460
|
+
decodeTiktoken,
|
|
461
|
+
countTiktokenUpTo,
|
|
462
|
+
buildPreparedTiktoken,
|
|
463
|
+
encodeTiktokenPrepared,
|
|
464
|
+
decodeTiktokenPrepared,
|
|
465
|
+
countTiktokenPrepared,
|
|
466
|
+
countTiktokenUpToPrepared,
|
|
467
|
+
};
|