bpe-lite 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/bpe.js CHANGED
@@ -6,80 +6,264 @@
6
6
  * Lower rank = higher priority in merges.
7
7
  */
8
8
 
9
- // Byte → base64 token lookup (pre-built at module load)
10
- const BYTE_TOKENS = (() => {
9
+ // Byte → single-byte "binary string" lookup (pre-built at module load)
10
+ const BYTE_STRS = (() => {
11
11
  const out = new Array(256);
12
- for (let i = 0; i < 256; i++) {
13
- out[i] = Buffer.from([i]).toString('base64');
14
- }
12
+ for (let i = 0; i < 256; i++) out[i] = String.fromCharCode(i);
15
13
  return out;
16
14
  })();
17
15
 
18
- /**
19
- * Encode text using tiktoken-style BPE.
20
- * @param {string} text
21
- * @param {Object} vocabData — { engine, pattern, vocab, specialTokens }
22
- * @returns {number[]} token ids
23
- */
24
- function encodeTiktoken(text, vocabData) {
25
- if (!text) return [];
16
+ // opt D — shared UTF-8 encode buffer, reused across calls; writeChunk replaces it with a larger one (grow-only) when a chunk needs more capacity.
17
+ // Sharing is safe because each call site hands writeChunk's result straight to bpeChunk, which reads the buffer synchronously before the next write.
18
+ const _sb = { buf: Buffer.allocUnsafe(4096), cap: 4096 };
26
19
 
27
- const { vocab, specialTokens = {}, pattern } = vocabData;
20
+ function writeChunk(chunk) {
21
+ const maxNeeded = chunk.length * 4; // max 4 UTF-8 bytes per JS char
22
+ if (maxNeeded > _sb.cap) {
23
+ _sb.cap = maxNeeded * 2;
24
+ _sb.buf = Buffer.allocUnsafe(_sb.cap);
25
+ }
26
+ return _sb.buf.write(chunk, 0, 'utf8');
27
+ }
28
28
 
29
- // Build reverse map: rank → id (rank IS the id in tiktoken)
30
- // vocab[base64token] = rank = token_id
29
+ class MinHeap {
30
+ constructor() {
31
+ this.ranks = [];
32
+ this.left = [];
33
+ this.right = [];
34
+ this.verL = [];
35
+ this.verR = [];
36
+ }
31
37
 
32
- // Build special token map (string id)
33
- const specials = Object.entries(specialTokens);
38
+ // opt B — reset to empty in place: truncates the five parallel arrays and keeps reusing the same array objects
39
+ reset() {
40
+ this.ranks.length = 0;
41
+ this.left.length = 0;
42
+ this.right.length = 0;
43
+ this.verL.length = 0;
44
+ this.verR.length = 0;
45
+ }
34
46
 
35
- const ids = [];
47
+ get size() { return this.ranks.length; }
36
48
 
37
- // Split text around special tokens first
38
- const pieces = splitOnSpecials(text, specials);
49
+ push(rank, left, right, verL, verR) {
50
+ const i = this.ranks.length;
51
+ this.ranks.push(rank);
52
+ this.left.push(left);
53
+ this.right.push(right);
54
+ this.verL.push(verL);
55
+ this.verR.push(verR);
56
+ this._siftUp(i);
57
+ }
39
58
 
40
- for (const piece of pieces) {
41
- if (piece.isSpecial) {
42
- ids.push(piece.id);
43
- } else {
44
- // Apply regex pre-tokenization then BPE encode each chunk
45
- const chunks = pretokenize(piece.text, pattern);
46
- for (const chunk of chunks) {
47
- const chunkIds = bpeEncode(chunk, vocab);
48
- for (const id of chunkIds) ids.push(id);
49
- }
59
+ pop() {
60
+ const n = this.ranks.length;
61
+ if (n === 0) return null;
62
+
63
+ const rank = this.ranks[0], left = this.left[0], right = this.right[0];
64
+ const verL = this.verL[0], verR = this.verR[0];
65
+
66
+ const last = n - 1;
67
+ if (last === 0) {
68
+ this.ranks.pop(); this.left.pop(); this.right.pop(); this.verL.pop(); this.verR.pop();
69
+ return { rank, left, right, verL, verR };
50
70
  }
71
+
72
+ this.ranks[0] = this.ranks[last]; this.left[0] = this.left[last];
73
+ this.right[0] = this.right[last]; this.verL[0] = this.verL[last]; this.verR[0] = this.verR[last];
74
+
75
+ this.ranks.pop(); this.left.pop(); this.right.pop(); this.verL.pop(); this.verR.pop();
76
+ this._siftDown(0);
77
+ return { rank, left, right, verL, verR };
51
78
  }
52
79
 
53
- return ids;
80
+ _siftUp(i) {
81
+ while (i > 0) {
82
+ const p = (i - 1) >> 1;
83
+ if (this.ranks[p] <= this.ranks[i]) break;
84
+ this._swap(i, p);
85
+ i = p;
86
+ }
87
+ }
88
+
89
+ _siftDown(i) {
90
+ const n = this.ranks.length;
91
+ while (true) {
92
+ const l = i * 2 + 1;
93
+ if (l >= n) break;
94
+ const r = l + 1;
95
+ let m = l;
96
+ if (r < n && this.ranks[r] < this.ranks[l]) m = r;
97
+ if (this.ranks[i] <= this.ranks[m]) break;
98
+ this._swap(i, m);
99
+ i = m;
100
+ }
101
+ }
102
+
103
+ _swap(i, j) {
104
+ [this.ranks[i], this.ranks[j]] = [this.ranks[j], this.ranks[i]];
105
+ [this.left[i], this.left[j]] = [this.left[j], this.left[i]];
106
+ [this.right[i], this.right[j]] = [this.right[j], this.right[i]];
107
+ [this.verL[i], this.verL[j]] = [this.verL[j], this.verL[i]];
108
+ [this.verR[i], this.verR[j]] = [this.verR[j], this.verR[i]];
109
+ }
54
110
  }
55
111
 
56
- /**
57
- * Decode token ids back to string.
58
- * @param {number[]} ids
59
- * @param {Object} vocabData
60
- * @returns {string}
61
- */
62
- function decodeTiktoken(ids, vocabData) {
63
- if (!ids || ids.length === 0) return '';
112
+ function compilePretokenizer(patternStr) {
113
+ if (!patternStr) return { type: 'none' };
64
114
 
65
- const { vocab, specialTokens = {} } = vocabData;
115
+ const regexStr = patternStr
116
+ .replace(/\(\?i:/g, '(?:')
117
+ .replace(/\\p\{L\}/g, '\\p{L}')
118
+ .replace(/\\p\{N\}/g, '\\p{N}');
66
119
 
67
- // Build id → bytes map
68
- const idToBytes = new Map();
69
- for (const [b64, rank] of Object.entries(vocab)) {
70
- idToBytes.set(rank, Buffer.from(b64, 'base64'));
120
+ try {
121
+ return { type: 'regex', re: new RegExp(regexStr, 'guy') };
122
+ } catch {
123
+ try {
124
+ return { type: 'regex', re: new RegExp(regexStr, 'gi') };
125
+ } catch {
126
+ return { type: 'fallback' };
127
+ }
71
128
  }
72
- for (const [str, id] of Object.entries(specialTokens)) {
73
- idToBytes.set(id, Buffer.from(str, 'utf8'));
129
+ }
130
+
131
+ function pretokenize(text, compiled) {
132
+ if (!text) return [];
133
+ if (!compiled || compiled.type === 'none') return [text];
134
+ if (compiled.type === 'fallback') return text.match(/\S+|\s+/g) || [text];
135
+ return text.match(compiled.re) || [text];
136
+ }
137
+
138
+ function buildPreparedTiktoken(vocabData) {
139
+ const { vocab, specialTokens = {}, pattern } = vocabData;
140
+
141
+ const vocabBin = new Map();
142
+ let maxId = -1;
143
+ for (const [b64, id] of Object.entries(vocab)) {
144
+ const buf = Buffer.from(b64, 'base64');
145
+ vocabBin.set(buf.toString('latin1'), id);
146
+ if (id > maxId) maxId = id;
74
147
  }
75
148
 
76
- const bufs = [];
77
- for (const id of ids) {
78
- const bytes = idToBytes.get(id);
79
- if (bytes) bufs.push(bytes);
149
+ const specials = Object.entries(specialTokens);
150
+ for (const [, id] of specials) {
151
+ if (id > maxId) maxId = id;
80
152
  }
81
153
 
82
- return Buffer.concat(bufs).toString('utf8');
154
+ const idToBytes = new Array(maxId + 1);
155
+ for (const [b64, id] of Object.entries(vocab)) {
156
+ idToBytes[id] = Buffer.from(b64, 'base64');
157
+ }
158
+ for (const [str, id] of specials) {
159
+ idToBytes[id] = Buffer.from(str, 'utf8');
160
+ }
161
+
162
+ return {
163
+ vocabBin,
164
+ idToBytes,
165
+ specials,
166
+ patternCompiled: compilePretokenizer(pattern),
167
+ // opt A — per-instance chunk cache: chunk string → ids[]
168
+ cache: new Map(),
169
+ // opt B — per-instance grow-only scratch (reused across chunks)
170
+ scratch: { str: null, prev: null, next: null, ver: null, alive: null, cap: 0, heap: new MinHeap() },
171
+ };
172
+ }
173
+
174
+ // opt B — grow scratch arrays only when needed
175
+ function ensureScratch(scratch, n) {
176
+ if (n <= scratch.cap) return;
177
+ const cap = n * 2;
178
+ scratch.str = new Array(cap);
179
+ scratch.prev = new Int32Array(cap);
180
+ scratch.next = new Int32Array(cap);
181
+ scratch.ver = new Int32Array(cap);
182
+ scratch.alive = new Uint8Array(cap);
183
+ scratch.cap = cap;
184
+ }
185
+
186
+ // opt E — unified encode for one pre-tokenized chunk (replaces bpeEncode + bpeCount)
187
+ // Precondition: writeChunk(chunk) was just called and returned n.
188
+ // Reads from _sb.buf[0..n-1]. Returns ids[].
189
+ function bpeChunk(n, vocabBin, scratch) {
190
+ const buf = _sb.buf;
191
+
192
+ // opt C — fast path: single byte
193
+ if (n === 1) {
194
+ const id = vocabBin.get(BYTE_STRS[buf[0]]);
195
+ return id === undefined ? [] : [id];
196
+ }
197
+
198
+ // opt C — fast path: two bytes
199
+ if (n === 2) {
200
+ const s0 = BYTE_STRS[buf[0]], s1 = BYTE_STRS[buf[1]];
201
+ const merged = vocabBin.get(s0 + s1);
202
+ if (merged !== undefined) return [merged];
203
+ const r = [];
204
+ const i0 = vocabBin.get(s0); if (i0 !== undefined) r.push(i0);
205
+ const i1 = vocabBin.get(s1); if (i1 !== undefined) r.push(i1);
206
+ return r;
207
+ }
208
+
209
+ // General path — reuse scratch arrays (opt B), reuse heap (opt B)
210
+ ensureScratch(scratch, n);
211
+ const { str, prev, next, ver, alive, heap } = scratch;
212
+ heap.reset();
213
+
214
+ for (let i = 0; i < n; i++) {
215
+ str[i] = BYTE_STRS[buf[i]];
216
+ prev[i] = i - 1;
217
+ next[i] = i + 1;
218
+ ver[i] = 0;
219
+ alive[i] = 1;
220
+ }
221
+ next[n - 1] = -1;
222
+
223
+ for (let i = 0; i < n - 1; i++) {
224
+ const rank = vocabBin.get(str[i] + str[i + 1]);
225
+ if (rank !== undefined) heap.push(rank, i, i + 1, 0, 0);
226
+ }
227
+
228
+ while (heap.size > 0) {
229
+ const top = heap.pop();
230
+ if (!top) break;
231
+ const { left, right, verL, verR } = top;
232
+ if (!alive[left] || !alive[right]) continue;
233
+ if (next[left] !== right) continue;
234
+ if (ver[left] !== verL || ver[right] !== verR) continue;
235
+
236
+ str[left] = str[left] + str[right];
237
+ ver[left]++;
238
+ alive[right] = 0;
239
+ ver[right]++;
240
+
241
+ const nr = next[right];
242
+ next[left] = nr;
243
+ if (nr !== -1) prev[nr] = left;
244
+
245
+ const pl = prev[left];
246
+ if (pl !== -1 && alive[pl]) {
247
+ const rank = vocabBin.get(str[pl] + str[left]);
248
+ if (rank !== undefined) heap.push(rank, pl, left, ver[pl], ver[left]);
249
+ }
250
+ const nl = next[left];
251
+ if (nl !== -1 && alive[nl]) {
252
+ const rank = vocabBin.get(str[left] + str[nl]);
253
+ if (rank !== undefined) heap.push(rank, left, nl, ver[left], ver[nl]);
254
+ }
255
+ }
256
+
257
+ const ids = [];
258
+ let i = 0;
259
+ while (i !== -1) {
260
+ if (alive[i]) {
261
+ const id = vocabBin.get(str[i]);
262
+ if (id !== undefined) ids.push(id);
263
+ }
264
+ i = next[i];
265
+ }
266
+ return ids;
83
267
  }
84
268
 
85
269
  // ─── Internal helpers ─────────────────────────────────────────────────────────
@@ -87,142 +271,197 @@ function decodeTiktoken(ids, vocabData) {
87
271
  function splitOnSpecials(text, specials) {
88
272
  if (specials.length === 0) return [{ text, isSpecial: false }];
89
273
 
90
- // Sort by length descending so longer tokens match first
91
- const sorted = specials.slice().sort((a, b) => b[0].length - a[0].length);
92
-
93
274
  const result = [];
94
275
  let remaining = text;
95
276
 
96
277
  while (remaining.length > 0) {
97
- let found = false;
98
- for (const [str, id] of sorted) {
278
+ let bestIdx = -1, bestStr = null, bestId = null;
279
+
280
+ for (const [str, id] of specials) {
99
281
  const idx = remaining.indexOf(str);
100
- if (idx !== -1) {
101
- if (idx > 0) result.push({ text: remaining.slice(0, idx), isSpecial: false });
102
- result.push({ isSpecial: true, id });
103
- remaining = remaining.slice(idx + str.length);
104
- found = true;
105
- break;
282
+ if (idx === -1) continue;
283
+ if (
284
+ bestIdx === -1 ||
285
+ idx < bestIdx ||
286
+ (idx === bestIdx && bestStr && str.length > bestStr.length)
287
+ ) {
288
+ bestIdx = idx; bestStr = str; bestId = id;
106
289
  }
107
290
  }
108
- if (!found) {
109
- result.push({ text: remaining, isSpecial: false });
110
- break;
111
- }
291
+
292
+ if (bestIdx === -1) { result.push({ text: remaining, isSpecial: false }); break; }
293
+ if (bestIdx > 0) result.push({ text: remaining.slice(0, bestIdx), isSpecial: false });
294
+ result.push({ isSpecial: true, id: bestId });
295
+ remaining = remaining.slice(bestIdx + bestStr.length);
112
296
  }
113
297
 
114
298
  return result;
115
299
  }
116
300
 
117
- function pretokenize(text, patternStr) {
118
- if (!patternStr || !text) return text ? [text] : [];
301
+ // ─── Prepared-object API ──────────────────────────────────────────────────────
119
302
 
120
- // Node 18+ supports Unicode property escapes in RegExp natively
121
- // We need the 'v' or 'u' flag for \p{L}, \p{N}
122
- // The pattern from cl100k_base uses (?i:...) syntax which is not standard JS
123
- // We convert it to use the 'i' flag selectively via alternation
124
- let regexStr = patternStr
125
- .replace(/\(\?i:/g, '(?:') // (?i:...) → (?:...) and we use 'i' flag on the whole regex
126
- .replace(/\\p\{L\}/g, '\\p{L}')
127
- .replace(/\\p\{N\}/g, '\\p{N}');
303
+ function encodeTiktokenPrepared(text, prepared) {
304
+ if (!text) return [];
128
305
 
129
- let re;
130
- try {
131
- re = new RegExp(regexStr, 'guy'); // g=global, u=unicode, y=... actually just 'gu'
132
- } catch {
133
- try {
134
- re = new RegExp(regexStr, 'gi');
135
- } catch {
136
- // Fallback: split on whitespace
137
- return text.match(/\S+|\s+/g) || [text];
306
+ const ids = [];
307
+ const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
308
+ const pieces = splitOnSpecials(text, specials);
309
+
310
+ for (const piece of pieces) {
311
+ if (piece.isSpecial) { ids.push(piece.id); continue; }
312
+ const t = piece.text;
313
+
314
+ if (patternCompiled.type === 'regex') {
315
+ // opt — exec loop avoids materialising the full matches array
316
+ const re = patternCompiled.re;
317
+ re.lastIndex = 0;
318
+ let m;
319
+ while ((m = re.exec(t)) !== null) {
320
+ const chunk = m[0];
321
+ let chunkIds = cache.get(chunk);
322
+ if (chunkIds === undefined) {
323
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
324
+ cache.set(chunk, chunkIds);
325
+ }
326
+ for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
327
+ }
328
+ } else {
329
+ const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
330
+ for (let ci = 0; ci < chunks.length; ci++) {
331
+ const chunk = chunks[ci];
332
+ let chunkIds = cache.get(chunk);
333
+ if (chunkIds === undefined) {
334
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
335
+ cache.set(chunk, chunkIds);
336
+ }
337
+ for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
338
+ }
138
339
  }
139
340
  }
140
341
 
141
- return text.match(re) || [text];
342
+ return ids;
142
343
  }
143
344
 
144
- /**
145
- * BPE encode a single pre-tokenized chunk.
146
- * @param {string} chunk — raw string chunk
147
- * @param {Object} vocab — { base64token: rank }
148
- * @returns {number[]}
149
- */
150
- function bpeEncode(chunk, vocab) {
151
- // Convert chunk to UTF-8 bytes, get initial token ids (one per byte)
152
- const bytes = Buffer.from(chunk, 'utf8');
153
- if (bytes.length === 0) return [];
154
-
155
- // Start with individual bytes as tokens
156
- let tokens = [];
157
- for (let i = 0; i < bytes.length; i++) {
158
- const b64 = BYTE_TOKENS[bytes[i]];
159
- const rank = vocab[b64];
160
- if (rank === undefined) {
161
- // Unknown byte — use a fallback (should not happen with byte-level vocab)
162
- tokens.push({ b64, rank: Infinity });
163
- } else {
164
- tokens.push({ b64, rank });
165
- }
345
+ function decodeTiktokenPrepared(ids, prepared) {
346
+ if (!ids || ids.length === 0) return '';
347
+
348
+ const bufs = [];
349
+ for (let i = 0; i < ids.length; i++) {
350
+ const bytes = prepared.idToBytes[ids[i]];
351
+ if (bytes) bufs.push(bytes);
166
352
  }
353
+ return Buffer.concat(bufs).toString('utf8');
354
+ }
167
355
 
168
- // Greedy merge: repeatedly find the adjacent pair with the lowest max rank
169
- while (tokens.length >= 2) {
170
- let bestRank = Infinity;
171
- let bestIdx = -1;
356
+ function countTiktokenPrepared(text, prepared) {
357
+ if (!text) return 0;
172
358
 
173
- for (let i = 0; i < tokens.length - 1; i++) {
174
- const mergedB64 = mergeB64(tokens[i].b64, tokens[i + 1].b64);
175
- const rank = vocab[mergedB64];
176
- if (rank !== undefined && rank < bestRank) {
177
- bestRank = rank;
178
- bestIdx = i;
359
+ const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
360
+ const pieces = splitOnSpecials(text, specials);
361
+ let count = 0;
362
+
363
+ for (const piece of pieces) {
364
+ if (piece.isSpecial) { count++; continue; }
365
+ const t = piece.text;
366
+
367
+ if (patternCompiled.type === 'regex') {
368
+ const re = patternCompiled.re;
369
+ re.lastIndex = 0;
370
+ let m;
371
+ while ((m = re.exec(t)) !== null) {
372
+ const chunk = m[0];
373
+ let chunkIds = cache.get(chunk);
374
+ if (chunkIds === undefined) {
375
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
376
+ cache.set(chunk, chunkIds);
377
+ }
378
+ count += chunkIds.length;
379
+ }
380
+ } else {
381
+ const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
382
+ for (let ci = 0; ci < chunks.length; ci++) {
383
+ const chunk = chunks[ci];
384
+ let chunkIds = cache.get(chunk);
385
+ if (chunkIds === undefined) {
386
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
387
+ cache.set(chunk, chunkIds);
388
+ }
389
+ count += chunkIds.length;
179
390
  }
180
391
  }
181
-
182
- if (bestIdx === -1) break; // No more merges possible
183
-
184
- const mergedB64 = mergeB64(tokens[bestIdx].b64, tokens[bestIdx + 1].b64);
185
- tokens.splice(bestIdx, 2, { b64: mergedB64, rank: bestRank });
186
392
  }
187
393
 
188
- return tokens.map(t => t.rank);
189
- }
190
-
191
- function mergeB64(a, b) {
192
- // Decode both, concatenate bytes, re-encode
193
- const buf = Buffer.concat([Buffer.from(a, 'base64'), Buffer.from(b, 'base64')]);
194
- return buf.toString('base64');
394
+ return count;
195
395
  }
196
396
 
197
- /**
198
- * Count tokens up to a limit, short-circuiting once exceeded.
199
- * @param {string} text
200
- * @param {Object} vocabData
201
- * @param {number} limit
202
- * @returns {number} token count (may exceed limit by up to one chunk)
203
- */
204
- function countTiktokenUpTo(text, vocabData, limit) {
397
+ function countTiktokenUpToPrepared(text, prepared, limit) {
205
398
  if (!text) return 0;
206
399
 
207
- const { vocab, specialTokens = {}, pattern } = vocabData;
208
- const specials = Object.entries(specialTokens);
400
+ const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
209
401
  const pieces = splitOnSpecials(text, specials);
210
-
211
402
  let count = 0;
212
- for (const piece of pieces) {
403
+
404
+ outer: for (const piece of pieces) {
213
405
  if (piece.isSpecial) {
214
- count++;
406
+ if (++count > limit) break;
407
+ continue;
408
+ }
409
+ const t = piece.text;
410
+
411
+ if (patternCompiled.type === 'regex') {
412
+ const re = patternCompiled.re;
413
+ re.lastIndex = 0;
414
+ let m;
415
+ while ((m = re.exec(t)) !== null) {
416
+ const chunk = m[0];
417
+ let chunkIds = cache.get(chunk);
418
+ if (chunkIds === undefined) {
419
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
420
+ cache.set(chunk, chunkIds);
421
+ }
422
+ count += chunkIds.length;
423
+ if (count > limit) break outer;
424
+ }
215
425
  } else {
216
- const chunks = pretokenize(piece.text, pattern);
217
- for (const chunk of chunks) {
218
- count += bpeEncode(chunk, vocab).length;
219
- if (count > limit) return count;
426
+ const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
427
+ for (let ci = 0; ci < chunks.length; ci++) {
428
+ const chunk = chunks[ci];
429
+ let chunkIds = cache.get(chunk);
430
+ if (chunkIds === undefined) {
431
+ chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
432
+ cache.set(chunk, chunkIds);
433
+ }
434
+ count += chunkIds.length;
435
+ if (count > limit) break outer;
220
436
  }
221
437
  }
222
- if (count > limit) return count;
223
438
  }
224
439
 
225
440
  return count;
226
441
  }
227
442
 
228
- module.exports = { encodeTiktoken, decodeTiktoken, countTiktokenUpTo };
443
+ // ─── Standalone wrappers (build prepared fresh each call — used by tests / direct API) ──
444
+
445
+ function encodeTiktoken(text, vocabData) {
446
+ return encodeTiktokenPrepared(text, buildPreparedTiktoken(vocabData));
447
+ }
448
+
449
+ function decodeTiktoken(ids, vocabData) {
450
+ return decodeTiktokenPrepared(ids, buildPreparedTiktoken(vocabData));
451
+ }
452
+
453
+ function countTiktokenUpTo(text, vocabData, limit) {
454
+ return countTiktokenUpToPrepared(text, buildPreparedTiktoken(vocabData), limit);
455
+ }
456
+
457
+ module.exports = {
458
+ MinHeap,
459
+ encodeTiktoken,
460
+ decodeTiktoken,
461
+ countTiktokenUpTo,
462
+ buildPreparedTiktoken,
463
+ encodeTiktokenPrepared,
464
+ decodeTiktokenPrepared,
465
+ countTiktokenPrepared,
466
+ countTiktokenUpToPrepared,
467
+ };