bpe-lite 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -32
- package/package.json +1 -1
- package/src/bpe.js +73 -14
- package/src/spm.js +11 -29
- package/vocabs/anthropic.json.gz +0 -0
package/README.md
CHANGED
|
@@ -30,7 +30,6 @@ import { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, a
|
|
|
30
30
|
```
|
|
31
31
|
|
|
32
32
|
```js
|
|
33
|
-
|
|
34
33
|
// Count tokens
|
|
35
34
|
countTokens('Your text here', 'openai-o200k'); // → number (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
|
|
36
35
|
countTokens('Your text here', 'openai'); // → number (GPT-4, GPT-3.5)
|
|
@@ -53,44 +52,92 @@ tok.count('Hello, world!'); // → 4
|
|
|
53
52
|
|
|
54
53
|
## Providers
|
|
55
54
|
|
|
56
|
-
| Provider | Vocab |
|
|
57
|
-
|
|
58
|
-
| `openai-o200k` | o200k_base
|
|
59
|
-
| `openai` | cl100k_base
|
|
60
|
-
| `anthropic` | Xenova/claude-tokenizer
|
|
61
|
-
| `gemini` | Gemma 3
|
|
55
|
+
| Provider | Vocab | Models | Accuracy |
|
|
56
|
+
|----------|-------|--------|----------|
|
|
57
|
+
| `openai-o200k` | o200k_base (200k) | GPT-4o, o1, o3, o4, GPT-4.1, GPT-5 | Exact — vocab sourced from OpenAI |
|
|
58
|
+
| `openai` | cl100k_base (100k) | GPT-4, GPT-3.5 | Exact — vocab sourced from OpenAI |
|
|
59
|
+
| `anthropic` | Xenova/claude-tokenizer (65k BPE) | Claude | See accuracy section below |
|
|
60
|
+
| `gemini` | Gemma 3 SPM (262k) | Gemini | See accuracy section below |
|
|
62
61
|
|
|
63
62
|
Vocab files are bundled in the package — no network required at runtime or install time.
|
|
64
63
|
|
|
64
|
+
## Accuracy — Anthropic
|
|
65
|
+
|
|
66
|
+
Anthropic has not released the Claude tokenizer. bpe-lite uses [`Xenova/claude-tokenizer`](https://huggingface.co/Xenova/claude-tokenizer), a community reverse-engineering of the ~65k BPE vocabulary, with hand-tuned byte-level corrections applied on top.
|
|
67
|
+
|
|
68
|
+
### Benchmark methodology
|
|
69
|
+
|
|
70
|
+
Tested against `claude-haiku-4-5-20251001` via the Anthropic `count_tokens` API on a 120-sample stratified corpus across 12 categories. The corpus deliberately over-represents difficult content (Arabic, symbols, emoji, numbers) to expose systematic failures — overall numbers are lower than you would see on typical prose-only workloads by design.
|
|
71
|
+
|
|
72
|
+
### Overall results (120 samples, 114 eligible)
|
|
73
|
+
|
|
74
|
+
| Metric | bpe-lite | ai-tokenizer |
|
|
75
|
+
|--------|----------|--------------|
|
|
76
|
+
| Within 5% | 46.5% | 18.4% |
|
|
77
|
+
| Within 10% | 62.3% ±8.8% CI | 37.7% ±8.8% CI |
|
|
78
|
+
| Mean abs error | 9.4% | 16.0% |
|
|
79
|
+
| Median abs error | 5.7% | 13.6% |
|
|
80
|
+
| Max abs error | 42.9% | 82.6% |
|
|
81
|
+
|
|
82
|
+
### Per-category breakdown
|
|
83
|
+
|
|
84
|
+
| Category | Within 10% | Mean error | Notes |
|
|
85
|
+
|----------|-----------|------------|-------|
|
|
86
|
+
| `code-js` | 100% | 4.2% | |
|
|
87
|
+
| `english-prose` | 90% | 5.5% | |
|
|
88
|
+
| `code-python` | 90% | 4.8% | |
|
|
89
|
+
| `structured` | 90% | 3.6% | JSON, HTML, XML, Markdown, SQL |
|
|
90
|
+
| `numbers` | 80% | 7.3% | |
|
|
91
|
+
| `hex-binary` | 80% | 5.3% | |
|
|
92
|
+
| `urls` | 80% | 3.6% | |
|
|
93
|
+
| `cjk` | 40% | 8.8% | |
|
|
94
|
+
| `short` | 30% | 6.8% | |
|
|
95
|
+
| `emoji` | 20% | 17.7% | ZWJ sequences, flags, skin tones |
|
|
96
|
+
| `symbols` | 10% | 17.6% | Cross-byte merges unreplicable |
|
|
97
|
+
| `arabic` | 0% | 26.1% | Structural vocabulary gap — unfixable |
|
|
98
|
+
|
|
99
|
+
For prose, code, structured data, and URLs — the dominant content types in real-world prompts — bpe-lite is within 10% on 80–100% of samples. Arabic and symbol-cluster-heavy content cannot be accurately estimated without the actual Claude tokenizer.
|
|
100
|
+
|
|
101
|
+
### Why bpe-lite outperforms ai-tokenizer on Claude
|
|
102
|
+
|
|
103
|
+
ai-tokenizer's `claude` encoding uses `\p{N}+` (greedy, unlimited digit chunks). Current Claude models use `\p{N}{1,3}` (1–3 digits). This causes 20–43% errors on anything involving numbers — including code, hex, and data. bpe-lite uses the correct pattern.
|
|
104
|
+
|
|
105
|
+
ai-tokenizer also does not have a Gemini encoding: all Gemini models are mapped to OpenAI's `o200k_base` vocabulary with a fudge multiplier. This is wrong by construction — see below.
|
|
106
|
+
|
|
107
|
+
## Accuracy — Gemini
|
|
108
|
+
|
|
109
|
+
bpe-lite implements the full Gemma 3 SentencePiece BPE algorithm using the actual Gemini vocabulary. On a 25-sample test against the Gemini API, bpe-lite scored 100% exact (no failures found; 25 samples is a limited basis — treat this as an encouraging spot check, not a guarantee of exactness across all content types).
|
|
110
|
+
|
|
111
|
+
ai-tokenizer does not implement Gemini natively. Inspecting their bundled source (`dist/index.js`), every Gemini model is defined as `"encoding": "o200k_base"` with a `"contentMultiplier": 1.08` fudge factor — it runs the OpenAI vocabulary through a multiplier rather than using Gemini's actual tokenizer. bpe-lite uses the actual Gemma 3 vocabulary and SentencePiece algorithm.
|
|
112
|
+
|
|
65
113
|
## Performance
|
|
66
114
|
|
|
67
|
-
Benchmarked on Node v24 (win32/x64).
|
|
115
|
+
Benchmarked on Node v24 (win32/x64). Run `node --expose-gc scripts/bench.js` locally for numbers on your hardware.
|
|
116
|
+
|
|
117
|
+
**Large text (~500 KB) — ops/s**
|
|
68
118
|
|
|
69
|
-
|
|
119
|
+
| impl | cl100k | Anthropic | Gemini | note |
|
|
120
|
+
|------|-------:|----------:|-------:|------|
|
|
121
|
+
| bpe-lite | 291 | 289 | 998 | |
|
|
122
|
+
| ai-tokenizer | 291 | 291 | 215 | Gemini column uses o200k — wrong algorithm |
|
|
123
|
+
| js-tiktoken | 30 | — | — | pure-JS port of tiktoken (no WASM) |
|
|
70
124
|
|
|
71
|
-
|
|
72
|
-
|------|------:|---------:|-----:|
|
|
73
|
-
| bpe-lite | **291** | **3.56M** | **15.4** |
|
|
74
|
-
| ai-tokenizer | 291 | 3.56M | 15.4 |
|
|
75
|
-
| js-tiktoken | 30 | 370k | 1.6 |
|
|
125
|
+
bpe-lite matches ai-tokenizer throughput for OpenAI and Anthropic large text. On Gemini, bpe-lite's SPM engine is ~4.6x faster — but the ai-tokenizer figure in the Gemini column is not a like-for-like comparison, since it runs o200k BPE rather than Gemini's own tokenizer.
|
|
76
126
|
|
|
77
|
-
**
|
|
127
|
+
**Small text cold vs warm**
|
|
78
128
|
|
|
79
|
-
|
|
80
|
-
|------|------:|---------:|-----:|
|
|
81
|
-
| bpe-lite | **289** | **5.34M** | **15.3** |
|
|
82
|
-
| ai-tokenizer | 291 | 5.31M | 15.4 |
|
|
129
|
+
bpe-lite maintains a per-instance chunk cache. For repeated text patterns (e.g. the same prompt template encoded thousands of times), cache hits eliminate BPE work entirely:
|
|
83
130
|
|
|
84
|
-
|
|
131
|
+
| scenario | bpe-lite | ai-tokenizer |
|
|
132
|
+
|----------|----------|--------------|
|
|
133
|
+
| Small text, cold (novel input) | ~352k ops/s | ~453k ops/s |
|
|
134
|
+
| Small text, warm (repeated input) | ~1.45M ops/s | ~453k ops/s |
|
|
85
135
|
|
|
86
|
-
|
|
87
|
-
|------|------:|---------:|-----:|------|
|
|
88
|
-
| bpe-lite | **998** | **11.1M** | **52.9** | actual Gemma3 SPM |
|
|
89
|
-
| ai-tokenizer | 215 | 2.40M | 11.4 | o200k BPE — different algorithm, different results |
|
|
136
|
+
For diverse, non-repeating inputs, ai-tokenizer is ~29% faster on very short strings. For any repeated-text workload, bpe-lite is ~3x faster.
|
|
90
137
|
|
|
91
|
-
|
|
138
|
+
**Initialization**
|
|
92
139
|
|
|
93
|
-
|
|
140
|
+
bpe-lite lazy-loads the gzipped vocab on first encode — a one-time cost of ~235ms per provider per process. This is negligible for any persistent process; it matters only for cold serverless invocations that encode once and exit.
|
|
94
141
|
|
|
95
142
|
## API
|
|
96
143
|
|
|
@@ -108,7 +155,7 @@ Decodes an array of token ids back to a string.
|
|
|
108
155
|
|
|
109
156
|
### `isWithinTokenLimit(text, limit, provider?)`
|
|
110
157
|
|
|
111
|
-
Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts —
|
|
158
|
+
Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts — short-circuits as soon as the limit is crossed.
|
|
112
159
|
|
|
113
160
|
### Tokenizer instances
|
|
114
161
|
|
|
@@ -118,11 +165,6 @@ Returns the token count if `text` is within `limit` tokens, or `false` if exceed
|
|
|
118
165
|
|
|
119
166
|
`tiktoken` is accurate for OpenAI but requires Rust/WASM native bindings, which can break in Docker containers, edge runtimes, and serverless environments. `bpe-lite` is pure JavaScript — it runs anywhere Node 18+ runs, with no native compilation step.
|
|
120
167
|
|
|
121
|
-
## Caveats
|
|
122
|
-
|
|
123
|
-
- **Anthropic**: Anthropic has not released the Claude tokenizer. The vocab is sourced from [Xenova/claude-tokenizer](https://huggingface.co/Xenova/claude-tokenizer), a community reverse-engineering of the ~65k BPE vocab. NFKC normalization and probe-based merge corrections are applied. Accuracy varies by text type — common prose and code are usually within 10%, but Arabic, Japanese, repeated characters, and some symbol/emoji combinations diverge significantly.
|
|
124
|
-
- **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
|
|
125
|
-
|
|
126
168
|
## License
|
|
127
169
|
|
|
128
170
|
MIT
|
package/package.json
CHANGED
package/src/bpe.js
CHANGED
|
@@ -6,6 +6,10 @@
|
|
|
6
6
|
* Lower rank = higher priority in merges.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
+
// Fast check: does a pre-tokenized chunk contain any letter or digit?
|
|
10
|
+
// Non-word chunks (pure symbols, punctuation, spaces) are eligible for batching.
|
|
11
|
+
// Unicode-aware (`u` flag) character class: matches one letter (\p{L}) or number (\p{N}).
// It carries no `g`/`y` flag, so repeated .test() calls keep no lastIndex state.
const WORD_CHAR_RE = /[\p{L}\p{N}]/u;
|
|
12
|
+
|
|
9
13
|
// Byte → single-byte "binary string" lookup (pre-built at module load)
|
|
10
14
|
const BYTE_STRS = (() => {
|
|
11
15
|
const out = new Array(256);
|
|
@@ -170,6 +174,9 @@ function buildPreparedTiktoken(vocabData) {
|
|
|
170
174
|
specials,
|
|
171
175
|
patternCompiled: compilePretokenizer(pattern),
|
|
172
176
|
normalize: normalize || null,
|
|
177
|
+
// symbolBatch: merge consecutive non-word regex chunks before BPE so that
|
|
178
|
+
// cross-character byte merges can fire (matches Claude's no-regex byte BPE).
|
|
179
|
+
symbolBatch: !!vocabData.symbolBatch,
|
|
173
180
|
// opt A — per-instance chunk cache: chunk string → ids[]
|
|
174
181
|
cache: new Map(),
|
|
175
182
|
// opt B — per-instance grow-only scratch (reused across chunks)
|
|
@@ -323,14 +330,41 @@ function encodeTiktokenPrepared(text, prepared) {
|
|
|
323
330
|
const re = patternCompiled.re;
|
|
324
331
|
re.lastIndex = 0;
|
|
325
332
|
let m;
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
333
|
+
if (prepared.symbolBatch) {
|
|
334
|
+
// Merge consecutive non-word chunks (symbols, punctuation, spaces) into one
|
|
335
|
+
// BPE input so cross-character byte merges can fire — matches Claude's behavior.
|
|
336
|
+
let symBuf = null;
|
|
337
|
+
while ((m = re.exec(t)) !== null) {
|
|
338
|
+
const chunk = m[0];
|
|
339
|
+
if (!WORD_CHAR_RE.test(chunk)) {
|
|
340
|
+
symBuf = symBuf === null ? chunk : symBuf + chunk;
|
|
341
|
+
} else {
|
|
342
|
+
if (symBuf !== null) {
|
|
343
|
+
let symIds = cache.get(symBuf);
|
|
344
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
345
|
+
for (let i = 0; i < symIds.length; i++) ids.push(symIds[i]);
|
|
346
|
+
symBuf = null;
|
|
347
|
+
}
|
|
348
|
+
let chunkIds = cache.get(chunk);
|
|
349
|
+
if (chunkIds === undefined) { chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch); cache.set(chunk, chunkIds); }
|
|
350
|
+
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
if (symBuf !== null) {
|
|
354
|
+
let symIds = cache.get(symBuf);
|
|
355
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
356
|
+
for (let i = 0; i < symIds.length; i++) ids.push(symIds[i]);
|
|
357
|
+
}
|
|
358
|
+
} else {
|
|
359
|
+
while ((m = re.exec(t)) !== null) {
|
|
360
|
+
const chunk = m[0];
|
|
361
|
+
let chunkIds = cache.get(chunk);
|
|
362
|
+
if (chunkIds === undefined) {
|
|
363
|
+
chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
|
|
364
|
+
cache.set(chunk, chunkIds);
|
|
365
|
+
}
|
|
366
|
+
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
332
367
|
}
|
|
333
|
-
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
334
368
|
}
|
|
335
369
|
} else {
|
|
336
370
|
const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
|
|
@@ -376,14 +410,39 @@ function countTiktokenPrepared(text, prepared) {
|
|
|
376
410
|
const re = patternCompiled.re;
|
|
377
411
|
re.lastIndex = 0;
|
|
378
412
|
let m;
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
413
|
+
if (prepared.symbolBatch) {
|
|
414
|
+
let symBuf = null;
|
|
415
|
+
while ((m = re.exec(t)) !== null) {
|
|
416
|
+
const chunk = m[0];
|
|
417
|
+
if (!WORD_CHAR_RE.test(chunk)) {
|
|
418
|
+
symBuf = symBuf === null ? chunk : symBuf + chunk;
|
|
419
|
+
} else {
|
|
420
|
+
if (symBuf !== null) {
|
|
421
|
+
let symIds = cache.get(symBuf);
|
|
422
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
423
|
+
count += symIds.length;
|
|
424
|
+
symBuf = null;
|
|
425
|
+
}
|
|
426
|
+
let chunkIds = cache.get(chunk);
|
|
427
|
+
if (chunkIds === undefined) { chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch); cache.set(chunk, chunkIds); }
|
|
428
|
+
count += chunkIds.length;
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
if (symBuf !== null) {
|
|
432
|
+
let symIds = cache.get(symBuf);
|
|
433
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
434
|
+
count += symIds.length;
|
|
435
|
+
}
|
|
436
|
+
} else {
|
|
437
|
+
while ((m = re.exec(t)) !== null) {
|
|
438
|
+
const chunk = m[0];
|
|
439
|
+
let chunkIds = cache.get(chunk);
|
|
440
|
+
if (chunkIds === undefined) {
|
|
441
|
+
chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
|
|
442
|
+
cache.set(chunk, chunkIds);
|
|
443
|
+
}
|
|
444
|
+
count += chunkIds.length;
|
|
385
445
|
}
|
|
386
|
-
count += chunkIds.length;
|
|
387
446
|
}
|
|
388
447
|
} else {
|
|
389
448
|
const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
|
package/src/spm.js
CHANGED
|
@@ -86,10 +86,8 @@ function encodeSPM(text, vocabData) {
|
|
|
86
86
|
* Isolated from encodeSegment so V8 can keep this function optimised
|
|
87
87
|
* even when encodeSPMPrepared is called with wildly different text lengths.
|
|
88
88
|
*
|
|
89
|
-
* Segmentation:
|
|
90
|
-
*
|
|
91
|
-
* N>1 → two segments [▁×(N-1), ▁word]
|
|
92
|
-
* This matches how Gemma BPE merges multi-space tokens (▁▁=138, ▁▁▁=139, …).
|
|
89
|
+
* Segmentation: each run of ▁ chars plus the following non-▁ word is one segment.
|
|
90
|
+
* Leading ▁s are included so BPE can naturally merge ▁▁→138, ▁▁▁→139, etc.
|
|
93
91
|
*/
|
|
94
92
|
function _scanFromCache(normalized, cache, result) {
|
|
95
93
|
let i = 0;
|
|
@@ -113,20 +111,11 @@ function _scanFromCache(normalized, cache, result) {
|
|
|
113
111
|
// Collect word chars (non-▁)
|
|
114
112
|
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
115
113
|
|
|
116
|
-
//
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
for (let j = 0; j < spaceIds.length; j++) result.push(spaceIds[j]);
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
// Emit ▁ + word as one segment
|
|
125
|
-
const wordStart = spaceCount > 0 ? segStart + spaceCount - 1 : segStart;
|
|
126
|
-
const wordSeg = normalized.slice(wordStart, i);
|
|
127
|
-
const wordIds = cache.get(wordSeg);
|
|
128
|
-
if (wordIds === undefined) return false;
|
|
129
|
-
for (let j = 0; j < wordIds.length; j++) result.push(wordIds[j]);
|
|
114
|
+
// Emit all leading ▁s + word as one segment; BPE merges ▁▁, ▁▁▁, etc.
|
|
115
|
+
const seg = normalized.slice(segStart, i);
|
|
116
|
+
const segIds = cache.get(seg);
|
|
117
|
+
if (segIds === undefined) return false;
|
|
118
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
130
119
|
}
|
|
131
120
|
return true;
|
|
132
121
|
}
|
|
@@ -163,7 +152,7 @@ function encodeSPMPrepared(text, prepared) {
|
|
|
163
152
|
if (i === normalized.length) {
|
|
164
153
|
if (spaceCount > 0) {
|
|
165
154
|
const seg = normalized.slice(segStart, i);
|
|
166
|
-
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache);
|
|
155
|
+
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seedsByAngleBracket);
|
|
167
156
|
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
168
157
|
}
|
|
169
158
|
break;
|
|
@@ -171,16 +160,9 @@ function encodeSPMPrepared(text, prepared) {
|
|
|
171
160
|
|
|
172
161
|
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
173
162
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
for (let j = 0; j < spaceIds.length; j++) result.push(spaceIds[j]);
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
const wordStart = spaceCount > 0 ? segStart + spaceCount - 1 : segStart;
|
|
181
|
-
const wordSeg = normalized.slice(wordStart, i);
|
|
182
|
-
const wordIds = cache.get(wordSeg) ?? _encodeAndCache(wordSeg, vocab, mergeRank, scratch, cache);
|
|
183
|
-
for (let j = 0; j < wordIds.length; j++) result.push(wordIds[j]);
|
|
163
|
+
const seg = normalized.slice(segStart, i);
|
|
164
|
+
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seedsByAngleBracket);
|
|
165
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
184
166
|
}
|
|
185
167
|
return result;
|
|
186
168
|
}
|
package/vocabs/anthropic.json.gz
CHANGED
|
Binary file
|