bpe-lite 0.4.3 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -11
- package/index.d.ts +42 -42
- package/package.json +2 -1
- package/src/bpe.js +13 -4
- package/src/index.js +90 -90
- package/src/index.mjs +24 -24
- package/src/spm.js +141 -43
- package/src/tokenizer.js +59 -59
- package/vocabs/anthropic.json.gz +0 -0
package/README.md
CHANGED
|
@@ -57,8 +57,8 @@ tok.count('Hello, world!'); // → 4
|
|
|
57
57
|
|----------|-------|--------|--------|----------|
|
|
58
58
|
| `openai-o200k` | o200k_base | 199,998 | GPT-4o, o1, o3, o4, GPT-4.1, GPT-5 | Exact — vocab sourced directly from OpenAI's CDN |
|
|
59
59
|
| `openai` | cl100k_base | 100,256 | GPT-4, GPT-3.5 | Exact — vocab sourced directly from OpenAI's CDN |
|
|
60
|
-
| `anthropic` |
|
|
61
|
-
| `gemini` | Gemma 3 | 262,144 | Gemini |
|
|
60
|
+
| `anthropic` | Xenova/claude-tokenizer | 65,000 | Claude | ~80% within 10%, mean error 7.0% — Anthropic has not released the tokenizer. Vocab sourced from Xenova's reverse-engineered 65k BPE (HuggingFace) with NFKC normalization, symbol/emoji byte-level corrections, whitespace sequence injections, and probe-based merge adjustments. Typical prose and code: well within 10%. Outliers: repeated chars (39%), Arabic (20%), symbols (16%), currency (14%). Tested against Claude 4 API across 25 diverse samples. |
|
|
61
|
+
| `gemini` | Gemma 3 | 262,144 | Gemini | ~99.7% — 92% exact, mean error 0.34% across 25 diverse samples vs Gemini 2.0 Flash API |
|
|
62
62
|
|
|
63
63
|
Vocab files are bundled in the package — no network required at runtime or install time.
|
|
64
64
|
|
|
@@ -70,23 +70,23 @@ Benchmarked on Node v24 (win32/x64). Benchmark command: `node --expose-gc script
|
|
|
70
70
|
|
|
71
71
|
| impl | ops/s | tokens/s | MB/s |
|
|
72
72
|
|------|------:|---------:|-----:|
|
|
73
|
-
| bpe-lite | **
|
|
74
|
-
| ai-tokenizer |
|
|
75
|
-
| js-tiktoken |
|
|
73
|
+
| bpe-lite | **291** | **3.56M** | **15.4** |
|
|
74
|
+
| ai-tokenizer | 291 | 3.56M | 15.4 |
|
|
75
|
+
| js-tiktoken | 30 | 370k | 1.6 |
|
|
76
76
|
|
|
77
77
|
**Anthropic — large text (~54 KB)**
|
|
78
78
|
|
|
79
79
|
| impl | ops/s | tokens/s | MB/s |
|
|
80
80
|
|------|------:|---------:|-----:|
|
|
81
|
-
| bpe-lite | **
|
|
82
|
-
| ai-tokenizer |
|
|
81
|
+
| bpe-lite | **289** | **5.34M** | **15.3** |
|
|
82
|
+
| ai-tokenizer | 291 | 5.31M | 15.4 |
|
|
83
83
|
|
|
84
|
-
**Gemini — large text (
|
|
84
|
+
**Gemini — large text (~54 KB)**
|
|
85
85
|
|
|
86
86
|
| impl | ops/s | tokens/s | MB/s | note |
|
|
87
87
|
|------|------:|---------:|-----:|------|
|
|
88
|
-
| bpe-lite | **
|
|
89
|
-
| ai-tokenizer |
|
|
88
|
+
| bpe-lite | **998** | **11.1M** | **52.9** | actual Gemma3 SPM |
|
|
89
|
+
| ai-tokenizer | 215 | 2.40M | 11.4 | o200k BPE — different algorithm, different results |
|
|
90
90
|
|
|
91
91
|
ai-tokenizer does not implement Gemini tokenization. The row above uses their o200k encoding on the same input string; it produces different token ids and counts than the Gemini tokenizer, so it is not a real comparison.
|
|
92
92
|
|
|
@@ -120,7 +120,7 @@ Returns the token count if `text` is within `limit` tokens, or `false` if exceed
|
|
|
120
120
|
|
|
121
121
|
## Caveats
|
|
122
122
|
|
|
123
|
-
- **Anthropic**: Anthropic has not released the Claude
|
|
123
|
+
- **Anthropic**: Anthropic has not released the Claude tokenizer. The vocab is sourced from [Xenova/claude-tokenizer](https://huggingface.co/Xenova/claude-tokenizer), a community reverse-engineering of the ~65k BPE vocab. NFKC normalization and probe-based merge corrections are applied. Accuracy varies by text type — common prose and code are usually within 10%, but Arabic, Japanese, repeated characters, and some symbol/emoji combinations diverge significantly.
|
|
124
124
|
- **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
|
|
125
125
|
|
|
126
126
|
## License
|
package/index.d.ts
CHANGED
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
|
|
2
|
-
|
|
3
|
-
export interface Tokenizer {
|
|
4
|
-
encode(text: string): number[];
|
|
5
|
-
decode(ids: number[]): string;
|
|
6
|
-
count(text: string): number;
|
|
7
|
-
countUpTo(text: string, limit: number): number;
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Count the number of tokens in text for the given provider.
|
|
12
|
-
*/
|
|
13
|
-
export function countTokens(text: string, provider?: Provider): number;
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* Encode text to token ids.
|
|
17
|
-
*/
|
|
18
|
-
export function encode(text: string, provider?: Provider): number[];
|
|
19
|
-
|
|
20
|
-
/**
|
|
21
|
-
* Decode token ids back to text.
|
|
22
|
-
*/
|
|
23
|
-
export function decode(ids: number[], provider?: Provider): string;
|
|
24
|
-
|
|
25
|
-
/**
|
|
26
|
-
* Check if text is within a token limit.
|
|
27
|
-
* Returns the token count if within the limit, or false if exceeded.
|
|
28
|
-
* More efficient than encode() for long texts since it short-circuits.
|
|
29
|
-
*/
|
|
30
|
-
export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
|
|
31
|
-
|
|
32
|
-
/** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
|
|
33
|
-
export function openai(): Tokenizer;
|
|
34
|
-
|
|
35
|
-
/** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
36
|
-
export function openaiO200k(): Tokenizer;
|
|
37
|
-
|
|
38
|
-
/** Tokenizer instance for Anthropic (cl100k approximation, ~95% accurate). */
|
|
39
|
-
export function anthropic(): Tokenizer;
|
|
40
|
-
|
|
41
|
-
/** Tokenizer instance for Gemini (Gemma 3 vocab, exact). */
|
|
42
|
-
export function gemini(): Tokenizer;
|
|
1
|
+
export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
|
|
2
|
+
|
|
3
|
+
export interface Tokenizer {
|
|
4
|
+
encode(text: string): number[];
|
|
5
|
+
decode(ids: number[]): string;
|
|
6
|
+
count(text: string): number;
|
|
7
|
+
countUpTo(text: string, limit: number): number;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Count the number of tokens in text for the given provider.
|
|
12
|
+
*/
|
|
13
|
+
export function countTokens(text: string, provider?: Provider): number;
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Encode text to token ids.
|
|
17
|
+
*/
|
|
18
|
+
export function encode(text: string, provider?: Provider): number[];
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Decode token ids back to text.
|
|
22
|
+
*/
|
|
23
|
+
export function decode(ids: number[], provider?: Provider): string;
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Check if text is within a token limit.
|
|
27
|
+
* Returns the token count if within the limit, or false if exceeded.
|
|
28
|
+
* More efficient than encode() for long texts since it short-circuits.
|
|
29
|
+
*/
|
|
30
|
+
export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
|
|
31
|
+
|
|
32
|
+
/** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
|
|
33
|
+
export function openai(): Tokenizer;
|
|
34
|
+
|
|
35
|
+
/** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
36
|
+
export function openaiO200k(): Tokenizer;
|
|
37
|
+
|
|
38
|
+
/** Tokenizer instance for Anthropic (approximation based on the Xenova/claude-tokenizer 65k BPE vocab; accuracy varies by text type). */
|
|
39
|
+
export function anthropic(): Tokenizer;
|
|
40
|
+
|
|
41
|
+
/** Tokenizer instance for Gemini (Gemma 3 vocab; near-exact, ~0.3% mean error vs the Gemini API). */
|
|
42
|
+
export function gemini(): Tokenizer;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bpe-lite",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.5.1",
|
|
4
4
|
"description": "Offline BPE tokenizer for OpenAI, Anthropic, and Gemini — zero dependencies",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -36,6 +36,7 @@
|
|
|
36
36
|
"license": "MIT",
|
|
37
37
|
"devDependencies": {
|
|
38
38
|
"ai-tokenizer": "^1.0.6",
|
|
39
|
+
"gpt-tokenizer": "^3.4.0",
|
|
39
40
|
"js-tiktoken": "^1.0.21"
|
|
40
41
|
},
|
|
41
42
|
"repository": {
|
package/src/bpe.js
CHANGED
|
@@ -80,7 +80,8 @@ class MinHeap {
|
|
|
80
80
|
_siftUp(i) {
|
|
81
81
|
while (i > 0) {
|
|
82
82
|
const p = (i - 1) >> 1;
|
|
83
|
-
if (this.ranks[p]
|
|
83
|
+
if (this.ranks[p] < this.ranks[i]) break;
|
|
84
|
+
if (this.ranks[p] === this.ranks[i] && this.left[p] <= this.left[i]) break;
|
|
84
85
|
this._swap(i, p);
|
|
85
86
|
i = p;
|
|
86
87
|
}
|
|
@@ -93,8 +94,12 @@ class MinHeap {
|
|
|
93
94
|
if (l >= n) break;
|
|
94
95
|
const r = l + 1;
|
|
95
96
|
let m = l;
|
|
96
|
-
if (r < n
|
|
97
|
-
|
|
97
|
+
if (r < n) {
|
|
98
|
+
if (this.ranks[r] < this.ranks[l] ||
|
|
99
|
+
(this.ranks[r] === this.ranks[l] && this.left[r] < this.left[l])) m = r;
|
|
100
|
+
}
|
|
101
|
+
if (this.ranks[i] < this.ranks[m]) break;
|
|
102
|
+
if (this.ranks[i] === this.ranks[m] && this.left[i] <= this.left[m]) break;
|
|
98
103
|
this._swap(i, m);
|
|
99
104
|
i = m;
|
|
100
105
|
}
|
|
@@ -136,7 +141,7 @@ function pretokenize(text, compiled) {
|
|
|
136
141
|
}
|
|
137
142
|
|
|
138
143
|
function buildPreparedTiktoken(vocabData) {
|
|
139
|
-
const { vocab, specialTokens = {}, pattern } = vocabData;
|
|
144
|
+
const { vocab, specialTokens = {}, pattern, normalize } = vocabData;
|
|
140
145
|
|
|
141
146
|
const vocabBin = new Map();
|
|
142
147
|
let maxId = -1;
|
|
@@ -164,6 +169,7 @@ function buildPreparedTiktoken(vocabData) {
|
|
|
164
169
|
idToBytes,
|
|
165
170
|
specials,
|
|
166
171
|
patternCompiled: compilePretokenizer(pattern),
|
|
172
|
+
normalize: normalize || null,
|
|
167
173
|
// opt A — per-instance chunk cache: chunk string → ids[]
|
|
168
174
|
cache: new Map(),
|
|
169
175
|
// opt B — per-instance grow-only scratch (reused across chunks)
|
|
@@ -302,6 +308,7 @@ function splitOnSpecials(text, specials) {
|
|
|
302
308
|
|
|
303
309
|
function encodeTiktokenPrepared(text, prepared) {
|
|
304
310
|
if (!text) return [];
|
|
311
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
305
312
|
|
|
306
313
|
const ids = [];
|
|
307
314
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
@@ -355,6 +362,7 @@ function decodeTiktokenPrepared(ids, prepared) {
|
|
|
355
362
|
|
|
356
363
|
function countTiktokenPrepared(text, prepared) {
|
|
357
364
|
if (!text) return 0;
|
|
365
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
358
366
|
|
|
359
367
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
360
368
|
const pieces = splitOnSpecials(text, specials);
|
|
@@ -396,6 +404,7 @@ function countTiktokenPrepared(text, prepared) {
|
|
|
396
404
|
|
|
397
405
|
function countTiktokenUpToPrepared(text, prepared, limit) {
|
|
398
406
|
if (!text) return 0;
|
|
407
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
399
408
|
|
|
400
409
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
401
410
|
const pieces = splitOnSpecials(text, specials);
|
package/src/index.js
CHANGED
|
@@ -1,90 +1,90 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const path = require('path');
|
|
4
|
-
const fs = require('fs');
|
|
5
|
-
const zlib = require('zlib');
|
|
6
|
-
const { Tokenizer } = require('./tokenizer');
|
|
7
|
-
|
|
8
|
-
const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
|
|
9
|
-
|
|
10
|
-
// Lazy-loaded tokenizer instances (created once per provider per process)
|
|
11
|
-
const _cache = {};
|
|
12
|
-
|
|
13
|
-
function loadTokenizer(provider) {
|
|
14
|
-
if (_cache[provider]) return _cache[provider];
|
|
15
|
-
|
|
16
|
-
const gzPath = path.join(VOCABS_DIR, `${provider}.json.gz`);
|
|
17
|
-
const jsonPath = path.join(VOCABS_DIR, `${provider}.json`);
|
|
18
|
-
|
|
19
|
-
let data;
|
|
20
|
-
if (fs.existsSync(gzPath)) {
|
|
21
|
-
data = JSON.parse(zlib.gunzipSync(fs.readFileSync(gzPath)).toString('utf8'));
|
|
22
|
-
} else if (fs.existsSync(jsonPath)) {
|
|
23
|
-
data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
|
|
24
|
-
} else {
|
|
25
|
-
throw new Error(
|
|
26
|
-
`Vocab file not found for provider "${provider}".\n` +
|
|
27
|
-
'Run "node scripts/build-vocabs.js" to build vocab files.'
|
|
28
|
-
);
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
_cache[provider] = new Tokenizer(data);
|
|
32
|
-
return _cache[provider];
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* Count tokens in text for a given provider.
|
|
37
|
-
* @param {string} text
|
|
38
|
-
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
39
|
-
* @returns {number}
|
|
40
|
-
*/
|
|
41
|
-
function countTokens(text, provider = 'openai') {
|
|
42
|
-
return loadTokenizer(provider).count(text);
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Encode text to token ids.
|
|
47
|
-
* @param {string} text
|
|
48
|
-
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
49
|
-
* @returns {number[]}
|
|
50
|
-
*/
|
|
51
|
-
function encode(text, provider = 'openai') {
|
|
52
|
-
return loadTokenizer(provider).encode(text);
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
/**
|
|
56
|
-
* Decode token ids back to text.
|
|
57
|
-
* @param {number[]} ids
|
|
58
|
-
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
59
|
-
* @returns {string}
|
|
60
|
-
*/
|
|
61
|
-
function decode(ids, provider = 'openai') {
|
|
62
|
-
return loadTokenizer(provider).decode(ids);
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Check if text is within a token limit without necessarily encoding the whole string.
|
|
67
|
-
* Returns false if the limit is exceeded, otherwise returns the token count.
|
|
68
|
-
* @param {string} text
|
|
69
|
-
* @param {number} limit
|
|
70
|
-
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
71
|
-
* @returns {number|false}
|
|
72
|
-
*/
|
|
73
|
-
function isWithinTokenLimit(text, limit, provider = 'openai') {
|
|
74
|
-
const count = loadTokenizer(provider).countUpTo(text, limit);
|
|
75
|
-
return count <= limit ? count : false;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
|
|
79
|
-
function openai() { return loadTokenizer('openai'); }
|
|
80
|
-
|
|
81
|
-
/** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
82
|
-
function openaiO200k() { return loadTokenizer('openai-o200k'); }
|
|
83
|
-
|
|
84
|
-
/** Get a Tokenizer instance for Anthropic (cl100k approximation). */
|
|
85
|
-
function anthropic() { return loadTokenizer('anthropic'); }
|
|
86
|
-
|
|
87
|
-
/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
|
|
88
|
-
function gemini() { return loadTokenizer('gemini'); }
|
|
89
|
-
|
|
90
|
-
module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const zlib = require('zlib');
|
|
6
|
+
const { Tokenizer } = require('./tokenizer');
|
|
7
|
+
|
|
8
|
+
const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
|
|
9
|
+
|
|
10
|
+
// Lazy-loaded tokenizer instances (created once per provider per process)
|
|
11
|
+
const _cache = {};
|
|
12
|
+
|
|
13
|
+
function loadTokenizer(provider) {
|
|
14
|
+
if (_cache[provider]) return _cache[provider];
|
|
15
|
+
|
|
16
|
+
const gzPath = path.join(VOCABS_DIR, `${provider}.json.gz`);
|
|
17
|
+
const jsonPath = path.join(VOCABS_DIR, `${provider}.json`);
|
|
18
|
+
|
|
19
|
+
let data;
|
|
20
|
+
if (fs.existsSync(gzPath)) {
|
|
21
|
+
data = JSON.parse(zlib.gunzipSync(fs.readFileSync(gzPath)).toString('utf8'));
|
|
22
|
+
} else if (fs.existsSync(jsonPath)) {
|
|
23
|
+
data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
|
|
24
|
+
} else {
|
|
25
|
+
throw new Error(
|
|
26
|
+
`Vocab file not found for provider "${provider}".\n` +
|
|
27
|
+
'Run "node scripts/build-vocabs.js" to build vocab files.'
|
|
28
|
+
);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
_cache[provider] = new Tokenizer(data);
|
|
32
|
+
return _cache[provider];
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Count tokens in text for a given provider.
|
|
37
|
+
* @param {string} text
|
|
38
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
39
|
+
* @returns {number}
|
|
40
|
+
*/
|
|
41
|
+
function countTokens(text, provider = 'openai') {
|
|
42
|
+
return loadTokenizer(provider).count(text);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Encode text to token ids.
|
|
47
|
+
* @param {string} text
|
|
48
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
49
|
+
* @returns {number[]}
|
|
50
|
+
*/
|
|
51
|
+
function encode(text, provider = 'openai') {
|
|
52
|
+
return loadTokenizer(provider).encode(text);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Decode token ids back to text.
|
|
57
|
+
* @param {number[]} ids
|
|
58
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
59
|
+
* @returns {string}
|
|
60
|
+
*/
|
|
61
|
+
function decode(ids, provider = 'openai') {
|
|
62
|
+
return loadTokenizer(provider).decode(ids);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Check if text is within a token limit without necessarily encoding the whole string.
|
|
67
|
+
* Returns false if the limit is exceeded, otherwise returns the token count.
|
|
68
|
+
* @param {string} text
|
|
69
|
+
* @param {number} limit
|
|
70
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
71
|
+
* @returns {number|false}
|
|
72
|
+
*/
|
|
73
|
+
function isWithinTokenLimit(text, limit, provider = 'openai') {
|
|
74
|
+
const count = loadTokenizer(provider).countUpTo(text, limit);
|
|
75
|
+
return count <= limit ? count : false;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
|
|
79
|
+
function openai() { return loadTokenizer('openai'); }
|
|
80
|
+
|
|
81
|
+
/** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
82
|
+
function openaiO200k() { return loadTokenizer('openai-o200k'); }
|
|
83
|
+
|
|
84
|
+
/** Get a Tokenizer instance for Anthropic (approximation based on the Xenova/claude-tokenizer 65k BPE vocab). */
|
|
85
|
+
function anthropic() { return loadTokenizer('anthropic'); }
|
|
86
|
+
|
|
87
|
+
/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
|
|
88
|
+
function gemini() { return loadTokenizer('gemini'); }
|
|
89
|
+
|
|
90
|
+
module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
|
package/src/index.mjs
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
import { createRequire } from 'module';
|
|
2
|
-
|
|
3
|
-
const require = createRequire(import.meta.url);
|
|
4
|
-
const {
|
|
5
|
-
countTokens,
|
|
6
|
-
encode,
|
|
7
|
-
decode,
|
|
8
|
-
isWithinTokenLimit,
|
|
9
|
-
openai,
|
|
10
|
-
openaiO200k,
|
|
11
|
-
anthropic,
|
|
12
|
-
gemini,
|
|
13
|
-
} = require('./index.js');
|
|
14
|
-
|
|
15
|
-
export {
|
|
16
|
-
countTokens,
|
|
17
|
-
encode,
|
|
18
|
-
decode,
|
|
19
|
-
isWithinTokenLimit,
|
|
20
|
-
openai,
|
|
21
|
-
openaiO200k,
|
|
22
|
-
anthropic,
|
|
23
|
-
gemini,
|
|
24
|
-
};
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
|
|
3
|
+
const require = createRequire(import.meta.url);
|
|
4
|
+
const {
|
|
5
|
+
countTokens,
|
|
6
|
+
encode,
|
|
7
|
+
decode,
|
|
8
|
+
isWithinTokenLimit,
|
|
9
|
+
openai,
|
|
10
|
+
openaiO200k,
|
|
11
|
+
anthropic,
|
|
12
|
+
gemini,
|
|
13
|
+
} = require('./index.js');
|
|
14
|
+
|
|
15
|
+
export {
|
|
16
|
+
countTokens,
|
|
17
|
+
encode,
|
|
18
|
+
decode,
|
|
19
|
+
isWithinTokenLimit,
|
|
20
|
+
openai,
|
|
21
|
+
openaiO200k,
|
|
22
|
+
anthropic,
|
|
23
|
+
gemini,
|
|
24
|
+
};
|
package/src/spm.js
CHANGED
|
@@ -23,12 +23,31 @@ function buildPreparedSPM(vocabData) {
|
|
|
23
23
|
idToStr.set(id, str);
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
+
// Seed tokens: multi-char vocab entries with no producing merge.
|
|
27
|
+
// In the original SentencePiece model these were user-defined symbols
|
|
28
|
+
// (HTML tags, special tokens, etc.) that the encoder recognizes atomically
|
|
29
|
+
// before BPE. We handle the common '<' case with a greedy longest-match
|
|
30
|
+
// lookup keyed by '<' — the only first-char with multiple seed tokens.
|
|
31
|
+
const producible = new Set();
|
|
32
|
+
for (const m of merges) {
|
|
33
|
+
const sp = m.indexOf(' ');
|
|
34
|
+
if (sp !== -1) producible.add(m.slice(0, sp) + m.slice(sp + 1));
|
|
35
|
+
}
|
|
36
|
+
const seedsByAngleBracket = [];
|
|
37
|
+
for (const [str, id] of Object.entries(vocab)) {
|
|
38
|
+
if (str.length > 1 && str[0] === '<' && !producible.has(str)) {
|
|
39
|
+
seedsByAngleBracket.push({ str, id, chars: [...str] });
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
seedsByAngleBracket.sort((a, b) => b.chars.length - a.chars.length);
|
|
43
|
+
|
|
26
44
|
return {
|
|
27
45
|
vocab,
|
|
28
46
|
merges,
|
|
29
47
|
mergeRank,
|
|
30
48
|
idToStr,
|
|
31
|
-
|
|
49
|
+
seedsByAngleBracket,
|
|
50
|
+
// opt A — segment-level cache: each word segment → ids[]
|
|
32
51
|
// Generalises across different inputs (same words reused across texts).
|
|
33
52
|
// Note: 1 of 514,906 Gemma merges crosses a ▁ boundary ("> ▁</"),
|
|
34
53
|
// making this negligibly imprecise for that HTML pattern.
|
|
@@ -66,24 +85,55 @@ function encodeSPM(text, vocabData) {
|
|
|
66
85
|
* Returns true if every segment was a cache hit; false on first miss.
|
|
67
86
|
* Isolated from encodeSegment so V8 can keep this function optimised
|
|
68
87
|
* even when encodeSPMPrepared is called with wildly different text lengths.
|
|
88
|
+
*
|
|
89
|
+
* Segmentation: for a run of N consecutive ▁ chars before a word:
|
|
90
|
+
* N=1 → one segment [▁word]
|
|
91
|
+
* N>1 → two segments [▁×(N-1), ▁word]
|
|
92
|
+
* This matches how Gemma BPE merges multi-space tokens (▁▁=138, ▁▁▁=139, …).
|
|
69
93
|
*/
|
|
70
94
|
function _scanFromCache(normalized, cache, result) {
|
|
71
|
-
let
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
95
|
+
let i = 0;
|
|
96
|
+
while (i < normalized.length) {
|
|
97
|
+
// Count leading ▁ chars
|
|
98
|
+
const segStart = i;
|
|
99
|
+
while (i < normalized.length && normalized[i] === SPACE_CHAR) i++;
|
|
100
|
+
const spaceCount = i - segStart;
|
|
101
|
+
|
|
102
|
+
if (i === normalized.length) {
|
|
103
|
+
// Trailing spaces only
|
|
104
|
+
if (spaceCount > 0) {
|
|
105
|
+
const seg = normalized.slice(segStart, i);
|
|
106
|
+
const segIds = cache.get(seg);
|
|
107
|
+
if (segIds === undefined) return false;
|
|
108
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
109
|
+
}
|
|
110
|
+
break;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// Collect word chars (non-▁)
|
|
114
|
+
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
115
|
+
|
|
116
|
+
// If N>1 spaces, emit (N-1) ▁s as standalone segment first
|
|
117
|
+
if (spaceCount > 1) {
|
|
118
|
+
const spaceSeg = normalized.slice(segStart, segStart + spaceCount - 1);
|
|
119
|
+
const spaceIds = cache.get(spaceSeg);
|
|
120
|
+
if (spaceIds === undefined) return false;
|
|
121
|
+
for (let j = 0; j < spaceIds.length; j++) result.push(spaceIds[j]);
|
|
79
122
|
}
|
|
123
|
+
|
|
124
|
+
// Emit ▁ + word as one segment
|
|
125
|
+
const wordStart = spaceCount > 0 ? segStart + spaceCount - 1 : segStart;
|
|
126
|
+
const wordSeg = normalized.slice(wordStart, i);
|
|
127
|
+
const wordIds = cache.get(wordSeg);
|
|
128
|
+
if (wordIds === undefined) return false;
|
|
129
|
+
for (let j = 0; j < wordIds.length; j++) result.push(wordIds[j]);
|
|
80
130
|
}
|
|
81
131
|
return true;
|
|
82
132
|
}
|
|
83
133
|
|
|
84
134
|
// Cold-path helper — kept separate so it is never inlined into the hot loop.
|
|
85
|
-
function _encodeAndCache(seg, vocab, mergeRank, scratch, cache) {
|
|
86
|
-
const ids = encodeSegment(seg, vocab, mergeRank, scratch);
|
|
135
|
+
function _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seeds) {
|
|
136
|
+
const ids = encodeSegment(seg, vocab, mergeRank, scratch, seeds);
|
|
87
137
|
cache.set(seg, ids);
|
|
88
138
|
return ids;
|
|
89
139
|
}
|
|
@@ -91,10 +141,10 @@ function _encodeAndCache(seg, vocab, mergeRank, scratch, cache) {
|
|
|
91
141
|
function encodeSPMPrepared(text, prepared) {
|
|
92
142
|
if (!text) return [];
|
|
93
143
|
|
|
94
|
-
const { vocab, mergeRank, scratch, cache } = prepared;
|
|
144
|
+
const { vocab, mergeRank, scratch, cache, seedsByAngleBracket } = prepared;
|
|
95
145
|
|
|
96
|
-
// Normalize: replace spaces with
|
|
97
|
-
const normalized =
|
|
146
|
+
// Normalize: replace spaces with ▁ (Gemma3: no ▁ prepend for first char)
|
|
147
|
+
const normalized = text.replace(/ /g, SPACE_CHAR);
|
|
98
148
|
|
|
99
149
|
// Fast path: serve every segment from the segment cache.
|
|
100
150
|
// After the first call, this path handles all subsequent calls for common text.
|
|
@@ -104,20 +154,41 @@ function encodeSPMPrepared(text, prepared) {
|
|
|
104
154
|
// Cold path: at least one segment is missing — encode everything from scratch.
|
|
105
155
|
// (Simpler to re-scan than to continue from the miss point.)
|
|
106
156
|
result.length = 0;
|
|
107
|
-
let
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
157
|
+
let i = 0;
|
|
158
|
+
while (i < normalized.length) {
|
|
159
|
+
const segStart = i;
|
|
160
|
+
while (i < normalized.length && normalized[i] === SPACE_CHAR) i++;
|
|
161
|
+
const spaceCount = i - segStart;
|
|
162
|
+
|
|
163
|
+
if (i === normalized.length) {
|
|
164
|
+
if (spaceCount > 0) {
|
|
165
|
+
const seg = normalized.slice(segStart, i);
|
|
166
|
+
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache);
|
|
167
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
168
|
+
}
|
|
169
|
+
break;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
173
|
+
|
|
174
|
+
if (spaceCount > 1) {
|
|
175
|
+
const spaceSeg = normalized.slice(segStart, segStart + spaceCount - 1);
|
|
176
|
+
const spaceIds = cache.get(spaceSeg) ?? _encodeAndCache(spaceSeg, vocab, mergeRank, scratch, cache);
|
|
177
|
+
for (let j = 0; j < spaceIds.length; j++) result.push(spaceIds[j]);
|
|
114
178
|
}
|
|
179
|
+
|
|
180
|
+
const wordStart = spaceCount > 0 ? segStart + spaceCount - 1 : segStart;
|
|
181
|
+
const wordSeg = normalized.slice(wordStart, i);
|
|
182
|
+
const wordIds = cache.get(wordSeg) ?? _encodeAndCache(wordSeg, vocab, mergeRank, scratch, cache);
|
|
183
|
+
for (let j = 0; j < wordIds.length; j++) result.push(wordIds[j]);
|
|
115
184
|
}
|
|
116
185
|
return result;
|
|
117
186
|
}
|
|
118
187
|
|
|
119
|
-
// Encode a single
|
|
120
|
-
|
|
188
|
+
// Encode a single segment using MinHeap BPE.
|
|
189
|
+
// seeds: sorted array of {str, id, chars[]} for '<'-prefixed vocab entries that have
|
|
190
|
+
// no producing merge (SentencePiece user-defined symbols); matched greedily before BPE.
|
|
191
|
+
function encodeSegment(seg, vocab, mergeRank, scratch, seeds) {
|
|
121
192
|
const chars = [...seg];
|
|
122
193
|
const n = chars.length;
|
|
123
194
|
|
|
@@ -125,32 +196,59 @@ function encodeSegment(seg, vocab, mergeRank, scratch) {
|
|
|
125
196
|
const { str, ids, prev, next, ver, alive, heap } = scratch;
|
|
126
197
|
heap.reset();
|
|
127
198
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
199
|
+
// Initialize nodes: one node per Unicode char, except seed tokens which collapse
|
|
200
|
+
// multiple chars into a single node (longest match at each '<' position).
|
|
201
|
+
let nNodes = 0;
|
|
202
|
+
let pos = 0;
|
|
203
|
+
while (pos < n) {
|
|
204
|
+
const node = nNodes++;
|
|
205
|
+
prev[node] = node - 1;
|
|
206
|
+
next[node] = node + 1;
|
|
207
|
+
ver[node] = 0;
|
|
208
|
+
alive[node] = 1;
|
|
209
|
+
|
|
210
|
+
let matched = false;
|
|
211
|
+
if (seeds && chars[pos] === '<') {
|
|
212
|
+
for (const seed of seeds) {
|
|
213
|
+
const sl = seed.chars.length;
|
|
214
|
+
if (pos + sl > n) continue;
|
|
215
|
+
let ok = true;
|
|
216
|
+
for (let k = 0; k < sl; k++) {
|
|
217
|
+
if (chars[pos + k] !== seed.chars[k]) { ok = false; break; }
|
|
218
|
+
}
|
|
219
|
+
if (ok) {
|
|
220
|
+
str[node] = seed.str;
|
|
221
|
+
ids[node] = seed.id;
|
|
222
|
+
pos += sl;
|
|
223
|
+
matched = true;
|
|
224
|
+
break;
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
134
228
|
|
|
135
|
-
if (
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
|
|
141
|
-
const byteKey = `<0x${hex}>`;
|
|
142
|
-
if (vocab[byteKey] !== undefined) {
|
|
143
|
-
str[i] = byteKey;
|
|
144
|
-
ids[i] = vocab[byteKey];
|
|
229
|
+
if (!matched) {
|
|
230
|
+
const c = chars[pos];
|
|
231
|
+
if (vocab[c] !== undefined) {
|
|
232
|
+
str[node] = c;
|
|
233
|
+
ids[node] = vocab[c];
|
|
145
234
|
} else {
|
|
146
|
-
|
|
147
|
-
|
|
235
|
+
const codePoint = c.codePointAt(0);
|
|
236
|
+
const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
|
|
237
|
+
const byteKey = `<0x${hex}>`;
|
|
238
|
+
if (vocab[byteKey] !== undefined) {
|
|
239
|
+
str[node] = byteKey;
|
|
240
|
+
ids[node] = vocab[byteKey];
|
|
241
|
+
} else {
|
|
242
|
+
str[node] = c;
|
|
243
|
+
ids[node] = vocab['<unk>'] ?? 0;
|
|
244
|
+
}
|
|
148
245
|
}
|
|
246
|
+
pos++;
|
|
149
247
|
}
|
|
150
248
|
}
|
|
151
|
-
next[
|
|
249
|
+
next[nNodes - 1] = -1;
|
|
152
250
|
|
|
153
|
-
for (let i = 0; i <
|
|
251
|
+
for (let i = 0; i < nNodes - 1; i++) {
|
|
154
252
|
const rank = mergeRank.get(`${str[i]} ${str[i + 1]}`);
|
|
155
253
|
if (rank !== undefined) heap.push(rank, i, i + 1, ver[i], ver[i + 1]);
|
|
156
254
|
}
|
package/src/tokenizer.js
CHANGED
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const {
|
|
4
|
-
buildPreparedTiktoken,
|
|
5
|
-
encodeTiktokenPrepared,
|
|
6
|
-
decodeTiktokenPrepared,
|
|
7
|
-
countTiktokenPrepared,
|
|
8
|
-
countTiktokenUpToPrepared,
|
|
9
|
-
} = require('./bpe');
|
|
10
|
-
const { buildPreparedSPM, encodeSPMPrepared, decodeSPMPrepared } = require('./spm');
|
|
11
|
-
|
|
12
|
-
class Tokenizer {
|
|
13
|
-
constructor(vocabData) {
|
|
14
|
-
this._data = vocabData;
|
|
15
|
-
this._engine = vocabData.engine;
|
|
16
|
-
this._preparedTiktoken = null;
|
|
17
|
-
this._preparedSPM = null;
|
|
18
|
-
|
|
19
|
-
if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
|
|
20
|
-
throw new Error(`Unknown tokenizer engine: ${this._engine}`);
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
if (this._engine === 'tiktoken') {
|
|
24
|
-
this._preparedTiktoken = buildPreparedTiktoken(vocabData);
|
|
25
|
-
} else {
|
|
26
|
-
this._preparedSPM = buildPreparedSPM(vocabData);
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
encode(text) {
|
|
31
|
-
if (this._engine === 'tiktoken') return encodeTiktokenPrepared(text, this._preparedTiktoken);
|
|
32
|
-
return encodeSPMPrepared(text, this._preparedSPM);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
decode(ids) {
|
|
36
|
-
if (this._engine === 'tiktoken') return decodeTiktokenPrepared(ids, this._preparedTiktoken);
|
|
37
|
-
return decodeSPMPrepared(ids, this._preparedSPM);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
count(text) {
|
|
41
|
-
if (this._engine === 'tiktoken') return countTiktokenPrepared(text, this._preparedTiktoken);
|
|
42
|
-
return this.encode(text).length;
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Count tokens, stopping as soon as the count exceeds limit.
|
|
47
|
-
* More efficient than encode() for token limit checks on long text.
|
|
48
|
-
* @param {string} text
|
|
49
|
-
* @param {number} limit
|
|
50
|
-
* @returns {number}
|
|
51
|
-
*/
|
|
52
|
-
countUpTo(text, limit) {
|
|
53
|
-
if (this._engine === 'tiktoken') return countTiktokenUpToPrepared(text, this._preparedTiktoken, limit);
|
|
54
|
-
// SPM encodes the whole text as one unit — no clean early exit, just encode and count
|
|
55
|
-
return this.encode(text).length;
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
module.exports = { Tokenizer };
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const {
|
|
4
|
+
buildPreparedTiktoken,
|
|
5
|
+
encodeTiktokenPrepared,
|
|
6
|
+
decodeTiktokenPrepared,
|
|
7
|
+
countTiktokenPrepared,
|
|
8
|
+
countTiktokenUpToPrepared,
|
|
9
|
+
} = require('./bpe');
|
|
10
|
+
const { buildPreparedSPM, encodeSPMPrepared, decodeSPMPrepared } = require('./spm');
|
|
11
|
+
|
|
12
|
+
class Tokenizer {
|
|
13
|
+
constructor(vocabData) {
|
|
14
|
+
this._data = vocabData;
|
|
15
|
+
this._engine = vocabData.engine;
|
|
16
|
+
this._preparedTiktoken = null;
|
|
17
|
+
this._preparedSPM = null;
|
|
18
|
+
|
|
19
|
+
if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
|
|
20
|
+
throw new Error(`Unknown tokenizer engine: ${this._engine}`);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (this._engine === 'tiktoken') {
|
|
24
|
+
this._preparedTiktoken = buildPreparedTiktoken(vocabData);
|
|
25
|
+
} else {
|
|
26
|
+
this._preparedSPM = buildPreparedSPM(vocabData);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
encode(text) {
|
|
31
|
+
if (this._engine === 'tiktoken') return encodeTiktokenPrepared(text, this._preparedTiktoken);
|
|
32
|
+
return encodeSPMPrepared(text, this._preparedSPM);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
decode(ids) {
|
|
36
|
+
if (this._engine === 'tiktoken') return decodeTiktokenPrepared(ids, this._preparedTiktoken);
|
|
37
|
+
return decodeSPMPrepared(ids, this._preparedSPM);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
count(text) {
|
|
41
|
+
if (this._engine === 'tiktoken') return countTiktokenPrepared(text, this._preparedTiktoken);
|
|
42
|
+
return this.encode(text).length;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Count tokens, stopping as soon as the count exceeds limit.
|
|
47
|
+
* More efficient than encode() for token limit checks on long text.
|
|
48
|
+
* @param {string} text
|
|
49
|
+
* @param {number} limit
|
|
50
|
+
* @returns {number}
|
|
51
|
+
*/
|
|
52
|
+
countUpTo(text, limit) {
|
|
53
|
+
if (this._engine === 'tiktoken') return countTiktokenUpToPrepared(text, this._preparedTiktoken, limit);
|
|
54
|
+
// SPM encodes the whole text as one unit — no clean early exit, just encode and count
|
|
55
|
+
return this.encode(text).length;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
module.exports = { Tokenizer };
|
package/vocabs/anthropic.json.gz
CHANGED
|
Binary file
|