bpe-lite 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -20
- package/index.d.ts +42 -0
- package/package.json +11 -2
- package/src/bpe.js +32 -1
- package/src/index.js +30 -7
- package/src/index.mjs +24 -0
- package/src/tokenizer.js +14 -1
- package/vocabs/gemini.json.gz +0 -0
- package/vocabs/openai-o200k.json.gz +0 -0
- package/vocabs/gemini.json +0 -1
package/README.md
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
# bpe-lite
|
|
2
2
|
|
|
3
|
-
Offline BPE tokenizer for OpenAI, Anthropic, and Gemini. Zero dependencies, no network calls at runtime. Works in any Node 18+ environment including Docker and edge runtimes.
|
|
3
|
+
Offline BPE tokenizer for OpenAI, Anthropic, and Gemini. Zero dependencies, no network calls at runtime. Works in any Node 18+ environment including Docker and edge runtimes. Ships CJS and ESM. TypeScript types included.
|
|
4
4
|
|
|
5
5
|
```js
|
|
6
6
|
const { countTokens } = require('bpe-lite');
|
|
7
7
|
|
|
8
|
-
countTokens('Hello, world!', 'openai')
|
|
9
|
-
countTokens('Hello, world!', 'anthropic')
|
|
10
|
-
countTokens('Hello, world!', 'gemini')
|
|
8
|
+
countTokens('Hello, world!', 'openai-o200k') // → 4 (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
|
|
9
|
+
countTokens('Hello, world!', 'openai') // → 4 (GPT-4, GPT-3.5)
|
|
10
|
+
countTokens('Hello, world!', 'anthropic') // → 4
|
|
11
|
+
countTokens('Hello, world!', 'gemini') // → 4
|
|
11
12
|
```
|
|
12
13
|
|
|
13
14
|
## Install
|
|
@@ -18,33 +19,71 @@ npm install bpe-lite
|
|
|
18
19
|
|
|
19
20
|
## Usage
|
|
20
21
|
|
|
22
|
+
Both CommonJS and ESM are supported:
|
|
23
|
+
|
|
24
|
+
```js
|
|
25
|
+
// CommonJS
|
|
26
|
+
const { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini } = require('bpe-lite');
|
|
27
|
+
|
|
28
|
+
// ESM
|
|
29
|
+
import { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini } from 'bpe-lite';
|
|
30
|
+
```
|
|
31
|
+
|
|
21
32
|
```js
|
|
22
|
-
const { countTokens, encode, decode, openai, anthropic, gemini } = require('bpe-lite');
|
|
23
33
|
|
|
24
34
|
// Count tokens
|
|
25
|
-
countTokens('Your text here', 'openai');
|
|
35
|
+
countTokens('Your text here', 'openai-o200k'); // → number (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
|
|
36
|
+
countTokens('Your text here', 'openai'); // → number (GPT-4, GPT-3.5)
|
|
26
37
|
|
|
27
38
|
// Encode / decode
|
|
28
|
-
const ids = encode('Hello', 'openai');
|
|
29
|
-
decode(ids, 'openai');
|
|
30
|
-
|
|
31
|
-
//
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
39
|
+
const ids = encode('Hello', 'openai-o200k'); // → [13225]
|
|
40
|
+
decode(ids, 'openai-o200k'); // → 'Hello'
|
|
41
|
+
|
|
42
|
+
// Check token limit — short-circuits early on long text, more efficient than encode()
|
|
43
|
+
// Returns the token count if within the limit, false if exceeded
|
|
44
|
+
isWithinTokenLimit('Hello, world!', 10, 'openai-o200k'); // → 4
|
|
45
|
+
isWithinTokenLimit('Hello, world!', 3, 'openai-o200k'); // → false
|
|
46
|
+
|
|
47
|
+
// Tokenizer instances — lazy-loaded and cached per provider
|
|
48
|
+
const tok = openaiO200k();
|
|
49
|
+
tok.encode('Hello'); // → [13225]
|
|
50
|
+
tok.decode([13225]); // → 'Hello'
|
|
51
|
+
tok.count('Hello, world!'); // → 4
|
|
36
52
|
```
|
|
37
53
|
|
|
38
54
|
## Providers
|
|
39
55
|
|
|
40
|
-
| Provider | Vocab | Tokens | Accuracy |
|
|
41
|
-
|
|
42
|
-
| `openai` |
|
|
43
|
-
| `
|
|
44
|
-
| `
|
|
56
|
+
| Provider | Vocab | Tokens | Models | Accuracy |
|
|
57
|
+
|----------|-------|--------|--------|----------|
|
|
58
|
+
| `openai-o200k` | o200k_base | 199,998 | GPT-4o, o1, o3, o4, GPT-4.1, GPT-5 | Exact — vocab sourced directly from OpenAI's CDN |
|
|
59
|
+
| `openai` | cl100k_base | 100,256 | GPT-4, GPT-3.5 | Exact — vocab sourced directly from OpenAI's CDN |
|
|
60
|
+
| `anthropic` | cl100k approximation | 100,256 | Claude 3+ | ~95% — Anthropic has not released the Claude 3+ tokenizer |
|
|
61
|
+
| `gemini` | Gemma 3 | 262,144 | Gemini | Exact — Gemini uses the same tokenizer as Gemma 3 open-weights |
|
|
45
62
|
|
|
46
63
|
Vocab files are bundled in the package — no network required at runtime or install time.
|
|
47
64
|
|
|
65
|
+
## API
|
|
66
|
+
|
|
67
|
+
### `countTokens(text, provider?)`
|
|
68
|
+
|
|
69
|
+
Returns the number of tokens in `text`. Default provider: `'openai'`.
|
|
70
|
+
|
|
71
|
+
### `encode(text, provider?)`
|
|
72
|
+
|
|
73
|
+
Returns an array of token ids.
|
|
74
|
+
|
|
75
|
+
### `decode(ids, provider?)`
|
|
76
|
+
|
|
77
|
+
Decodes an array of token ids back to a string.
|
|
78
|
+
|
|
79
|
+
### `isWithinTokenLimit(text, limit, provider?)`
|
|
80
|
+
|
|
81
|
+
Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts — the tiktoken engine short-circuits as soon as the limit is crossed.
|
|
82
|
+
|
|
83
|
+
### Tokenizer instances
|
|
84
|
+
|
|
85
|
+
`openai()`, `openaiO200k()`, `anthropic()`, `gemini()` each return a cached `Tokenizer` object with `.encode()`, `.decode()`, and `.count()` methods. Instances are created once per provider per process.
|
|
86
|
+
|
|
48
87
|
## Why not tiktoken?
|
|
49
88
|
|
|
50
89
|
`tiktoken` is accurate for OpenAI but requires Rust/WASM native bindings, which can break in Docker containers, edge runtimes, and serverless environments. `bpe-lite` is pure JavaScript — it runs anywhere Node 18+ runs.
|
|
@@ -52,7 +91,7 @@ Vocab files are bundled in the package — no network required at runtime or ins
|
|
|
52
91
|
## Caveats
|
|
53
92
|
|
|
54
93
|
- **Anthropic**: Anthropic has not released the Claude 3+ tokenizer. The cl100k approximation is accurate to ~95% for most text.
|
|
55
|
-
- **Speed**: Pure JS is slower than tiktoken's native implementation. For token
|
|
94
|
+
- **Speed**: Pure JS is slower than tiktoken's native implementation. For token counting (not bulk processing) the difference is negligible.
|
|
56
95
|
- **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
|
|
57
96
|
|
|
58
97
|
## License
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
|
|
2
|
+
|
|
3
|
+
export interface Tokenizer {
|
|
4
|
+
encode(text: string): number[];
|
|
5
|
+
decode(ids: number[]): string;
|
|
6
|
+
count(text: string): number;
|
|
7
|
+
countUpTo(text: string, limit: number): number;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Count the number of tokens in text for the given provider.
|
|
12
|
+
*/
|
|
13
|
+
export function countTokens(text: string, provider?: Provider): number;
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Encode text to token ids.
|
|
17
|
+
*/
|
|
18
|
+
export function encode(text: string, provider?: Provider): number[];
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Decode token ids back to text.
|
|
22
|
+
*/
|
|
23
|
+
export function decode(ids: number[], provider?: Provider): string;
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Check if text is within a token limit.
|
|
27
|
+
* Returns the token count if within the limit, or false if exceeded.
|
|
28
|
+
* More efficient than encode() for long texts since it short-circuits.
|
|
29
|
+
*/
|
|
30
|
+
export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
|
|
31
|
+
|
|
32
|
+
/** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
|
|
33
|
+
export function openai(): Tokenizer;
|
|
34
|
+
|
|
35
|
+
/** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
36
|
+
export function openaiO200k(): Tokenizer;
|
|
37
|
+
|
|
38
|
+
/** Tokenizer instance for Anthropic (cl100k approximation, ~95% accurate). */
|
|
39
|
+
export function anthropic(): Tokenizer;
|
|
40
|
+
|
|
41
|
+
/** Tokenizer instance for Gemini (Gemma 3 vocab, exact). */
|
|
42
|
+
export function gemini(): Tokenizer;
|
package/package.json
CHANGED
|
@@ -1,12 +1,21 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bpe-lite",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Offline BPE tokenizer for OpenAI, Anthropic, and Gemini — zero dependencies",
|
|
5
5
|
"main": "src/index.js",
|
|
6
|
+
"types": "index.d.ts",
|
|
6
7
|
"type": "commonjs",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./index.d.ts",
|
|
11
|
+
"import": "./src/index.mjs",
|
|
12
|
+
"require": "./src/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
7
15
|
"files": [
|
|
8
16
|
"src/",
|
|
9
|
-
"vocabs/"
|
|
17
|
+
"vocabs/",
|
|
18
|
+
"index.d.ts"
|
|
10
19
|
],
|
|
11
20
|
"scripts": {
|
|
12
21
|
"build": "node scripts/build-vocabs.js",
|
package/src/bpe.js
CHANGED
|
@@ -194,4 +194,35 @@ function mergeB64(a, b) {
|
|
|
194
194
|
return buf.toString('base64');
|
|
195
195
|
}
|
|
196
196
|
|
|
197
|
-
|
|
197
|
+
/**
|
|
198
|
+
* Count tokens up to a limit, short-circuiting once exceeded.
|
|
199
|
+
* @param {string} text
|
|
200
|
+
* @param {Object} vocabData
|
|
201
|
+
* @param {number} limit
|
|
202
|
+
* @returns {number} token count (may exceed limit by up to one chunk)
|
|
203
|
+
*/
|
|
204
|
+
function countTiktokenUpTo(text, vocabData, limit) {
|
|
205
|
+
if (!text) return 0;
|
|
206
|
+
|
|
207
|
+
const { vocab, specialTokens = {}, pattern } = vocabData;
|
|
208
|
+
const specials = Object.entries(specialTokens);
|
|
209
|
+
const pieces = splitOnSpecials(text, specials);
|
|
210
|
+
|
|
211
|
+
let count = 0;
|
|
212
|
+
for (const piece of pieces) {
|
|
213
|
+
if (piece.isSpecial) {
|
|
214
|
+
count++;
|
|
215
|
+
} else {
|
|
216
|
+
const chunks = pretokenize(piece.text, pattern);
|
|
217
|
+
for (const chunk of chunks) {
|
|
218
|
+
count += bpeEncode(chunk, vocab).length;
|
|
219
|
+
if (count > limit) return count;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
if (count > limit) return count;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
return count;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
module.exports = { encodeTiktoken, decodeTiktoken, countTiktokenUpTo };
|
package/src/index.js
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
const path = require('path');
|
|
4
4
|
const fs = require('fs');
|
|
5
|
+
const zlib = require('zlib');
|
|
5
6
|
const { Tokenizer } = require('./tokenizer');
|
|
6
7
|
|
|
7
8
|
const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
|
|
@@ -12,15 +13,21 @@ const _cache = {};
|
|
|
12
13
|
function loadTokenizer(provider) {
|
|
13
14
|
if (_cache[provider]) return _cache[provider];
|
|
14
15
|
|
|
15
|
-
const
|
|
16
|
-
|
|
16
|
+
const gzPath = path.join(VOCABS_DIR, `${provider}.json.gz`);
|
|
17
|
+
const jsonPath = path.join(VOCABS_DIR, `${provider}.json`);
|
|
18
|
+
|
|
19
|
+
let data;
|
|
20
|
+
if (fs.existsSync(gzPath)) {
|
|
21
|
+
data = JSON.parse(zlib.gunzipSync(fs.readFileSync(gzPath)).toString('utf8'));
|
|
22
|
+
} else if (fs.existsSync(jsonPath)) {
|
|
23
|
+
data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
|
|
24
|
+
} else {
|
|
17
25
|
throw new Error(
|
|
18
|
-
`Vocab file not found for provider "${provider}"
|
|
26
|
+
`Vocab file not found for provider "${provider}".\n` +
|
|
19
27
|
'Run "node scripts/build-vocabs.js" to build vocab files.'
|
|
20
28
|
);
|
|
21
29
|
}
|
|
22
30
|
|
|
23
|
-
const data = JSON.parse(fs.readFileSync(filePath, 'utf8'));
|
|
24
31
|
_cache[provider] = new Tokenizer(data);
|
|
25
32
|
return _cache[provider];
|
|
26
33
|
}
|
|
@@ -48,20 +55,36 @@ function encode(text, provider = 'openai') {
|
|
|
48
55
|
/**
|
|
49
56
|
* Decode token ids back to text.
|
|
50
57
|
* @param {number[]} ids
|
|
51
|
-
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
58
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
52
59
|
* @returns {string}
|
|
53
60
|
*/
|
|
54
61
|
function decode(ids, provider = 'openai') {
|
|
55
62
|
return loadTokenizer(provider).decode(ids);
|
|
56
63
|
}
|
|
57
64
|
|
|
58
|
-
/**
|
|
65
|
+
/**
|
|
66
|
+
* Check if text is within a token limit without necessarily encoding the whole string.
|
|
67
|
+
* Returns false if the limit is exceeded, otherwise returns the token count.
|
|
68
|
+
* @param {string} text
|
|
69
|
+
* @param {number} limit
|
|
70
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
71
|
+
* @returns {number|false}
|
|
72
|
+
*/
|
|
73
|
+
function isWithinTokenLimit(text, limit, provider = 'openai') {
|
|
74
|
+
const count = loadTokenizer(provider).countUpTo(text, limit);
|
|
75
|
+
return count <= limit ? count : false;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
|
|
59
79
|
function openai() { return loadTokenizer('openai'); }
|
|
60
80
|
|
|
81
|
+
/** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
82
|
+
function openaiO200k() { return loadTokenizer('openai-o200k'); }
|
|
83
|
+
|
|
61
84
|
/** Get a Tokenizer instance for Anthropic (cl100k approximation). */
|
|
62
85
|
function anthropic() { return loadTokenizer('anthropic'); }
|
|
63
86
|
|
|
64
87
|
/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
|
|
65
88
|
function gemini() { return loadTokenizer('gemini'); }
|
|
66
89
|
|
|
67
|
-
module.exports = { countTokens, encode, decode, openai, anthropic, gemini };
|
|
90
|
+
module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
|
package/src/index.mjs
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
|
|
3
|
+
const require = createRequire(import.meta.url);
|
|
4
|
+
const {
|
|
5
|
+
countTokens,
|
|
6
|
+
encode,
|
|
7
|
+
decode,
|
|
8
|
+
isWithinTokenLimit,
|
|
9
|
+
openai,
|
|
10
|
+
openaiO200k,
|
|
11
|
+
anthropic,
|
|
12
|
+
gemini,
|
|
13
|
+
} = require('./index.js');
|
|
14
|
+
|
|
15
|
+
export {
|
|
16
|
+
countTokens,
|
|
17
|
+
encode,
|
|
18
|
+
decode,
|
|
19
|
+
isWithinTokenLimit,
|
|
20
|
+
openai,
|
|
21
|
+
openaiO200k,
|
|
22
|
+
anthropic,
|
|
23
|
+
gemini,
|
|
24
|
+
};
|
package/src/tokenizer.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
const { encodeTiktoken, decodeTiktoken } = require('./bpe');
|
|
3
|
+
const { encodeTiktoken, decodeTiktoken, countTiktokenUpTo } = require('./bpe');
|
|
4
4
|
const { encodeSPM, decodeSPM } = require('./spm');
|
|
5
5
|
|
|
6
6
|
class Tokenizer {
|
|
@@ -26,6 +26,19 @@ class Tokenizer {
|
|
|
26
26
|
count(text) {
|
|
27
27
|
return this.encode(text).length;
|
|
28
28
|
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Count tokens, stopping as soon as the count exceeds limit.
|
|
32
|
+
* More efficient than encode() for token limit checks on long text.
|
|
33
|
+
* @param {string} text
|
|
34
|
+
* @param {number} limit
|
|
35
|
+
* @returns {number}
|
|
36
|
+
*/
|
|
37
|
+
countUpTo(text, limit) {
|
|
38
|
+
if (this._engine === 'tiktoken') return countTiktokenUpTo(text, this._data, limit);
|
|
39
|
+
// SPM encodes the whole text as one unit — no clean early exit, just encode and count
|
|
40
|
+
return this.encode(text).length;
|
|
41
|
+
}
|
|
29
42
|
}
|
|
30
43
|
|
|
31
44
|
module.exports = { Tokenizer };
|
|
Binary file
|
|
Binary file
|