bpe-lite 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # bpe-lite
2
+
3
+ Offline BPE tokenizer for OpenAI, Anthropic, and Gemini. Zero dependencies, no network calls at runtime. Works in any Node 18+ environment including Docker and edge runtimes.
4
+
5
+ ```js
6
+ const { countTokens } = require('bpe-lite');
7
+
8
+ countTokens('Hello, world!', 'openai') // → 4
9
+ countTokens('Hello, world!', 'anthropic') // → 4
10
+ countTokens('Hello, world!', 'gemini') // → 4
11
+ ```
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ npm install bpe-lite
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ ```js
22
+ const { countTokens, encode, decode, openai, anthropic, gemini } = require('bpe-lite');
23
+
24
+ // Count tokens
25
+ countTokens('Your text here', 'openai'); // → number
26
+
27
+ // Encode / decode
28
+ const ids = encode('Hello', 'openai'); // → [9906]
29
+ decode(ids, 'openai'); // → 'Hello'
30
+
31
+ // Tokenizer instances (lazy-loaded, cached per provider)
32
+ const tok = openai();
33
+ tok.encode('Hello'); // → [9906]
34
+ tok.decode([9906]); // → 'Hello'
35
+ tok.count('Hello, world!'); // → 4
36
+ ```
37
+
38
+ ## Providers
39
+
40
+ | Provider | Vocab | Tokens | Accuracy |
41
+ |----------|-------|--------|----------|
42
+ | `openai` | cl100k_base | 100,256 | Exact — vocab sourced directly from OpenAI's CDN (same source as tiktoken) |
43
+ | `anthropic` | cl100k approximation | 100,256 | ~95% — Claude 3+ tokenizer has not been publicly released |
44
+ | `gemini` | Gemma 3 | 262,144 | Exact — Gemini uses the same tokenizer as Gemma 3 open-weights |
45
+
46
+ Vocab files are bundled in the package — no network required at runtime or install time.
47
+
48
+ ## Why not tiktoken?
49
+
50
+ `tiktoken` is accurate for OpenAI but requires Rust/WASM native bindings, which can break in Docker containers, edge runtimes, and serverless environments. `bpe-lite` is pure JavaScript — it runs anywhere Node 18+ runs.
51
+
52
+ ## Caveats
53
+
54
+ - **Anthropic**: Anthropic has not released the Claude 3+ tokenizer. The cl100k approximation is accurate to ~95% for most text.
55
+ - **Speed**: Pure JS is slower than tiktoken's native implementation. For token *counting* (not bulk processing) the difference is negligible.
56
+ - **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
57
+
58
+ ## License
59
+
60
+ MIT
package/package.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "bpe-lite",
3
+ "version": "0.1.0",
4
+ "description": "Offline BPE tokenizer for OpenAI, Anthropic, and Gemini — zero dependencies",
5
+ "main": "src/index.js",
6
+ "type": "commonjs",
7
+ "files": [
8
+ "src/",
9
+ "vocabs/"
10
+ ],
11
+ "scripts": {
12
+ "build": "node scripts/build-vocabs.js",
13
+ "test": "node tests/run-tests.js"
14
+ },
15
+ "keywords": ["tokenizer", "bpe", "openai", "anthropic", "gemini", "tokens", "llm"],
16
+ "license": "MIT"
17
+ }
package/src/bpe.js ADDED
@@ -0,0 +1,197 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * tiktoken-style BPE encoder.
5
+ * Vocab keys are base64-encoded byte sequences, values are ranks (merge priority).
6
+ * Lower rank = higher priority in merges.
7
+ */
8
+
9
// Byte → base64 token lookup, pre-built once at module load.
// BYTE_TOKENS[i] is the base64 encoding of the single byte i — the same key
// format tiktoken-style vocab files use for raw bytes.
const BYTE_TOKENS = Array.from({ length: 256 }, (_, byte) =>
  Buffer.from([byte]).toString('base64')
);
17
+
18
/**
 * Encode text using tiktoken-style BPE.
 * Special tokens are carved out first; everything between them goes through
 * regex pre-tokenization followed by byte-level BPE.
 * @param {string} text
 * @param {Object} vocabData — { engine, pattern, vocab, specialTokens }
 * @returns {number[]} token ids
 */
function encodeTiktoken(text, vocabData) {
  if (!text) return [];

  const { vocab, specialTokens = {}, pattern } = vocabData;
  const specials = Object.entries(specialTokens);

  const out = [];
  for (const piece of splitOnSpecials(text, specials)) {
    if (piece.isSpecial) {
      out.push(piece.id);
      continue;
    }
    // Regex pre-tokenization, then BPE-encode each chunk independently.
    for (const chunk of pretokenize(piece.text, pattern)) {
      for (const id of bpeEncode(chunk, vocab)) out.push(id);
    }
  }

  return out;
}
55
+
56
// id → bytes lookup tables, cached per vocabData object so repeated decode()
// calls don't rebuild a ~100k-entry table every time. WeakMap keeps the cache
// from pinning vocab data that is no longer referenced elsewhere.
const _decodeTables = new WeakMap();

// Build (or fetch the cached) Map of token id → Buffer for a vocab.
function _decodeTableFor(vocabData) {
  let table = _decodeTables.get(vocabData);
  if (table) return table;

  const { vocab, specialTokens = {} } = vocabData;
  table = new Map();
  for (const [b64, rank] of Object.entries(vocab)) {
    table.set(rank, Buffer.from(b64, 'base64'));
  }
  for (const [str, id] of Object.entries(specialTokens)) {
    table.set(id, Buffer.from(str, 'utf8'));
  }

  _decodeTables.set(vocabData, table);
  return table;
}

/**
 * Decode token ids back to string.
 * Unknown ids are skipped (best-effort decode, matching tiktoken behavior).
 * @param {number[]} ids
 * @param {Object} vocabData — { engine, pattern, vocab, specialTokens }
 * @returns {string}
 */
function decodeTiktoken(ids, vocabData) {
  if (!ids || ids.length === 0) return '';

  const table = _decodeTableFor(vocabData);

  const bufs = [];
  for (const id of ids) {
    const bytes = table.get(id);
    if (bytes) bufs.push(bytes);
  }

  return Buffer.concat(bufs).toString('utf8');
}
84
+
85
+ // ─── Internal helpers ─────────────────────────────────────────────────────────
86
+
87
/**
 * Split `text` into an ordered list of pieces, carving out special-token
 * occurrences: [{ text, isSpecial: false } | { isSpecial: true, id }].
 *
 * The EARLIEST occurrence of any special token is always taken first so that
 * pieces come out in document order. (Taking the first special that matches
 * anywhere — the previous behavior — could classify a special token occurring
 * earlier in the text as plain text and BPE-encode it.) Ties at the same
 * position go to the longest special token.
 */
function splitOnSpecials(text, specials) {
  if (specials.length === 0) return [{ text, isSpecial: false }];

  // Longest-first so that at equal positions the longer special wins.
  const sorted = specials.slice().sort((a, b) => b[0].length - a[0].length);

  const result = [];
  let remaining = text;

  while (remaining.length > 0) {
    // Scan every special and keep the one with the smallest index.
    let bestIdx = -1;
    let bestStr = null;
    let bestId = -1;
    for (const [str, id] of sorted) {
      const idx = remaining.indexOf(str);
      if (idx !== -1 && (bestIdx === -1 || idx < bestIdx)) {
        bestIdx = idx;
        bestStr = str;
        bestId = id;
      }
    }

    if (bestIdx === -1) {
      result.push({ text: remaining, isSpecial: false });
      break;
    }

    if (bestIdx > 0) result.push({ text: remaining.slice(0, bestIdx), isSpecial: false });
    result.push({ isSpecial: true, id: bestId });
    remaining = remaining.slice(bestIdx + bestStr.length);
  }

  return result;
}
116
+
117
/**
 * Apply the vocab's pre-tokenization regex, splitting `text` into chunks
 * that are BPE-encoded independently.
 *
 * The cl100k_base pattern uses the inline-flag group `(?i:...)`, which JS
 * RegExp does not support. It is rewritten to a plain group and the `i` flag
 * is applied to the whole pattern instead — safe for the tiktoken patterns,
 * whose only case-sensitive literals live inside the `(?i:)` group. The `u`
 * flag is required for `\p{L}` / `\p{N}` property escapes (Node 18+).
 *
 * Fixes two previous bugs: flags 'guy' made matching sticky, silently
 * truncating output at the first unmatched position, and the 'gi' fallback
 * lacked 'u', turning `\p{L}` into a literal.
 */
function pretokenize(text, patternStr) {
  if (!text) return [];
  if (!patternStr) return [text];

  const regexStr = patternStr.replace(/\(\?i:/g, '(?:');

  let re;
  try {
    re = new RegExp(regexStr, 'giu');
  } catch {
    // Pattern not expressible as a JS regex — fall back to a whitespace split.
    return text.match(/\S+|\s+/g) || [text];
  }

  return text.match(re) || [text];
}
143
+
144
/**
 * BPE-encode a single pre-tokenized chunk.
 * @param {string} chunk — raw string chunk
 * @param {Object} vocab — { base64token: rank }; rank doubles as the token id
 * @returns {number[]}
 */
function bpeEncode(chunk, vocab) {
  const bytes = Buffer.from(chunk, 'utf8');
  if (bytes.length === 0) return [];

  // Seed with one token per UTF-8 byte. A byte missing from the vocab gets
  // rank Infinity (should not happen with a byte-level vocab).
  const parts = Array.from(bytes, (byte) => {
    const b64 = BYTE_TOKENS[byte];
    const rank = vocab[b64];
    return { b64, rank: rank === undefined ? Infinity : rank };
  });

  // Greedy merge: repeatedly collapse the adjacent pair whose merged token
  // has the lowest rank, until no adjacent pair exists in the vocab.
  for (;;) {
    let bestRank = Infinity;
    let bestIdx = -1;
    let bestB64 = null;

    for (let i = 0; i + 1 < parts.length; i++) {
      const candidate = mergeB64(parts[i].b64, parts[i + 1].b64);
      const rank = vocab[candidate];
      if (rank !== undefined && rank < bestRank) {
        bestRank = rank;
        bestIdx = i;
        bestB64 = candidate;
      }
    }

    if (bestIdx === -1) break; // no merge possible

    parts.splice(bestIdx, 2, { b64: bestB64, rank: bestRank });
  }

  return parts.map((p) => p.rank);
}
190
+
191
// Concatenate two base64-encoded byte sequences: decode both, join the raw
// bytes, and re-encode the combination as base64.
function mergeB64(a, b) {
  const left = Buffer.from(a, 'base64');
  const right = Buffer.from(b, 'base64');
  return Buffer.concat([left, right]).toString('base64');
}
196
+
197
+ module.exports = { encodeTiktoken, decodeTiktoken };
package/src/index.js ADDED
@@ -0,0 +1,67 @@
1
+ 'use strict';
2
+
3
+ const path = require('path');
4
+ const fs = require('fs');
5
+ const { Tokenizer } = require('./tokenizer');
6
+
7
+ const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
8
+
9
+ // Lazy-loaded tokenizer instances (created once per provider per process)
10
+ const _cache = {};
11
+
12
/**
 * Load (or return the process-cached) Tokenizer for `provider`.
 * The vocab JSON is read synchronously on first use and cached.
 * @param {'openai'|'anthropic'|'gemini'} provider
 * @returns {Tokenizer}
 * @throws {Error} if the vocab file is missing or unreadable
 */
function loadTokenizer(provider) {
  // hasOwnProperty guard: a plain truthy `_cache[provider]` lookup would hit
  // Object.prototype members for inputs like "constructor" and return a
  // function instead of a Tokenizer.
  if (Object.prototype.hasOwnProperty.call(_cache, provider)) {
    return _cache[provider];
  }

  const filePath = path.join(VOCABS_DIR, `${provider}.json`);

  let raw;
  try {
    // Read directly instead of existsSync + readFileSync: avoids the
    // check-then-use race and an extra stat call.
    raw = fs.readFileSync(filePath, 'utf8');
  } catch (err) {
    if (err && err.code === 'ENOENT') {
      throw new Error(
        `Vocab file not found for provider "${provider}": ${filePath}\n` +
        'Run "node scripts/build-vocabs.js" to build vocab files.'
      );
    }
    throw err;
  }

  const tokenizer = new Tokenizer(JSON.parse(raw));
  _cache[provider] = tokenizer;
  return tokenizer;
}
27
+
28
/**
 * Count tokens in text for a given provider.
 * @param {string} text
 * @param {'openai'|'anthropic'|'gemini'} provider
 * @returns {number}
 */
function countTokens(text, provider = 'openai') {
  const tokenizer = loadTokenizer(provider);
  return tokenizer.count(text);
}
37
+
38
/**
 * Encode text to token ids.
 * @param {string} text
 * @param {'openai'|'anthropic'|'gemini'} provider
 * @returns {number[]}
 */
function encode(text, provider = 'openai') {
  const tokenizer = loadTokenizer(provider);
  return tokenizer.encode(text);
}
47
+
48
/**
 * Decode token ids back to text.
 * @param {number[]} ids
 * @param {'openai'|'anthropic'|'gemini'} provider
 * @returns {string}
 */
function decode(ids, provider = 'openai') {
  const tokenizer = loadTokenizer(provider);
  return tokenizer.decode(ids);
}
57
+
58
// Per-provider convenience accessors; each returns the process-wide cached
// Tokenizer instance for that provider (lazy-loaded on first call).

/** Get a Tokenizer instance for OpenAI (cl100k_base). */
function openai() {
  return loadTokenizer('openai');
}

/** Get a Tokenizer instance for Anthropic (cl100k approximation). */
function anthropic() {
  return loadTokenizer('anthropic');
}

/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
function gemini() {
  return loadTokenizer('gemini');
}

module.exports = { countTokens, encode, decode, openai, anthropic, gemini };
package/src/spm.js ADDED
@@ -0,0 +1,110 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SentencePiece BPE encoder for Gemini/Gemma3.
5
+ * Vocab keys are token strings (▁ for space), values are token ids.
6
+ * Merges are ordered list of "token_a token_b" strings — applied by rank (index = priority).
7
+ */
8
+
9
const SPACE_CHAR = '\u2581'; // ▁ — SentencePiece's visible-space marker

/**
 * Encode text using SentencePiece BPE.
 *
 * Characters absent from the vocab fall back to their UTF-8 BYTES as <0xNN>
 * byte tokens — one token per byte, so a multi-byte character like 'é'
 * (U+00E9 → 0xC3 0xA9) produces two byte tokens. (The previous code used the
 * code point as a single <0xNN>, which is wrong for any char above U+007F.)
 *
 * @param {string} text
 * @param {Object} vocabData — { engine, vocab, merges }
 * @returns {number[]}
 */
function encodeSPM(text, vocabData) {
  if (!text) return [];

  const { vocab, merges } = vocabData;

  // Merge rank map: "a b" → rank index (lower = higher priority).
  const mergeRank = new Map();
  for (let i = 0; i < merges.length; i++) {
    mergeRank.set(merges[i], i);
  }

  // SentencePiece normalization: spaces become ▁ and a leading ▁ is added.
  const normalized = SPACE_CHAR + text.replace(/ /g, SPACE_CHAR);

  // Seed tokens: one per Unicode character when the character is in the
  // vocab, otherwise its UTF-8 bytes as <0xNN> tokens; <unk> as a last resort.
  const tokens = [];
  for (const c of normalized) {
    if (vocab[c] !== undefined) {
      tokens.push({ str: c, id: vocab[c] });
      continue;
    }

    const byteTokens = [];
    let allKnown = true;
    for (const byte of Buffer.from(c, 'utf8')) {
      const key = `<0x${byte.toString(16).toUpperCase().padStart(2, '0')}>`;
      if (vocab[key] === undefined) {
        allKnown = false;
        break;
      }
      byteTokens.push({ str: key, id: vocab[key] });
    }

    if (allKnown) {
      tokens.push(...byteTokens);
    } else {
      // No byte fallback available for this character — emit <unk>.
      tokens.push({ str: c, id: vocab['<unk>'] ?? 0 });
    }
  }

  // Greedy BPE: repeatedly merge the adjacent pair with the lowest rank.
  while (tokens.length >= 2) {
    let bestRank = Infinity;
    let bestIdx = -1;

    for (let i = 0; i < tokens.length - 1; i++) {
      const key = `${tokens[i].str} ${tokens[i + 1].str}`;
      const rank = mergeRank.get(key);
      if (rank !== undefined && rank < bestRank) {
        bestRank = rank;
        bestIdx = i;
      }
    }

    if (bestIdx === -1) break;

    const merged = tokens[bestIdx].str + tokens[bestIdx + 1].str;
    const mergedId = vocab[merged] ?? vocab['<unk>'] ?? 0;
    tokens.splice(bestIdx, 2, { str: merged, id: mergedId });
  }

  return tokens.map((t) => t.id);
}
76
+
77
/**
 * Decode token ids back to string.
 *
 * Byte-fallback tokens (<0xNN>) contribute raw bytes that are accumulated and
 * decoded as UTF-8 at the end, so a multi-byte character split across several
 * byte tokens round-trips correctly. (The previous String.fromCharCode-per-
 * byte approach mangled any multi-byte UTF-8 sequence, e.g. 0xC3 0xA9 became
 * 'Ã©' instead of 'é'.)
 *
 * @param {number[]} ids
 * @param {Object} vocabData — { engine, vocab, merges }
 * @returns {string}
 */
function decodeSPM(ids, vocabData) {
  if (!ids || ids.length === 0) return '';

  const { vocab } = vocabData;

  // Build id → token-string map. Unknown ids decode to ''.
  const idToStr = new Map();
  for (const [str, id] of Object.entries(vocab)) {
    idToStr.set(id, str);
  }

  const byteRe = /^<0x([0-9A-Fa-f]{2})>$/;
  const bufs = [];
  for (const id of ids) {
    const str = idToStr.get(id) ?? '';
    const m = byteRe.exec(str);
    if (m) {
      bufs.push(Buffer.from([parseInt(m[1], 16)]));
    } else {
      bufs.push(Buffer.from(str, 'utf8'));
    }
  }
  const result = Buffer.concat(bufs).toString('utf8');

  // ▁ (U+2581) marks spaces; the leading one was added during encoding.
  return result.replaceAll('\u2581', ' ').replace(/^ /, '');
}
109
+
110
+ module.exports = { encodeSPM, decodeSPM };
@@ -0,0 +1,31 @@
1
+ 'use strict';
2
+
3
+ const { encodeTiktoken, decodeTiktoken } = require('./bpe');
4
+ const { encodeSPM, decodeSPM } = require('./spm');
5
+
6
// Thin dispatcher over the two supported encoder engines. The vocab JSON's
// `engine` field selects between tiktoken-style byte BPE and SentencePiece.
class Tokenizer {
  /**
   * @param {Object} vocabData — parsed vocab JSON; must carry
   *   `engine: 'tiktoken' | 'spm'` plus the engine-specific fields.
   * @throws {Error} on an unrecognized engine value
   */
  constructor(vocabData) {
    this._data = vocabData;
    this._engine = vocabData.engine;

    if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
      throw new Error(`Unknown tokenizer engine: ${this._engine}`);
    }
  }

  /** Encode text to token ids. */
  encode(text) {
    return this._engine === 'tiktoken'
      ? encodeTiktoken(text, this._data)
      : encodeSPM(text, this._data);
  }

  /** Decode token ids back to text. */
  decode(ids) {
    return this._engine === 'tiktoken'
      ? decodeTiktoken(ids, this._data)
      : decodeSPM(ids, this._data);
  }

  /** Number of tokens in `text`. */
  count(text) {
    return this.encode(text).length;
  }
}
30
+
31
+ module.exports = { Tokenizer };