bpe-lite 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,13 +1,14 @@
1
1
  # bpe-lite
2
2
 
3
- Offline BPE tokenizer for OpenAI, Anthropic, and Gemini. Zero dependencies, no network calls at runtime. Works in any Node 18+ environment including Docker and edge runtimes.
3
+ Offline BPE tokenizer for OpenAI, Anthropic, and Gemini. Zero dependencies, no network calls at runtime. Works in any Node 18+ environment including Docker and edge runtimes. Ships CJS and ESM. TypeScript types included.
4
4
 
5
5
  ```js
6
6
  const { countTokens } = require('bpe-lite');
7
7
 
8
- countTokens('Hello, world!', 'openai') // → 4
9
- countTokens('Hello, world!', 'anthropic') // → 4
10
- countTokens('Hello, world!', 'gemini') // → 4
8
+ countTokens('Hello, world!', 'openai-o200k') // → 4 (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
9
+ countTokens('Hello, world!', 'openai') // → 4 (GPT-4, GPT-3.5)
10
+ countTokens('Hello, world!', 'anthropic') // → 4
11
+ countTokens('Hello, world!', 'gemini') // → 4
11
12
  ```
12
13
 
13
14
  ## Install
@@ -18,33 +19,71 @@ npm install bpe-lite
18
19
 
19
20
  ## Usage
20
21
 
22
+ Both CommonJS and ESM are supported:
23
+
24
+ ```js
25
+ // CommonJS
26
+ const { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini } = require('bpe-lite');
27
+
28
+ // ESM
29
+ import { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini } from 'bpe-lite';
30
+ ```
31
+
21
32
  ```js
22
- const { countTokens, encode, decode, openai, anthropic, gemini } = require('bpe-lite');
23
33
 
24
34
  // Count tokens
25
- countTokens('Your text here', 'openai'); // → number
35
+ countTokens('Your text here', 'openai-o200k'); // → number (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
36
+ countTokens('Your text here', 'openai'); // → number (GPT-4, GPT-3.5)
26
37
 
27
38
  // Encode / decode
28
- const ids = encode('Hello', 'openai'); // → [9906]
29
- decode(ids, 'openai'); // → 'Hello'
30
-
31
- // Tokenizer instances (lazy-loaded, cached per provider)
32
- const tok = openai();
33
- tok.encode('Hello'); // → [9906]
34
- tok.decode([9906]); // → 'Hello'
35
- tok.count('Hello, world!'); // → 4
39
+ const ids = encode('Hello', 'openai-o200k'); // → [13225]
40
+ decode(ids, 'openai-o200k'); // → 'Hello'
41
+
42
+ // Check token limit — short-circuits early on long text, more efficient than encode()
43
+ // Returns the token count if within the limit, false if exceeded
44
+ isWithinTokenLimit('Hello, world!', 10, 'openai-o200k'); // → 4
45
+ isWithinTokenLimit('Hello, world!', 3, 'openai-o200k'); // → false
46
+
47
+ // Tokenizer instances — lazy-loaded and cached per provider
48
+ const tok = openaiO200k();
49
+ tok.encode('Hello'); // → [13225]
50
+ tok.decode([13225]); // → 'Hello'
51
+ tok.count('Hello, world!'); // → 4
36
52
  ```
37
53
 
38
54
  ## Providers
39
55
 
40
- | Provider | Vocab | Tokens | Accuracy |
41
- |----------|-------|--------|----------|
42
- | `openai` | cl100k_base | 100,256 | Exact — vocab sourced directly from OpenAI's CDN (same source as tiktoken) |
43
- | `anthropic` | cl100k approximation | 100,256 | ~95% Claude 3+ tokenizer has not been publicly released |
44
- | `gemini` | Gemma 3 | 262,144 | ExactGemini uses the same tokenizer as Gemma 3 open-weights |
56
+ | Provider | Vocab | Tokens | Models | Accuracy |
57
+ |----------|-------|--------|--------|----------|
58
+ | `openai-o200k` | o200k_base | 199,998 | GPT-4o, o1, o3, o4, GPT-4.1, GPT-5 | Exact — vocab sourced directly from OpenAI's CDN |
59
+ | `openai` | cl100k_base | 100,256 | GPT-4, GPT-3.5 | Exact — vocab sourced directly from OpenAI's CDN |
60
+ | `anthropic` | cl100k approximation | 100,256 | Claude 3+ | ~95% — Anthropic has not released the Claude 3+ tokenizer |
61
+ | `gemini` | Gemma 3 | 262,144 | Gemini | Exact — Gemini uses the same tokenizer as Gemma 3 open-weights |
45
62
 
46
63
  Vocab files are bundled in the package — no network required at runtime or install time.
47
64
 
65
+ ## API
66
+
67
+ ### `countTokens(text, provider?)`
68
+
69
+ Returns the number of tokens in `text`. Default provider: `'openai'`.
70
+
71
+ ### `encode(text, provider?)`
72
+
73
+ Returns an array of token ids.
74
+
75
+ ### `decode(ids, provider?)`
76
+
77
+ Decodes an array of token ids back to a string.
78
+
79
+ ### `isWithinTokenLimit(text, limit, provider?)`
80
+
81
+ Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts — the tiktoken engine short-circuits as soon as the limit is crossed.
82
+
83
+ ### Tokenizer instances
84
+
85
+ `openai()`, `openaiO200k()`, `anthropic()`, `gemini()` each return a cached `Tokenizer` object with `.encode()`, `.decode()`, and `.count()` methods. Instances are created once per provider per process.
86
+
48
87
  ## Why not tiktoken?
49
88
 
50
89
  `tiktoken` is accurate for OpenAI but requires Rust/WASM native bindings, which can break in Docker containers, edge runtimes, and serverless environments. `bpe-lite` is pure JavaScript — it runs anywhere Node 18+ runs.
@@ -52,7 +91,7 @@ Vocab files are bundled in the package — no network required at runtime or ins
52
91
  ## Caveats
53
92
 
54
93
  - **Anthropic**: Anthropic has not released the Claude 3+ tokenizer. The cl100k approximation is accurate to ~95% for most text.
55
- - **Speed**: Pure JS is slower than tiktoken's native implementation. For token *counting* (not bulk processing) the difference is negligible.
94
+ - **Speed**: Pure JS is slower than tiktoken's native implementation. For token counting (not bulk processing) the difference is negligible.
56
95
  - **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
57
96
 
58
97
  ## License
package/index.d.ts ADDED
@@ -0,0 +1,42 @@
1
+ export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
2
+
3
+ export interface Tokenizer {
4
+ encode(text: string): number[];
5
+ decode(ids: number[]): string;
6
+ count(text: string): number;
7
+ countUpTo(text: string, limit: number): number;
8
+ }
9
+
10
+ /**
11
+ * Count the number of tokens in text for the given provider.
12
+ */
13
+ export function countTokens(text: string, provider?: Provider): number;
14
+
15
+ /**
16
+ * Encode text to token ids.
17
+ */
18
+ export function encode(text: string, provider?: Provider): number[];
19
+
20
+ /**
21
+ * Decode token ids back to text.
22
+ */
23
+ export function decode(ids: number[], provider?: Provider): string;
24
+
25
+ /**
26
+ * Check if text is within a token limit.
27
+ * Returns the token count if within the limit, or false if exceeded.
28
+ * More efficient than encode() for long texts since it short-circuits.
29
+ */
30
+ export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
31
+
32
+ /** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
33
+ export function openai(): Tokenizer;
34
+
35
+ /** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
36
+ export function openaiO200k(): Tokenizer;
37
+
38
+ /** Tokenizer instance for Anthropic (cl100k approximation, ~95% accurate). */
39
+ export function anthropic(): Tokenizer;
40
+
41
+ /** Tokenizer instance for Gemini (Gemma 3 vocab, exact). */
42
+ export function gemini(): Tokenizer;
package/package.json CHANGED
@@ -1,12 +1,21 @@
1
1
  {
2
2
  "name": "bpe-lite",
3
- "version": "0.1.1",
3
+ "version": "0.3.0",
4
4
  "description": "Offline BPE tokenizer for OpenAI, Anthropic, and Gemini — zero dependencies",
5
5
  "main": "src/index.js",
6
+ "types": "index.d.ts",
6
7
  "type": "commonjs",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./index.d.ts",
11
+ "import": "./src/index.mjs",
12
+ "require": "./src/index.js"
13
+ }
14
+ },
7
15
  "files": [
8
16
  "src/",
9
- "vocabs/"
17
+ "vocabs/",
18
+ "index.d.ts"
10
19
  ],
11
20
  "scripts": {
12
21
  "build": "node scripts/build-vocabs.js",
package/src/bpe.js CHANGED
@@ -194,4 +194,35 @@ function mergeB64(a, b) {
194
194
  return buf.toString('base64');
195
195
  }
196
196
 
197
- module.exports = { encodeTiktoken, decodeTiktoken };
197
+ /**
198
+ * Count tokens up to a limit, short-circuiting once exceeded.
199
+ * @param {string} text
200
+ * @param {Object} vocabData
201
+ * @param {number} limit
202
+ * @returns {number} token count (may exceed limit by up to one chunk)
203
+ */
204
+ function countTiktokenUpTo(text, vocabData, limit) {
205
+ if (!text) return 0;
206
+
207
+ const { vocab, specialTokens = {}, pattern } = vocabData;
208
+ const specials = Object.entries(specialTokens);
209
+ const pieces = splitOnSpecials(text, specials);
210
+
211
+ let count = 0;
212
+ for (const piece of pieces) {
213
+ if (piece.isSpecial) {
214
+ count++;
215
+ } else {
216
+ const chunks = pretokenize(piece.text, pattern);
217
+ for (const chunk of chunks) {
218
+ count += bpeEncode(chunk, vocab).length;
219
+ if (count > limit) return count;
220
+ }
221
+ }
222
+ if (count > limit) return count;
223
+ }
224
+
225
+ return count;
226
+ }
227
+
228
+ module.exports = { encodeTiktoken, decodeTiktoken, countTiktokenUpTo };
package/src/index.js CHANGED
@@ -55,20 +55,36 @@ function encode(text, provider = 'openai') {
55
55
  /**
56
56
  * Decode token ids back to text.
57
57
  * @param {number[]} ids
58
- * @param {'openai'|'anthropic'|'gemini'} provider
58
+ * @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
59
59
  * @returns {string}
60
60
  */
61
61
  function decode(ids, provider = 'openai') {
62
62
  return loadTokenizer(provider).decode(ids);
63
63
  }
64
64
 
65
- /** Get a Tokenizer instance for OpenAI (cl100k_base). */
65
+ /**
66
+ * Check if text is within a token limit without necessarily encoding the whole string.
67
+ * Returns false if the limit is exceeded, otherwise returns the token count.
68
+ * @param {string} text
69
+ * @param {number} limit
70
+ * @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
71
+ * @returns {number|false}
72
+ */
73
+ function isWithinTokenLimit(text, limit, provider = 'openai') {
74
+ const count = loadTokenizer(provider).countUpTo(text, limit);
75
+ return count <= limit ? count : false;
76
+ }
77
+
78
+ /** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
66
79
  function openai() { return loadTokenizer('openai'); }
67
80
 
81
+ /** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
82
+ function openaiO200k() { return loadTokenizer('openai-o200k'); }
83
+
68
84
  /** Get a Tokenizer instance for Anthropic (cl100k approximation). */
69
85
  function anthropic() { return loadTokenizer('anthropic'); }
70
86
 
71
87
  /** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
72
88
  function gemini() { return loadTokenizer('gemini'); }
73
89
 
74
- module.exports = { countTokens, encode, decode, openai, anthropic, gemini };
90
+ module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
package/src/index.mjs ADDED
@@ -0,0 +1,24 @@
1
+ import { createRequire } from 'module';
2
+
3
+ const require = createRequire(import.meta.url);
4
+ const {
5
+ countTokens,
6
+ encode,
7
+ decode,
8
+ isWithinTokenLimit,
9
+ openai,
10
+ openaiO200k,
11
+ anthropic,
12
+ gemini,
13
+ } = require('./index.js');
14
+
15
+ export {
16
+ countTokens,
17
+ encode,
18
+ decode,
19
+ isWithinTokenLimit,
20
+ openai,
21
+ openaiO200k,
22
+ anthropic,
23
+ gemini,
24
+ };
package/src/tokenizer.js CHANGED
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- const { encodeTiktoken, decodeTiktoken } = require('./bpe');
3
+ const { encodeTiktoken, decodeTiktoken, countTiktokenUpTo } = require('./bpe');
4
4
  const { encodeSPM, decodeSPM } = require('./spm');
5
5
 
6
6
  class Tokenizer {
@@ -26,6 +26,19 @@ class Tokenizer {
26
26
  count(text) {
27
27
  return this.encode(text).length;
28
28
  }
29
+
30
+ /**
31
+ * Count tokens, stopping as soon as the count exceeds limit.
32
+ * More efficient than encode() for token limit checks on long text.
33
+ * @param {string} text
34
+ * @param {number} limit
35
+ * @returns {number}
36
+ */
37
+ countUpTo(text, limit) {
38
+ if (this._engine === 'tiktoken') return countTiktokenUpTo(text, this._data, limit);
39
+ // SPM encodes the whole text as one unit — no clean early exit, just encode and count
40
+ return this.encode(text).length;
41
+ }
29
42
  }
30
43
 
31
44
  module.exports = { Tokenizer };
Binary file