bpe-lite 0.4.3 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -32
- package/index.d.ts +42 -42
- package/package.json +2 -1
- package/src/bpe.js +86 -18
- package/src/index.js +90 -90
- package/src/index.mjs +24 -24
- package/src/spm.js +123 -43
- package/src/tokenizer.js +59 -59
- package/vocabs/anthropic.json.gz +0 -0
package/README.md
CHANGED
|
@@ -30,7 +30,6 @@ import { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, a
|
|
|
30
30
|
```
|
|
31
31
|
|
|
32
32
|
```js
|
|
33
|
-
|
|
34
33
|
// Count tokens
|
|
35
34
|
countTokens('Your text here', 'openai-o200k'); // → number (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5)
|
|
36
35
|
countTokens('Your text here', 'openai'); // → number (GPT-4, GPT-3.5)
|
|
@@ -53,44 +52,92 @@ tok.count('Hello, world!'); // → 4
|
|
|
53
52
|
|
|
54
53
|
## Providers
|
|
55
54
|
|
|
56
|
-
| Provider | Vocab |
|
|
57
|
-
|
|
58
|
-
| `openai-o200k` | o200k_base
|
|
59
|
-
| `openai` | cl100k_base
|
|
60
|
-
| `anthropic` |
|
|
61
|
-
| `gemini` | Gemma 3
|
|
55
|
+
| Provider | Vocab | Models | Accuracy |
|
|
56
|
+
|----------|-------|--------|----------|
|
|
57
|
+
| `openai-o200k` | o200k_base (200k) | GPT-4o, o1, o3, o4, GPT-4.1, GPT-5 | Exact — vocab sourced from OpenAI |
|
|
58
|
+
| `openai` | cl100k_base (100k) | GPT-4, GPT-3.5 | Exact — vocab sourced from OpenAI |
|
|
59
|
+
| `anthropic` | Xenova/claude-tokenizer (65k BPE) | Claude | See accuracy section below |
|
|
60
|
+
| `gemini` | Gemma 3 SPM (262k) | Gemini | See accuracy section below |
|
|
62
61
|
|
|
63
62
|
Vocab files are bundled in the package — no network required at runtime or install time.
|
|
64
63
|
|
|
64
|
+
## Accuracy — Anthropic
|
|
65
|
+
|
|
66
|
+
Anthropic has not released the Claude tokenizer. bpe-lite uses [`Xenova/claude-tokenizer`](https://huggingface.co/Xenova/claude-tokenizer), a community reverse-engineering of the ~65k BPE vocabulary, with hand-tuned byte-level corrections applied on top.
|
|
67
|
+
|
|
68
|
+
### Benchmark methodology
|
|
69
|
+
|
|
70
|
+
Tested against `claude-haiku-4-5-20251001` via the Anthropic `count_tokens` API on a 120-sample stratified corpus across 12 categories. The corpus deliberately over-represents difficult content (Arabic, symbols, emoji, numbers) to expose systematic failures — overall numbers are lower than you would see on typical prose-only workloads by design.
|
|
71
|
+
|
|
72
|
+
### Overall results (120 samples, 114 eligible)
|
|
73
|
+
|
|
74
|
+
| Metric | bpe-lite | ai-tokenizer |
|
|
75
|
+
|--------|----------|--------------|
|
|
76
|
+
| Within 5% | 46.5% | 18.4% |
|
|
77
|
+
| Within 10% | 62.3% ±8.8% CI | 37.7% ±8.8% CI |
|
|
78
|
+
| Mean abs error | 9.4% | 16.0% |
|
|
79
|
+
| Median abs error | 5.7% | 13.6% |
|
|
80
|
+
| Max abs error | 42.9% | 82.6% |
|
|
81
|
+
|
|
82
|
+
### Per-category breakdown
|
|
83
|
+
|
|
84
|
+
| Category | Within 10% | Mean error | Notes |
|
|
85
|
+
|----------|-----------|------------|-------|
|
|
86
|
+
| `code-js` | 100% | 4.2% | |
|
|
87
|
+
| `english-prose` | 90% | 5.5% | |
|
|
88
|
+
| `code-python` | 90% | 4.8% | |
|
|
89
|
+
| `structured` | 90% | 3.6% | JSON, HTML, XML, Markdown, SQL |
|
|
90
|
+
| `numbers` | 80% | 7.3% | |
|
|
91
|
+
| `hex-binary` | 80% | 5.3% | |
|
|
92
|
+
| `urls` | 80% | 3.6% | |
|
|
93
|
+
| `cjk` | 40% | 8.8% | |
|
|
94
|
+
| `short` | 30% | 6.8% | |
|
|
95
|
+
| `emoji` | 20% | 17.7% | ZWJ sequences, flags, skin tones |
|
|
96
|
+
| `symbols` | 10% | 17.6% | Cross-byte merges unreplicable |
|
|
97
|
+
| `arabic` | 0% | 26.1% | Structural vocabulary gap — unfixable |
|
|
98
|
+
|
|
99
|
+
For prose, code, structured data, and URLs — the dominant content types in real-world prompts — bpe-lite is within 10% on 80–100% of samples. Arabic and symbol-cluster-heavy content cannot be accurately estimated without the actual Claude tokenizer.
|
|
100
|
+
|
|
101
|
+
### Why bpe-lite outperforms ai-tokenizer on Claude
|
|
102
|
+
|
|
103
|
+
ai-tokenizer's `claude` encoding uses `\p{N}+` (greedy, unlimited digit chunks). Current Claude models use `\p{N}{1,3}` (1–3 digits). This causes 20–43% errors on anything involving numbers — including code, hex, and data. bpe-lite uses the correct pattern.
|
|
104
|
+
|
|
105
|
+
ai-tokenizer also does not have a Gemini encoding: all Gemini models are mapped to OpenAI's `o200k_base` vocabulary with a fudge multiplier. This is wrong by construction — see below.
|
|
106
|
+
|
|
107
|
+
## Accuracy — Gemini
|
|
108
|
+
|
|
109
|
+
bpe-lite implements the full Gemma 3 SentencePiece BPE algorithm using the actual Gemini vocabulary. On a 25-sample test against the Gemini API, bpe-lite scored 100% exact (no failures found; 25 samples is a limited basis — treat this as a lower bound, not a guarantee across all content types).
|
|
110
|
+
|
|
111
|
+
ai-tokenizer does not implement Gemini natively. Inspecting their bundled source (`dist/index.js`), every Gemini model is defined as `"encoding": "o200k_base"` with a `"contentMultiplier": 1.08` fudge factor — it runs the OpenAI vocabulary through a multiplier rather than using Gemini's actual tokenizer. bpe-lite uses the actual Gemma 3 vocabulary and SentencePiece algorithm.
|
|
112
|
+
|
|
65
113
|
## Performance
|
|
66
114
|
|
|
67
|
-
Benchmarked on Node v24 (win32/x64).
|
|
115
|
+
Benchmarked on Node v24 (win32/x64). Run `node --expose-gc scripts/bench.js` locally for numbers on your hardware.
|
|
116
|
+
|
|
117
|
+
**Large text (~500 KB) — ops/s**
|
|
68
118
|
|
|
69
|
-
|
|
119
|
+
| impl | cl100k | Anthropic | Gemini | note |
|
|
120
|
+
|------|-------:|----------:|-------:|------|
|
|
121
|
+
| bpe-lite | 291 | 289 | 998 | |
|
|
122
|
+
| ai-tokenizer | 291 | 291 | 215 | Gemini column uses o200k — wrong algorithm |
|
|
123
|
+
| js-tiktoken | 30 | — | — | WASM overhead |
|
|
70
124
|
|
|
71
|
-
|
|
72
|
-
|------|------:|---------:|-----:|
|
|
73
|
-
| bpe-lite | **257** | **3.15M** | **13.6** |
|
|
74
|
-
| ai-tokenizer | 201 | 2.46M | 10.7 |
|
|
75
|
-
| js-tiktoken | 23 | 282k | 1.2 |
|
|
125
|
+
bpe-lite matches ai-tokenizer throughput for OpenAI and Anthropic large text. On Gemini, bpe-lite's SPM engine is ~4.6x faster — the ai-tokenizer column there is not a valid comparison since it uses a different algorithm.
|
|
76
126
|
|
|
77
|
-
**
|
|
127
|
+
**Small text cold vs warm**
|
|
78
128
|
|
|
79
|
-
|
|
80
|
-
|------|------:|---------:|-----:|
|
|
81
|
-
| bpe-lite | **257** | 3.15M | **13.6** |
|
|
82
|
-
| ai-tokenizer | 253 | **4.62M** | 13.4 |
|
|
129
|
+
bpe-lite maintains a per-instance chunk cache. For repeated text patterns (e.g. the same prompt template encoded thousands of times), cache hits eliminate BPE work entirely:
|
|
83
130
|
|
|
84
|
-
|
|
131
|
+
| scenario | bpe-lite | ai-tokenizer |
|
|
132
|
+
|----------|----------|--------------|
|
|
133
|
+
| Small text, cold (novel input) | ~352k ops/s | ~453k ops/s |
|
|
134
|
+
| Small text, warm (repeated input) | ~1.45M ops/s | ~453k ops/s |
|
|
85
135
|
|
|
86
|
-
|
|
87
|
-
|------|------:|---------:|-----:|------|
|
|
88
|
-
| bpe-lite | **3,800** | **6.23M** | **29.7** | actual Gemma3 SPM |
|
|
89
|
-
| ai-tokenizer | 1,220 | 2.01M | 9.6 | o200k BPE — different algorithm, different results |
|
|
136
|
+
For diverse, non-repeating inputs, ai-tokenizer is ~29% faster on very short strings. For any repeated-text workload, bpe-lite is ~3x faster.
|
|
90
137
|
|
|
91
|
-
|
|
138
|
+
**Initialization**
|
|
92
139
|
|
|
93
|
-
|
|
140
|
+
bpe-lite lazy-loads the gzipped vocab on first encode — one-time cost of ~235ms per provider per process. Negligible for any persistent process. Relevant only for cold serverless invocations that encode once and exit.
|
|
94
141
|
|
|
95
142
|
## API
|
|
96
143
|
|
|
@@ -108,7 +155,7 @@ Decodes an array of token ids back to a string.
|
|
|
108
155
|
|
|
109
156
|
### `isWithinTokenLimit(text, limit, provider?)`
|
|
110
157
|
|
|
111
|
-
Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts —
|
|
158
|
+
Returns the token count if `text` is within `limit` tokens, or `false` if exceeded. More efficient than `encode()` for long texts — short-circuits as soon as the limit is crossed.
|
|
112
159
|
|
|
113
160
|
### Tokenizer instances
|
|
114
161
|
|
|
@@ -118,11 +165,6 @@ Returns the token count if `text` is within `limit` tokens, or `false` if exceed
|
|
|
118
165
|
|
|
119
166
|
`tiktoken` is accurate for OpenAI but requires Rust/WASM native bindings, which can break in Docker containers, edge runtimes, and serverless environments. `bpe-lite` is pure JavaScript — it runs anywhere Node 18+ runs, with no native compilation step.
|
|
120
167
|
|
|
121
|
-
## Caveats
|
|
122
|
-
|
|
123
|
-
- **Anthropic**: Anthropic has not released the Claude 3+ tokenizer. The cl100k approximation is accurate to ~95% for most text.
|
|
124
|
-
- **Node version**: Requires Node 18+ for Unicode property escapes (`\p{L}`, `\p{N}`) in the pre-tokenization regex.
|
|
125
|
-
|
|
126
168
|
## License
|
|
127
169
|
|
|
128
170
|
MIT
|
package/index.d.ts
CHANGED
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
|
|
2
|
-
|
|
3
|
-
export interface Tokenizer {
|
|
4
|
-
encode(text: string): number[];
|
|
5
|
-
decode(ids: number[]): string;
|
|
6
|
-
count(text: string): number;
|
|
7
|
-
countUpTo(text: string, limit: number): number;
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Count the number of tokens in text for the given provider.
|
|
12
|
-
*/
|
|
13
|
-
export function countTokens(text: string, provider?: Provider): number;
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* Encode text to token ids.
|
|
17
|
-
*/
|
|
18
|
-
export function encode(text: string, provider?: Provider): number[];
|
|
19
|
-
|
|
20
|
-
/**
|
|
21
|
-
* Decode token ids back to text.
|
|
22
|
-
*/
|
|
23
|
-
export function decode(ids: number[], provider?: Provider): string;
|
|
24
|
-
|
|
25
|
-
/**
|
|
26
|
-
* Check if text is within a token limit.
|
|
27
|
-
* Returns the token count if within the limit, or false if exceeded.
|
|
28
|
-
* More efficient than encode() for long texts since it short-circuits.
|
|
29
|
-
*/
|
|
30
|
-
export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
|
|
31
|
-
|
|
32
|
-
/** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
|
|
33
|
-
export function openai(): Tokenizer;
|
|
34
|
-
|
|
35
|
-
/** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
36
|
-
export function openaiO200k(): Tokenizer;
|
|
37
|
-
|
|
38
|
-
/** Tokenizer instance for Anthropic (cl100k approximation, ~95% accurate). */
|
|
39
|
-
export function anthropic(): Tokenizer;
|
|
40
|
-
|
|
41
|
-
/** Tokenizer instance for Gemini (Gemma 3 vocab, exact). */
|
|
42
|
-
export function gemini(): Tokenizer;
|
|
1
|
+
export type Provider = 'openai' | 'openai-o200k' | 'anthropic' | 'gemini';
|
|
2
|
+
|
|
3
|
+
export interface Tokenizer {
|
|
4
|
+
encode(text: string): number[];
|
|
5
|
+
decode(ids: number[]): string;
|
|
6
|
+
count(text: string): number;
|
|
7
|
+
countUpTo(text: string, limit: number): number;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Count the number of tokens in text for the given provider.
|
|
12
|
+
*/
|
|
13
|
+
export function countTokens(text: string, provider?: Provider): number;
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Encode text to token ids.
|
|
17
|
+
*/
|
|
18
|
+
export function encode(text: string, provider?: Provider): number[];
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Decode token ids back to text.
|
|
22
|
+
*/
|
|
23
|
+
export function decode(ids: number[], provider?: Provider): string;
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Check if text is within a token limit.
|
|
27
|
+
* Returns the token count if within the limit, or false if exceeded.
|
|
28
|
+
* More efficient than encode() for long texts since it short-circuits.
|
|
29
|
+
*/
|
|
30
|
+
export function isWithinTokenLimit(text: string, limit: number, provider?: Provider): number | false;
|
|
31
|
+
|
|
32
|
+
/** Tokenizer instance for OpenAI cl100k_base (GPT-4, GPT-3.5). */
|
|
33
|
+
export function openai(): Tokenizer;
|
|
34
|
+
|
|
35
|
+
/** Tokenizer instance for OpenAI o200k_base (GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
36
|
+
export function openaiO200k(): Tokenizer;
|
|
37
|
+
|
|
38
|
+
/** Tokenizer instance for Anthropic (cl100k approximation, ~95% accurate). */
|
|
39
|
+
export function anthropic(): Tokenizer;
|
|
40
|
+
|
|
41
|
+
/** Tokenizer instance for Gemini (Gemma 3 vocab, exact). */
|
|
42
|
+
export function gemini(): Tokenizer;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bpe-lite",
|
|
3
|
-
"version": "0.4.3",
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "Offline BPE tokenizer for OpenAI, Anthropic, and Gemini — zero dependencies",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -36,6 +36,7 @@
|
|
|
36
36
|
"license": "MIT",
|
|
37
37
|
"devDependencies": {
|
|
38
38
|
"ai-tokenizer": "^1.0.6",
|
|
39
|
+
"gpt-tokenizer": "^3.4.0",
|
|
39
40
|
"js-tiktoken": "^1.0.21"
|
|
40
41
|
},
|
|
41
42
|
"repository": {
|
package/src/bpe.js
CHANGED
|
@@ -6,6 +6,10 @@
|
|
|
6
6
|
* Lower rank = higher priority in merges.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
+
// Fast check: does a pre-tokenized chunk contain any letter or digit?
|
|
10
|
+
// Non-word chunks (pure symbols, punctuation, spaces) are eligible for batching.
|
|
11
|
+
const WORD_CHAR_RE = /[\p{L}\p{N}]/u;
|
|
12
|
+
|
|
9
13
|
// Byte → single-byte "binary string" lookup (pre-built at module load)
|
|
10
14
|
const BYTE_STRS = (() => {
|
|
11
15
|
const out = new Array(256);
|
|
@@ -80,7 +84,8 @@ class MinHeap {
|
|
|
80
84
|
_siftUp(i) {
|
|
81
85
|
while (i > 0) {
|
|
82
86
|
const p = (i - 1) >> 1;
|
|
83
|
-
if (this.ranks[p]
|
|
87
|
+
if (this.ranks[p] < this.ranks[i]) break;
|
|
88
|
+
if (this.ranks[p] === this.ranks[i] && this.left[p] <= this.left[i]) break;
|
|
84
89
|
this._swap(i, p);
|
|
85
90
|
i = p;
|
|
86
91
|
}
|
|
@@ -93,8 +98,12 @@ class MinHeap {
|
|
|
93
98
|
if (l >= n) break;
|
|
94
99
|
const r = l + 1;
|
|
95
100
|
let m = l;
|
|
96
|
-
if (r < n
|
|
97
|
-
|
|
101
|
+
if (r < n) {
|
|
102
|
+
if (this.ranks[r] < this.ranks[l] ||
|
|
103
|
+
(this.ranks[r] === this.ranks[l] && this.left[r] < this.left[l])) m = r;
|
|
104
|
+
}
|
|
105
|
+
if (this.ranks[i] < this.ranks[m]) break;
|
|
106
|
+
if (this.ranks[i] === this.ranks[m] && this.left[i] <= this.left[m]) break;
|
|
98
107
|
this._swap(i, m);
|
|
99
108
|
i = m;
|
|
100
109
|
}
|
|
@@ -136,7 +145,7 @@ function pretokenize(text, compiled) {
|
|
|
136
145
|
}
|
|
137
146
|
|
|
138
147
|
function buildPreparedTiktoken(vocabData) {
|
|
139
|
-
const { vocab, specialTokens = {}, pattern } = vocabData;
|
|
148
|
+
const { vocab, specialTokens = {}, pattern, normalize } = vocabData;
|
|
140
149
|
|
|
141
150
|
const vocabBin = new Map();
|
|
142
151
|
let maxId = -1;
|
|
@@ -164,6 +173,10 @@ function buildPreparedTiktoken(vocabData) {
|
|
|
164
173
|
idToBytes,
|
|
165
174
|
specials,
|
|
166
175
|
patternCompiled: compilePretokenizer(pattern),
|
|
176
|
+
normalize: normalize || null,
|
|
177
|
+
// symbolBatch: merge consecutive non-word regex chunks before BPE so that
|
|
178
|
+
// cross-character byte merges can fire (matches Claude's no-regex byte BPE).
|
|
179
|
+
symbolBatch: !!vocabData.symbolBatch,
|
|
167
180
|
// opt A — per-instance chunk cache: chunk string → ids[]
|
|
168
181
|
cache: new Map(),
|
|
169
182
|
// opt B — per-instance grow-only scratch (reused across chunks)
|
|
@@ -302,6 +315,7 @@ function splitOnSpecials(text, specials) {
|
|
|
302
315
|
|
|
303
316
|
function encodeTiktokenPrepared(text, prepared) {
|
|
304
317
|
if (!text) return [];
|
|
318
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
305
319
|
|
|
306
320
|
const ids = [];
|
|
307
321
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
@@ -316,14 +330,41 @@ function encodeTiktokenPrepared(text, prepared) {
|
|
|
316
330
|
const re = patternCompiled.re;
|
|
317
331
|
re.lastIndex = 0;
|
|
318
332
|
let m;
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
333
|
+
if (prepared.symbolBatch) {
|
|
334
|
+
// Merge consecutive non-word chunks (symbols, punctuation, spaces) into one
|
|
335
|
+
// BPE input so cross-character byte merges can fire — matches Claude's behavior.
|
|
336
|
+
let symBuf = null;
|
|
337
|
+
while ((m = re.exec(t)) !== null) {
|
|
338
|
+
const chunk = m[0];
|
|
339
|
+
if (!WORD_CHAR_RE.test(chunk)) {
|
|
340
|
+
symBuf = symBuf === null ? chunk : symBuf + chunk;
|
|
341
|
+
} else {
|
|
342
|
+
if (symBuf !== null) {
|
|
343
|
+
let symIds = cache.get(symBuf);
|
|
344
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
345
|
+
for (let i = 0; i < symIds.length; i++) ids.push(symIds[i]);
|
|
346
|
+
symBuf = null;
|
|
347
|
+
}
|
|
348
|
+
let chunkIds = cache.get(chunk);
|
|
349
|
+
if (chunkIds === undefined) { chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch); cache.set(chunk, chunkIds); }
|
|
350
|
+
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
if (symBuf !== null) {
|
|
354
|
+
let symIds = cache.get(symBuf);
|
|
355
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
356
|
+
for (let i = 0; i < symIds.length; i++) ids.push(symIds[i]);
|
|
357
|
+
}
|
|
358
|
+
} else {
|
|
359
|
+
while ((m = re.exec(t)) !== null) {
|
|
360
|
+
const chunk = m[0];
|
|
361
|
+
let chunkIds = cache.get(chunk);
|
|
362
|
+
if (chunkIds === undefined) {
|
|
363
|
+
chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
|
|
364
|
+
cache.set(chunk, chunkIds);
|
|
365
|
+
}
|
|
366
|
+
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
325
367
|
}
|
|
326
|
-
for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
|
|
327
368
|
}
|
|
328
369
|
} else {
|
|
329
370
|
const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
|
|
@@ -355,6 +396,7 @@ function decodeTiktokenPrepared(ids, prepared) {
|
|
|
355
396
|
|
|
356
397
|
function countTiktokenPrepared(text, prepared) {
|
|
357
398
|
if (!text) return 0;
|
|
399
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
358
400
|
|
|
359
401
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
360
402
|
const pieces = splitOnSpecials(text, specials);
|
|
@@ -368,14 +410,39 @@ function countTiktokenPrepared(text, prepared) {
|
|
|
368
410
|
const re = patternCompiled.re;
|
|
369
411
|
re.lastIndex = 0;
|
|
370
412
|
let m;
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
413
|
+
if (prepared.symbolBatch) {
|
|
414
|
+
let symBuf = null;
|
|
415
|
+
while ((m = re.exec(t)) !== null) {
|
|
416
|
+
const chunk = m[0];
|
|
417
|
+
if (!WORD_CHAR_RE.test(chunk)) {
|
|
418
|
+
symBuf = symBuf === null ? chunk : symBuf + chunk;
|
|
419
|
+
} else {
|
|
420
|
+
if (symBuf !== null) {
|
|
421
|
+
let symIds = cache.get(symBuf);
|
|
422
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
423
|
+
count += symIds.length;
|
|
424
|
+
symBuf = null;
|
|
425
|
+
}
|
|
426
|
+
let chunkIds = cache.get(chunk);
|
|
427
|
+
if (chunkIds === undefined) { chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch); cache.set(chunk, chunkIds); }
|
|
428
|
+
count += chunkIds.length;
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
if (symBuf !== null) {
|
|
432
|
+
let symIds = cache.get(symBuf);
|
|
433
|
+
if (symIds === undefined) { symIds = bpeChunk(writeChunk(symBuf), vocabBin, scratch); cache.set(symBuf, symIds); }
|
|
434
|
+
count += symIds.length;
|
|
435
|
+
}
|
|
436
|
+
} else {
|
|
437
|
+
while ((m = re.exec(t)) !== null) {
|
|
438
|
+
const chunk = m[0];
|
|
439
|
+
let chunkIds = cache.get(chunk);
|
|
440
|
+
if (chunkIds === undefined) {
|
|
441
|
+
chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
|
|
442
|
+
cache.set(chunk, chunkIds);
|
|
443
|
+
}
|
|
444
|
+
count += chunkIds.length;
|
|
377
445
|
}
|
|
378
|
-
count += chunkIds.length;
|
|
379
446
|
}
|
|
380
447
|
} else {
|
|
381
448
|
const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
|
|
@@ -396,6 +463,7 @@ function countTiktokenPrepared(text, prepared) {
|
|
|
396
463
|
|
|
397
464
|
function countTiktokenUpToPrepared(text, prepared, limit) {
|
|
398
465
|
if (!text) return 0;
|
|
466
|
+
if (prepared.normalize) text = text.normalize(prepared.normalize);
|
|
399
467
|
|
|
400
468
|
const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
|
|
401
469
|
const pieces = splitOnSpecials(text, specials);
|
package/src/index.js
CHANGED
|
@@ -1,90 +1,90 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const path = require('path');
|
|
4
|
-
const fs = require('fs');
|
|
5
|
-
const zlib = require('zlib');
|
|
6
|
-
const { Tokenizer } = require('./tokenizer');
|
|
7
|
-
|
|
8
|
-
const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
|
|
9
|
-
|
|
10
|
-
// Lazy-loaded tokenizer instances (created once per provider per process)
|
|
11
|
-
const _cache = {};
|
|
12
|
-
|
|
13
|
-
function loadTokenizer(provider) {
|
|
14
|
-
if (_cache[provider]) return _cache[provider];
|
|
15
|
-
|
|
16
|
-
const gzPath = path.join(VOCABS_DIR, `${provider}.json.gz`);
|
|
17
|
-
const jsonPath = path.join(VOCABS_DIR, `${provider}.json`);
|
|
18
|
-
|
|
19
|
-
let data;
|
|
20
|
-
if (fs.existsSync(gzPath)) {
|
|
21
|
-
data = JSON.parse(zlib.gunzipSync(fs.readFileSync(gzPath)).toString('utf8'));
|
|
22
|
-
} else if (fs.existsSync(jsonPath)) {
|
|
23
|
-
data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
|
|
24
|
-
} else {
|
|
25
|
-
throw new Error(
|
|
26
|
-
`Vocab file not found for provider "${provider}".\n` +
|
|
27
|
-
'Run "node scripts/build-vocabs.js" to build vocab files.'
|
|
28
|
-
);
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
_cache[provider] = new Tokenizer(data);
|
|
32
|
-
return _cache[provider];
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* Count tokens in text for a given provider.
|
|
37
|
-
* @param {string} text
|
|
38
|
-
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
39
|
-
* @returns {number}
|
|
40
|
-
*/
|
|
41
|
-
function countTokens(text, provider = 'openai') {
|
|
42
|
-
return loadTokenizer(provider).count(text);
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Encode text to token ids.
|
|
47
|
-
* @param {string} text
|
|
48
|
-
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
49
|
-
* @returns {number[]}
|
|
50
|
-
*/
|
|
51
|
-
function encode(text, provider = 'openai') {
|
|
52
|
-
return loadTokenizer(provider).encode(text);
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
/**
|
|
56
|
-
* Decode token ids back to text.
|
|
57
|
-
* @param {number[]} ids
|
|
58
|
-
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
59
|
-
* @returns {string}
|
|
60
|
-
*/
|
|
61
|
-
function decode(ids, provider = 'openai') {
|
|
62
|
-
return loadTokenizer(provider).decode(ids);
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Check if text is within a token limit without necessarily encoding the whole string.
|
|
67
|
-
* Returns false if the limit is exceeded, otherwise returns the token count.
|
|
68
|
-
* @param {string} text
|
|
69
|
-
* @param {number} limit
|
|
70
|
-
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
71
|
-
* @returns {number|false}
|
|
72
|
-
*/
|
|
73
|
-
function isWithinTokenLimit(text, limit, provider = 'openai') {
|
|
74
|
-
const count = loadTokenizer(provider).countUpTo(text, limit);
|
|
75
|
-
return count <= limit ? count : false;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
|
|
79
|
-
function openai() { return loadTokenizer('openai'); }
|
|
80
|
-
|
|
81
|
-
/** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
82
|
-
function openaiO200k() { return loadTokenizer('openai-o200k'); }
|
|
83
|
-
|
|
84
|
-
/** Get a Tokenizer instance for Anthropic (cl100k approximation). */
|
|
85
|
-
function anthropic() { return loadTokenizer('anthropic'); }
|
|
86
|
-
|
|
87
|
-
/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
|
|
88
|
-
function gemini() { return loadTokenizer('gemini'); }
|
|
89
|
-
|
|
90
|
-
module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const zlib = require('zlib');
|
|
6
|
+
const { Tokenizer } = require('./tokenizer');
|
|
7
|
+
|
|
8
|
+
const VOCABS_DIR = path.join(__dirname, '..', 'vocabs');
|
|
9
|
+
|
|
10
|
+
// Lazy-loaded tokenizer instances (created once per provider per process)
|
|
11
|
+
const _cache = {};
|
|
12
|
+
|
|
13
|
+
function loadTokenizer(provider) {
|
|
14
|
+
if (_cache[provider]) return _cache[provider];
|
|
15
|
+
|
|
16
|
+
const gzPath = path.join(VOCABS_DIR, `${provider}.json.gz`);
|
|
17
|
+
const jsonPath = path.join(VOCABS_DIR, `${provider}.json`);
|
|
18
|
+
|
|
19
|
+
let data;
|
|
20
|
+
if (fs.existsSync(gzPath)) {
|
|
21
|
+
data = JSON.parse(zlib.gunzipSync(fs.readFileSync(gzPath)).toString('utf8'));
|
|
22
|
+
} else if (fs.existsSync(jsonPath)) {
|
|
23
|
+
data = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
|
|
24
|
+
} else {
|
|
25
|
+
throw new Error(
|
|
26
|
+
`Vocab file not found for provider "${provider}".\n` +
|
|
27
|
+
'Run "node scripts/build-vocabs.js" to build vocab files.'
|
|
28
|
+
);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
_cache[provider] = new Tokenizer(data);
|
|
32
|
+
return _cache[provider];
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Count tokens in text for a given provider.
|
|
37
|
+
* @param {string} text
|
|
38
|
+
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
39
|
+
* @returns {number}
|
|
40
|
+
*/
|
|
41
|
+
function countTokens(text, provider = 'openai') {
|
|
42
|
+
return loadTokenizer(provider).count(text);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Encode text to token ids.
|
|
47
|
+
* @param {string} text
|
|
48
|
+
* @param {'openai'|'anthropic'|'gemini'} provider
|
|
49
|
+
* @returns {number[]}
|
|
50
|
+
*/
|
|
51
|
+
function encode(text, provider = 'openai') {
|
|
52
|
+
return loadTokenizer(provider).encode(text);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Decode token ids back to text.
|
|
57
|
+
* @param {number[]} ids
|
|
58
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
59
|
+
* @returns {string}
|
|
60
|
+
*/
|
|
61
|
+
function decode(ids, provider = 'openai') {
|
|
62
|
+
return loadTokenizer(provider).decode(ids);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Check if text is within a token limit without necessarily encoding the whole string.
|
|
67
|
+
* Returns false if the limit is exceeded, otherwise returns the token count.
|
|
68
|
+
* @param {string} text
|
|
69
|
+
* @param {number} limit
|
|
70
|
+
* @param {'openai'|'openai-o200k'|'anthropic'|'gemini'} provider
|
|
71
|
+
* @returns {number|false}
|
|
72
|
+
*/
|
|
73
|
+
function isWithinTokenLimit(text, limit, provider = 'openai') {
|
|
74
|
+
const count = loadTokenizer(provider).countUpTo(text, limit);
|
|
75
|
+
return count <= limit ? count : false;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Get a Tokenizer instance for OpenAI (cl100k_base — GPT-4, GPT-3.5). */
|
|
79
|
+
function openai() { return loadTokenizer('openai'); }
|
|
80
|
+
|
|
81
|
+
/** Get a Tokenizer instance for OpenAI modern models (o200k_base — GPT-4o, o1, o3, o4, GPT-4.1, GPT-5). */
|
|
82
|
+
function openaiO200k() { return loadTokenizer('openai-o200k'); }
|
|
83
|
+
|
|
84
|
+
/** Get a Tokenizer instance for Anthropic (cl100k approximation). */
|
|
85
|
+
function anthropic() { return loadTokenizer('anthropic'); }
|
|
86
|
+
|
|
87
|
+
/** Get a Tokenizer instance for Gemini (Gemma3 vocab). */
|
|
88
|
+
function gemini() { return loadTokenizer('gemini'); }
|
|
89
|
+
|
|
90
|
+
module.exports = { countTokens, encode, decode, isWithinTokenLimit, openai, openaiO200k, anthropic, gemini };
|
package/src/index.mjs
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
import { createRequire } from 'module';
|
|
2
|
-
|
|
3
|
-
const require = createRequire(import.meta.url);
|
|
4
|
-
const {
|
|
5
|
-
countTokens,
|
|
6
|
-
encode,
|
|
7
|
-
decode,
|
|
8
|
-
isWithinTokenLimit,
|
|
9
|
-
openai,
|
|
10
|
-
openaiO200k,
|
|
11
|
-
anthropic,
|
|
12
|
-
gemini,
|
|
13
|
-
} = require('./index.js');
|
|
14
|
-
|
|
15
|
-
export {
|
|
16
|
-
countTokens,
|
|
17
|
-
encode,
|
|
18
|
-
decode,
|
|
19
|
-
isWithinTokenLimit,
|
|
20
|
-
openai,
|
|
21
|
-
openaiO200k,
|
|
22
|
-
anthropic,
|
|
23
|
-
gemini,
|
|
24
|
-
};
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
|
|
3
|
+
const require = createRequire(import.meta.url);
|
|
4
|
+
const {
|
|
5
|
+
countTokens,
|
|
6
|
+
encode,
|
|
7
|
+
decode,
|
|
8
|
+
isWithinTokenLimit,
|
|
9
|
+
openai,
|
|
10
|
+
openaiO200k,
|
|
11
|
+
anthropic,
|
|
12
|
+
gemini,
|
|
13
|
+
} = require('./index.js');
|
|
14
|
+
|
|
15
|
+
export {
|
|
16
|
+
countTokens,
|
|
17
|
+
encode,
|
|
18
|
+
decode,
|
|
19
|
+
isWithinTokenLimit,
|
|
20
|
+
openai,
|
|
21
|
+
openaiO200k,
|
|
22
|
+
anthropic,
|
|
23
|
+
gemini,
|
|
24
|
+
};
|
package/src/spm.js
CHANGED
|
@@ -23,12 +23,31 @@ function buildPreparedSPM(vocabData) {
|
|
|
23
23
|
idToStr.set(id, str);
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
+
// Seed tokens: multi-char vocab entries with no producing merge.
|
|
27
|
+
// In the original SentencePiece model these were user-defined symbols
|
|
28
|
+
// (HTML tags, special tokens, etc.) that the encoder recognizes atomically
|
|
29
|
+
// before BPE. We handle the common '<' case with a greedy longest-match
|
|
30
|
+
// lookup keyed by '<' — the only first-char with multiple seed tokens.
|
|
31
|
+
const producible = new Set();
|
|
32
|
+
for (const m of merges) {
|
|
33
|
+
const sp = m.indexOf(' ');
|
|
34
|
+
if (sp !== -1) producible.add(m.slice(0, sp) + m.slice(sp + 1));
|
|
35
|
+
}
|
|
36
|
+
const seedsByAngleBracket = [];
|
|
37
|
+
for (const [str, id] of Object.entries(vocab)) {
|
|
38
|
+
if (str.length > 1 && str[0] === '<' && !producible.has(str)) {
|
|
39
|
+
seedsByAngleBracket.push({ str, id, chars: [...str] });
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
seedsByAngleBracket.sort((a, b) => b.chars.length - a.chars.length);
|
|
43
|
+
|
|
26
44
|
return {
|
|
27
45
|
vocab,
|
|
28
46
|
merges,
|
|
29
47
|
mergeRank,
|
|
30
48
|
idToStr,
|
|
31
|
-
|
|
49
|
+
seedsByAngleBracket,
|
|
50
|
+
// opt A — segment-level cache: each word segment → ids[]
|
|
32
51
|
// Generalises across different inputs (same words reused across texts).
|
|
33
52
|
// Note: 1 of 514,906 Gemma merges crosses a ▁ boundary ("> ▁</"),
|
|
34
53
|
// making this negligibly imprecise for that HTML pattern.
|
|
@@ -66,24 +85,44 @@ function encodeSPM(text, vocabData) {
|
|
|
66
85
|
* Returns true if every segment was a cache hit; false on first miss.
|
|
67
86
|
* Isolated from encodeSegment so V8 can keep this function optimised
|
|
68
87
|
* even when encodeSPMPrepared is called with wildly different text lengths.
|
|
88
|
+
*
|
|
89
|
+
* Segmentation: each run of ▁ chars plus the following non-▁ word is one segment.
|
|
90
|
+
* Leading ▁s are included so BPE can naturally merge ▁▁→138, ▁▁▁→139, etc.
|
|
69
91
|
*/
|
|
70
92
|
function _scanFromCache(normalized, cache, result) {
|
|
71
|
-
let
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
93
|
+
let i = 0;
|
|
94
|
+
while (i < normalized.length) {
|
|
95
|
+
// Count leading ▁ chars
|
|
96
|
+
const segStart = i;
|
|
97
|
+
while (i < normalized.length && normalized[i] === SPACE_CHAR) i++;
|
|
98
|
+
const spaceCount = i - segStart;
|
|
99
|
+
|
|
100
|
+
if (i === normalized.length) {
|
|
101
|
+
// Trailing spaces only
|
|
102
|
+
if (spaceCount > 0) {
|
|
103
|
+
const seg = normalized.slice(segStart, i);
|
|
104
|
+
const segIds = cache.get(seg);
|
|
105
|
+
if (segIds === undefined) return false;
|
|
106
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
107
|
+
}
|
|
108
|
+
break;
|
|
79
109
|
}
|
|
110
|
+
|
|
111
|
+
// Collect word chars (non-▁)
|
|
112
|
+
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
113
|
+
|
|
114
|
+
// Emit all leading ▁s + word as one segment; BPE merges ▁▁, ▁▁▁, etc.
|
|
115
|
+
const seg = normalized.slice(segStart, i);
|
|
116
|
+
const segIds = cache.get(seg);
|
|
117
|
+
if (segIds === undefined) return false;
|
|
118
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
80
119
|
}
|
|
81
120
|
return true;
|
|
82
121
|
}
|
|
83
122
|
|
|
84
123
|
// Cold-path helper — kept separate so it is never inlined into the hot loop.
|
|
85
|
-
function _encodeAndCache(seg, vocab, mergeRank, scratch, cache) {
|
|
86
|
-
const ids = encodeSegment(seg, vocab, mergeRank, scratch);
|
|
124
|
+
function _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seeds) {
|
|
125
|
+
const ids = encodeSegment(seg, vocab, mergeRank, scratch, seeds);
|
|
87
126
|
cache.set(seg, ids);
|
|
88
127
|
return ids;
|
|
89
128
|
}
|
|
@@ -91,10 +130,10 @@ function _encodeAndCache(seg, vocab, mergeRank, scratch, cache) {
|
|
|
91
130
|
function encodeSPMPrepared(text, prepared) {
|
|
92
131
|
if (!text) return [];
|
|
93
132
|
|
|
94
|
-
const { vocab, mergeRank, scratch, cache } = prepared;
|
|
133
|
+
const { vocab, mergeRank, scratch, cache, seedsByAngleBracket } = prepared;
|
|
95
134
|
|
|
96
|
-
// Normalize: replace spaces with
|
|
97
|
-
const normalized =
|
|
135
|
+
// Normalize: replace spaces with ▁ (Gemma3: no ▁ prepend for first char)
|
|
136
|
+
const normalized = text.replace(/ /g, SPACE_CHAR);
|
|
98
137
|
|
|
99
138
|
// Fast path: serve every segment from the segment cache.
|
|
100
139
|
// After the first call, this path handles all subsequent calls for common text.
|
|
@@ -104,20 +143,34 @@ function encodeSPMPrepared(text, prepared) {
|
|
|
104
143
|
// Cold path: at least one segment is missing — encode everything from scratch.
|
|
105
144
|
// (Simpler to re-scan than to continue from the miss point.)
|
|
106
145
|
result.length = 0;
|
|
107
|
-
let
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
146
|
+
let i = 0;
|
|
147
|
+
while (i < normalized.length) {
|
|
148
|
+
const segStart = i;
|
|
149
|
+
while (i < normalized.length && normalized[i] === SPACE_CHAR) i++;
|
|
150
|
+
const spaceCount = i - segStart;
|
|
151
|
+
|
|
152
|
+
if (i === normalized.length) {
|
|
153
|
+
if (spaceCount > 0) {
|
|
154
|
+
const seg = normalized.slice(segStart, i);
|
|
155
|
+
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seedsByAngleBracket);
|
|
156
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
157
|
+
}
|
|
158
|
+
break;
|
|
114
159
|
}
|
|
160
|
+
|
|
161
|
+
while (i < normalized.length && normalized[i] !== SPACE_CHAR) i++;
|
|
162
|
+
|
|
163
|
+
const seg = normalized.slice(segStart, i);
|
|
164
|
+
const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache, seedsByAngleBracket);
|
|
165
|
+
for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
|
|
115
166
|
}
|
|
116
167
|
return result;
|
|
117
168
|
}
|
|
118
169
|
|
|
119
|
-
// Encode a single
|
|
120
|
-
|
|
170
|
+
// Encode a single segment using MinHeap BPE.
|
|
171
|
+
// seeds: sorted array of {str, id, chars[]} for '<'-prefixed vocab entries that have
|
|
172
|
+
// no producing merge (SentencePiece user-defined symbols); matched greedily before BPE.
|
|
173
|
+
function encodeSegment(seg, vocab, mergeRank, scratch, seeds) {
|
|
121
174
|
const chars = [...seg];
|
|
122
175
|
const n = chars.length;
|
|
123
176
|
|
|
@@ -125,32 +178,59 @@ function encodeSegment(seg, vocab, mergeRank, scratch) {
|
|
|
125
178
|
const { str, ids, prev, next, ver, alive, heap } = scratch;
|
|
126
179
|
heap.reset();
|
|
127
180
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
181
|
+
// Initialize nodes: one node per Unicode char, except seed tokens which collapse
|
|
182
|
+
// multiple chars into a single node (longest match at each '<' position).
|
|
183
|
+
let nNodes = 0;
|
|
184
|
+
let pos = 0;
|
|
185
|
+
while (pos < n) {
|
|
186
|
+
const node = nNodes++;
|
|
187
|
+
prev[node] = node - 1;
|
|
188
|
+
next[node] = node + 1;
|
|
189
|
+
ver[node] = 0;
|
|
190
|
+
alive[node] = 1;
|
|
191
|
+
|
|
192
|
+
let matched = false;
|
|
193
|
+
if (seeds && chars[pos] === '<') {
|
|
194
|
+
for (const seed of seeds) {
|
|
195
|
+
const sl = seed.chars.length;
|
|
196
|
+
if (pos + sl > n) continue;
|
|
197
|
+
let ok = true;
|
|
198
|
+
for (let k = 0; k < sl; k++) {
|
|
199
|
+
if (chars[pos + k] !== seed.chars[k]) { ok = false; break; }
|
|
200
|
+
}
|
|
201
|
+
if (ok) {
|
|
202
|
+
str[node] = seed.str;
|
|
203
|
+
ids[node] = seed.id;
|
|
204
|
+
pos += sl;
|
|
205
|
+
matched = true;
|
|
206
|
+
break;
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
}
|
|
134
210
|
|
|
135
|
-
if (
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
|
|
141
|
-
const byteKey = `<0x${hex}>`;
|
|
142
|
-
if (vocab[byteKey] !== undefined) {
|
|
143
|
-
str[i] = byteKey;
|
|
144
|
-
ids[i] = vocab[byteKey];
|
|
211
|
+
if (!matched) {
|
|
212
|
+
const c = chars[pos];
|
|
213
|
+
if (vocab[c] !== undefined) {
|
|
214
|
+
str[node] = c;
|
|
215
|
+
ids[node] = vocab[c];
|
|
145
216
|
} else {
|
|
146
|
-
|
|
147
|
-
|
|
217
|
+
const codePoint = c.codePointAt(0);
|
|
218
|
+
const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
|
|
219
|
+
const byteKey = `<0x${hex}>`;
|
|
220
|
+
if (vocab[byteKey] !== undefined) {
|
|
221
|
+
str[node] = byteKey;
|
|
222
|
+
ids[node] = vocab[byteKey];
|
|
223
|
+
} else {
|
|
224
|
+
str[node] = c;
|
|
225
|
+
ids[node] = vocab['<unk>'] ?? 0;
|
|
226
|
+
}
|
|
148
227
|
}
|
|
228
|
+
pos++;
|
|
149
229
|
}
|
|
150
230
|
}
|
|
151
|
-
next[
|
|
231
|
+
next[nNodes - 1] = -1;
|
|
152
232
|
|
|
153
|
-
for (let i = 0; i <
|
|
233
|
+
for (let i = 0; i < nNodes - 1; i++) {
|
|
154
234
|
const rank = mergeRank.get(`${str[i]} ${str[i + 1]}`);
|
|
155
235
|
if (rank !== undefined) heap.push(rank, i, i + 1, ver[i], ver[i + 1]);
|
|
156
236
|
}
|
package/src/tokenizer.js
CHANGED
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const {
|
|
4
|
-
buildPreparedTiktoken,
|
|
5
|
-
encodeTiktokenPrepared,
|
|
6
|
-
decodeTiktokenPrepared,
|
|
7
|
-
countTiktokenPrepared,
|
|
8
|
-
countTiktokenUpToPrepared,
|
|
9
|
-
} = require('./bpe');
|
|
10
|
-
const { buildPreparedSPM, encodeSPMPrepared, decodeSPMPrepared } = require('./spm');
|
|
11
|
-
|
|
12
|
-
class Tokenizer {
|
|
13
|
-
constructor(vocabData) {
|
|
14
|
-
this._data = vocabData;
|
|
15
|
-
this._engine = vocabData.engine;
|
|
16
|
-
this._preparedTiktoken = null;
|
|
17
|
-
this._preparedSPM = null;
|
|
18
|
-
|
|
19
|
-
if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
|
|
20
|
-
throw new Error(`Unknown tokenizer engine: ${this._engine}`);
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
if (this._engine === 'tiktoken') {
|
|
24
|
-
this._preparedTiktoken = buildPreparedTiktoken(vocabData);
|
|
25
|
-
} else {
|
|
26
|
-
this._preparedSPM = buildPreparedSPM(vocabData);
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
encode(text) {
|
|
31
|
-
if (this._engine === 'tiktoken') return encodeTiktokenPrepared(text, this._preparedTiktoken);
|
|
32
|
-
return encodeSPMPrepared(text, this._preparedSPM);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
decode(ids) {
|
|
36
|
-
if (this._engine === 'tiktoken') return decodeTiktokenPrepared(ids, this._preparedTiktoken);
|
|
37
|
-
return decodeSPMPrepared(ids, this._preparedSPM);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
count(text) {
|
|
41
|
-
if (this._engine === 'tiktoken') return countTiktokenPrepared(text, this._preparedTiktoken);
|
|
42
|
-
return this.encode(text).length;
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* Count tokens, stopping as soon as the count exceeds limit.
|
|
47
|
-
* More efficient than encode() for token limit checks on long text.
|
|
48
|
-
* @param {string} text
|
|
49
|
-
* @param {number} limit
|
|
50
|
-
* @returns {number}
|
|
51
|
-
*/
|
|
52
|
-
countUpTo(text, limit) {
|
|
53
|
-
if (this._engine === 'tiktoken') return countTiktokenUpToPrepared(text, this._preparedTiktoken, limit);
|
|
54
|
-
// SPM encodes the whole text as one unit — no clean early exit, just encode and count
|
|
55
|
-
return this.encode(text).length;
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
module.exports = { Tokenizer };
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const {
|
|
4
|
+
buildPreparedTiktoken,
|
|
5
|
+
encodeTiktokenPrepared,
|
|
6
|
+
decodeTiktokenPrepared,
|
|
7
|
+
countTiktokenPrepared,
|
|
8
|
+
countTiktokenUpToPrepared,
|
|
9
|
+
} = require('./bpe');
|
|
10
|
+
const { buildPreparedSPM, encodeSPMPrepared, decodeSPMPrepared } = require('./spm');
|
|
11
|
+
|
|
12
|
+
class Tokenizer {
|
|
13
|
+
constructor(vocabData) {
|
|
14
|
+
this._data = vocabData;
|
|
15
|
+
this._engine = vocabData.engine;
|
|
16
|
+
this._preparedTiktoken = null;
|
|
17
|
+
this._preparedSPM = null;
|
|
18
|
+
|
|
19
|
+
if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
|
|
20
|
+
throw new Error(`Unknown tokenizer engine: ${this._engine}`);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (this._engine === 'tiktoken') {
|
|
24
|
+
this._preparedTiktoken = buildPreparedTiktoken(vocabData);
|
|
25
|
+
} else {
|
|
26
|
+
this._preparedSPM = buildPreparedSPM(vocabData);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
encode(text) {
|
|
31
|
+
if (this._engine === 'tiktoken') return encodeTiktokenPrepared(text, this._preparedTiktoken);
|
|
32
|
+
return encodeSPMPrepared(text, this._preparedSPM);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
decode(ids) {
|
|
36
|
+
if (this._engine === 'tiktoken') return decodeTiktokenPrepared(ids, this._preparedTiktoken);
|
|
37
|
+
return decodeSPMPrepared(ids, this._preparedSPM);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
count(text) {
|
|
41
|
+
if (this._engine === 'tiktoken') return countTiktokenPrepared(text, this._preparedTiktoken);
|
|
42
|
+
return this.encode(text).length;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Count tokens, stopping as soon as the count exceeds limit.
|
|
47
|
+
* More efficient than encode() for token limit checks on long text.
|
|
48
|
+
* @param {string} text
|
|
49
|
+
* @param {number} limit
|
|
50
|
+
* @returns {number}
|
|
51
|
+
*/
|
|
52
|
+
countUpTo(text, limit) {
|
|
53
|
+
if (this._engine === 'tiktoken') return countTiktokenUpToPrepared(text, this._preparedTiktoken, limit);
|
|
54
|
+
// SPM encodes the whole text as one unit — no clean early exit, just encode and count
|
|
55
|
+
return this.encode(text).length;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
module.exports = { Tokenizer };
|
package/vocabs/anthropic.json.gz
CHANGED
|
Binary file
|