@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model to Encoding Mappings
|
|
3
|
+
*
|
|
4
|
+
* EXACT mappings from tiktoken-rs for model name to encoding.
|
|
5
|
+
* Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs
|
|
6
|
+
* Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Model prefix to tokenizer mapping
|
|
10
|
+
* Checked in order, so more specific prefixes should come first
|
|
11
|
+
*
|
|
12
|
+
* From tiktoken-rs MODEL_PREFIX_TO_TOKENIZER
|
|
13
|
+
*/
|
|
14
|
+
const MODEL_PREFIX_TO_TOKENIZER = [
|
|
15
|
+
// Reasoning models
|
|
16
|
+
["o1-", "o200k_base"],
|
|
17
|
+
["o3-", "o200k_base"],
|
|
18
|
+
["o4-", "o200k_base"],
|
|
19
|
+
// Chat models
|
|
20
|
+
["gpt-5-", "o200k_base"],
|
|
21
|
+
["gpt-4.5-", "o200k_base"],
|
|
22
|
+
["gpt-4.1-", "o200k_base"],
|
|
23
|
+
["chatgpt-4o-", "o200k_base"],
|
|
24
|
+
["gpt-4o-", "o200k_base"],
|
|
25
|
+
["gpt-4-", "cl100k_base"],
|
|
26
|
+
["gpt-3.5-turbo-", "cl100k_base"],
|
|
27
|
+
["gpt-35-turbo-", "cl100k_base"], // Azure deployment name
|
|
28
|
+
["gpt-oss-", "o200k_harmony"],
|
|
29
|
+
// Fine-tuned models
|
|
30
|
+
["ft:gpt-4o", "o200k_base"],
|
|
31
|
+
["ft:gpt-4", "cl100k_base"],
|
|
32
|
+
["ft:gpt-3.5-turbo", "cl100k_base"],
|
|
33
|
+
["ft:davinci-002", "cl100k_base"],
|
|
34
|
+
["ft:babbage-002", "cl100k_base"],
|
|
35
|
+
];
|
|
36
|
+
/**
 * Exact model name → tokenizer lookup.
 *
 * The map is filled from consecutive runs of models that share a tokenizer.
 * Runs are kept in the same sequence as the original flat list, so the map's
 * insertion order is unchanged. No model name appears twice.
 *
 * Per the original header this mirrors tiktoken-rs MODEL_TO_TOKENIZER.
 */
const MODEL_TO_TOKENIZER = new Map();
for (const { tokenizer, models } of [
    {
        // Reasoning and current chat models
        tokenizer: "o200k_base",
        models: ["o1", "o3", "o4", "gpt-5", "gpt-4.1", "chatgpt-4o-latest", "gpt-4o"],
    },
    {
        // Older chat models ("gpt-35-turbo" is the Azure deployment name),
        // base models and embeddings
        tokenizer: "cl100k_base",
        models: [
            "gpt-4",
            "gpt-3.5-turbo",
            "gpt-3.5",
            "gpt-35-turbo",
            "davinci-002",
            "babbage-002",
            "text-embedding-ada-002",
            "text-embedding-3-small",
            "text-embedding-3-large",
        ],
    },
    {
        // DEPRECATED MODELS - Text
        tokenizer: "p50k_base",
        models: ["text-davinci-003", "text-davinci-002"],
    },
    {
        // DEPRECATED MODELS - Text (continued)
        tokenizer: "r50k_base",
        models: [
            "text-davinci-001",
            "text-curie-001",
            "text-babbage-001",
            "text-ada-001",
            "davinci",
            "curie",
            "babbage",
            "ada",
        ],
    },
    {
        // DEPRECATED MODELS - Code
        tokenizer: "p50k_base",
        models: [
            "code-davinci-002",
            "code-davinci-001",
            "code-cushman-002",
            "code-cushman-001",
            "davinci-codex",
            "cushman-codex",
        ],
    },
    {
        // DEPRECATED MODELS - Edit
        tokenizer: "p50k_edit",
        models: ["text-davinci-edit-001", "code-davinci-edit-001"],
    },
    {
        // DEPRECATED MODELS - Old Embeddings
        tokenizer: "r50k_base",
        models: [
            "text-similarity-davinci-001",
            "text-similarity-curie-001",
            "text-similarity-babbage-001",
            "text-similarity-ada-001",
            "text-search-davinci-doc-001",
            "text-search-curie-doc-001",
            "text-search-babbage-doc-001",
            "text-search-ada-doc-001",
            "code-search-babbage-code-001",
            "code-search-ada-code-001",
        ],
    },
    {
        // Open source
        tokenizer: "gpt2",
        models: ["gpt2", "gpt-2"],
    },
]) {
    for (const model of models) {
        MODEL_TO_TOKENIZER.set(model, tokenizer);
    }
}
|
|
98
|
+
/**
 * Resolve the tokenizer/encoding name for a model.
 *
 * An exact entry in MODEL_TO_TOKENIZER takes precedence. Otherwise the
 * first entry of MODEL_PREFIX_TO_TOKENIZER whose prefix the name starts with
 * decides the result. Per the original header this follows the lookup order
 * of tiktoken-rs get_tokenizer().
 *
 * @param modelName - The model name (a string; `startsWith` is called on it)
 * @returns The tokenizer name, or undefined when neither table matches
 */
export function getTokenizerForModel(modelName) {
    const direct = MODEL_TO_TOKENIZER.get(modelName);
    if (direct) {
        return direct;
    }
    // Table order is significant: the earliest matching prefix wins.
    const prefixEntry = MODEL_PREFIX_TO_TOKENIZER.find((entry) => modelName.startsWith(entry[0]));
    return prefixEntry ? prefixEntry[1] : undefined;
}
|
|
120
|
+
/**
 * Ordered prefix rules used by getContextSize.
 *
 * Rules are evaluated top to bottom and the first rule containing a prefix
 * that the model name starts with supplies the size, so specific families
 * must appear before the generic prefix that would otherwise swallow them
 * (everything "gpt-4…" before plain "gpt-4").
 *
 * The rules derive from tiktoken-rs model.rs with two kinds of change:
 *  - "gpt-4.5", "gpt-4-turbo" (previously only "gpt-4-turbo-") and
 *    "gpt-4-vision" are matched explicitly. They used to fall through to the
 *    generic "gpt-4" rule and report 8192, contradicting the 128_000 listed
 *    for those models in EXACT_CONTEXT_SIZES.
 *  - The separate "gpt-3.5-turbo-0125/-1106/-16k" checks are folded into
 *    "gpt-3.5-turbo", which returned the same 16_385 for all of them.
 */
const CONTEXT_SIZE_PREFIX_RULES = [
    { prefixes: ["gpt-5"], size: 400_000 },
    { prefixes: ["gpt-oss"], size: 131_072 },
    { prefixes: ["o1", "o3", "o4"], size: 200_000 },
    { prefixes: ["gpt-4.1"], size: 1_047_576 },
    { prefixes: ["gpt-4.5"], size: 128_000 },
    { prefixes: ["gpt-4o"], size: 128_000 },
    { prefixes: ["gpt-4-turbo", "gpt-4-vision", "gpt-4-0125", "gpt-4-1106"], size: 128_000 },
    { prefixes: ["gpt-4-32k"], size: 32_768 },
    { prefixes: ["gpt-4"], size: 8192 },
    { prefixes: ["gpt-3.5-turbo"], size: 16_385 },
    { prefixes: ["text-davinci-002", "text-davinci-003"], size: 4097 },
    { prefixes: ["ada", "babbage", "curie"], size: 2049 },
    { prefixes: ["code-cushman-001"], size: 2048 },
    { prefixes: ["code-davinci-002"], size: 8001 },
    { prefixes: ["davinci"], size: 2049 },
    { prefixes: ["text-ada-001", "text-babbage-001", "text-curie-001"], size: 2049 },
    { prefixes: ["text-embedding-ada-002"], size: 8192 },
];
/**
 * Context window size (in tokens) for a model, resolved by name prefix.
 *
 * Fine-tuned names ("ft:<base>:<suffix…>") are reduced to their base model
 * (the text between "ft:" and the next ":") and resolved recursively.
 * Names that match no rule get the default of 4096.
 *
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
 *
 * @param model - The model name (a string)
 * @returns The context size for the first matching prefix rule, or 4096
 */
export function getContextSize(model) {
    // Handle fine-tuned models
    if (model.startsWith("ft:")) {
        const rest = model.slice(3);
        const base = rest.split(":")[0] ?? rest;
        return getContextSize(base);
    }
    const rule = CONTEXT_SIZE_PREFIX_RULES.find(({ prefixes }) => prefixes.some((prefix) => model.startsWith(prefix)));
    // Default
    return rule ? rule.size : 4096;
}
|
|
206
|
+
/**
 * Context window sizes keyed by exact model name, including dated snapshots.
 *
 * getExactContextSize consults this table before falling back to prefix
 * matching. The original header describes the values as coming from the
 * tiktoken-rs model.rs get_context_size match statement.
 *
 * The object is assembled inside an immediately-invoked function so that the
 * recurring sizes can be named once without adding module-level bindings.
 * The keys and their order are the same as in the original literal.
 */
export const EXACT_CONTEXT_SIZES = (() => {
    const ctx4k = 4096;
    const ctx8k = 8192;
    const ctx16k = 16385;
    const ctx32k = 32768;
    const ctx128k = 128000;
    const ctxGptOss = 131072;
    const ctx200k = 200000;
    const ctx400k = 400000;
    const ctx1m = 1047576;
    return {
        // o-series reasoning models
        "o1": ctx200k,
        "o1-2024-12-17": ctx200k,
        "o1-mini": ctx128k,
        "o1-mini-2024-09-12": ctx128k,
        "o1-preview": ctx128k,
        "o1-preview-2024-09-12": ctx128k,
        "o3": ctx200k,
        "o3-mini": ctx200k,
        "o3-mini-2025-01-31": ctx200k,
        "o4-mini": ctx200k,
        "o4-mini-2025-04-16": ctx200k,
        // GPT-5 series
        "gpt-5": ctx400k,
        "gpt-5-2025-03-22": ctx400k,
        "gpt-5-mini": ctx200k,
        "gpt-5-mini-2025-03-22": ctx200k,
        "gpt-5-turbo": ctx400k,
        "gpt-5-turbo-2025-06-09": ctx400k,
        "gpt-5-nano": ctx200k,
        "gpt-5-nano-2025-09-14": ctx200k,
        // GPT-4.5 preview
        "gpt-4.5-preview": ctx128k,
        "gpt-4.5-preview-2025-02-27": ctx128k,
        // GPT-4.1 series (1M context)
        "gpt-4.1": ctx1m,
        "gpt-4.1-2025-04-14": ctx1m,
        "gpt-4.1-mini": ctx1m,
        "gpt-4.1-mini-2025-04-14": ctx1m,
        "gpt-4.1-nano": ctx1m,
        "gpt-4.1-nano-2025-04-14": ctx1m,
        // GPT-4o series
        "chatgpt-4o-latest": ctx128k,
        "gpt-4o": ctx128k,
        "gpt-4o-2024-05-13": ctx128k,
        "gpt-4o-2024-08-06": ctx128k,
        "gpt-4o-2024-11-20": ctx128k,
        "gpt-4o-audio-preview": ctx128k,
        "gpt-4o-audio-preview-2024-10-01": ctx128k,
        "gpt-4o-audio-preview-2024-12-17": ctx128k,
        "gpt-4o-mini": ctx128k,
        "gpt-4o-mini-2024-07-18": ctx128k,
        "gpt-4o-mini-audio-preview": ctx128k,
        "gpt-4o-mini-audio-preview-2024-12-17": ctx128k,
        "gpt-4o-realtime-preview": ctx128k,
        "gpt-4o-realtime-preview-2024-10-01": ctx128k,
        "gpt-4o-realtime-preview-2024-12-17": ctx128k,
        // GPT-4 turbo series
        "gpt-4-turbo": ctx128k,
        "gpt-4-turbo-2024-04-09": ctx128k,
        "gpt-4-turbo-preview": ctx128k,
        "gpt-4-0125-preview": ctx128k,
        "gpt-4-1106-preview": ctx128k,
        "gpt-4-vision-preview": ctx128k,
        "gpt-4-1106-vision-preview": ctx128k,
        // GPT-4 base
        "gpt-4": ctx8k,
        "gpt-4-0613": ctx8k,
        "gpt-4-0314": ctx8k,
        "gpt-4-32k": ctx32k,
        "gpt-4-32k-0613": ctx32k,
        "gpt-4-32k-0314": ctx32k,
        // GPT-3.5 series
        "gpt-3.5-turbo": ctx16k,
        "gpt-3.5-turbo-0125": ctx16k,
        "gpt-3.5-turbo-1106": ctx16k,
        "gpt-3.5-turbo-0613": ctx16k,
        "gpt-3.5-turbo-16k": ctx16k,
        "gpt-3.5-turbo-16k-0613": ctx16k,
        "gpt-3.5-turbo-instruct": ctx4k,
        "gpt-3.5-turbo-instruct-0914": ctx4k,
        "gpt-3.5-turbo-0301": ctx4k,
        // gpt-oss
        "gpt-oss": ctxGptOss,
        "gpt-oss-2025-03-01": ctxGptOss,
    };
})();
|
|
287
|
+
/**
 * Get the context size for a model, preferring an exact-name match.
 *
 * The model name is first looked up among the own keys of
 * EXACT_CONTEXT_SIZES; when it is not listed there, the result of the
 * prefix-based getContextSize is returned instead.
 *
 * @param model - The model name
 * @returns The context size from the exact table, or the prefix-based fallback
 */
export function getExactContextSize(model) {
    // Restrict the lookup to the table's own keys. A bare
    // `EXACT_CONTEXT_SIZES[model]` also resolves members inherited from
    // Object.prototype, so names such as "constructor", "toString" or
    // "__proto__" would come back as a function/object rather than a number.
    if (Object.prototype.hasOwnProperty.call(EXACT_CONTEXT_SIZES, model)) {
        return EXACT_CONTEXT_SIZES[model];
    }
    // Fall back to prefix matching
    return getContextSize(model);
}
|
|
299
|
+
//# sourceMappingURL=model-to-encoding.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-to-encoding.js","sourceRoot":"","sources":["../../src/core/model-to-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAcH;;;;;GAKG;AACH,MAAM,yBAAyB,GAAmC;IAChE,mBAAmB;IACnB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,cAAc;IACd,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,UAAU,EAAE,YAAY,CAAC;IAC1B,CAAC,UAAU,EAAE,YAAY,CAAC;IAC1B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,QAAQ,EAAE,aAAa,CAAC;IACzB,CAAC,gBAAgB,EAAE,aAAa,CAAC;IACjC,CAAC,eAAe,EAAE,aAAa,CAAC,EAAE,wBAAwB;IAC1D,CAAC,UAAU,EAAE,eAAe,CAAC;IAC7B,oBAAoB;IACpB,CAAC,WAAW,EAAE,YAAY,CAAC;IAC3B,CAAC,UAAU,EAAE,aAAa,CAAC;IAC3B,CAAC,kBAAkB,EAAE,aAAa,CAAC;IACnC,CAAC,gBAAgB,EAAE,aAAa,CAAC;IACjC,CAAC,gBAAgB,EAAE,aAAa,CAAC;CAClC,CAAC;AAEF;;;;GAIG;AACH,MAAM,kBAAkB,GAA+B,IAAI,GAAG,CAAC;IAC7D,mBAAmB;IACnB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,cAAc;IACd,CAAC,OAAO,EAAE,YAAY,CAAC;IACvB,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,mBAAmB,EAAE,YAAY,CAAC;IACnC,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,OAAO,EAAE,aAAa,CAAC;IACxB,CAAC,eAAe,EAAE,aAAa,CAAC;IAChC,CAAC,SAAS,EAAE,aAAa,CAAC;IAC1B,CAAC,cAAc,EAAE,aAAa,CAAC,EAAE,wBAAwB;IACzD,cAAc;IACd,CAAC,aAAa,EAAE,aAAa,CAAC;IAC9B,CAAC,aAAa,EAAE,aAAa,CAAC;IAC9B,aAAa;IACb,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,2BAA2B;IAC3B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,gBAAgB,EAAE,WAAW,CAAC;IAC/B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,cAAc,EAAE,WAAW,CAAC;IAC7B,CAAC,SAAS,EAAE,WAAW,CAAC;IACxB,CAAC,OAAO,EAAE,WAAW,CAAC;IACtB,CAAC,SAAS,EAAE,WAAW,CAAC;IACxB,CAAC,KAAK,EAAE,WAAW,CAAC;IACpB,2BAA2B;IAC3B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,eAAe,EAAE,WAAW,CAAC;IAC9B,CAAC,eAAe,EAAE,WAAW,CAAC;IAC9B,2BAA2B;IAC3B,CAAC,uBAAuB,EAAE,WAAW,CAAC;IACtC,CAAC,uBAAuB,EAAE,WAA
W,CAAC;IACtC,qCAAqC;IACrC,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,2BAA2B,EAAE,WAAW,CAAC;IAC1C,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,yBAAyB,EAAE,WAAW,CAAC;IACxC,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,2BAA2B,EAAE,WAAW,CAAC;IAC1C,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,yBAAyB,EAAE,WAAW,CAAC;IACxC,CAAC,8BAA8B,EAAE,WAAW,CAAC;IAC7C,CAAC,0BAA0B,EAAE,WAAW,CAAC;IACzC,cAAc;IACd,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,OAAO,EAAE,MAAM,CAAC;CAClB,CAAC,CAAC;AAEH;;;;;;;GAOG;AACH,MAAM,UAAU,oBAAoB,CAClC,SAAiB;IAEjB,2BAA2B;IAC3B,MAAM,UAAU,GAAG,kBAAkB,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IACrD,IAAI,UAAU,EAAE,CAAC;QACf,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,IAAI,yBAAyB,EAAE,CAAC;QAC5D,IAAI,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YACjC,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,2BAA2B;IAC3B,IAAI,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACxC,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAC9B,CAAC;IAED,uCAAuC;IACvC,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC9B,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC;QACtB,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC;QACtB,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,EACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC/B,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACrC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACnC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACnC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC9B,OAAO,IAAI,CAAC;
IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,oBAAoB,CAAC,EAAE,CAAC;QAC3C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,oBAAoB,CAAC,EAAE,CAAC;QAC3C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,mBAAmB,CAAC,EAAE,CAAC;QAC1C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,eAAe,CAAC,EAAE,CAAC;QACtC,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC;QACpC,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EACpC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC;QACvB,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC;QAC3B,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EACzB,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC;QAChC,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC;QACpC,KAAK,CAAC,UAAU,CAAC,gBAAgB,CAAC,EAClC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,wBAAwB,CAAC,EAAE,CAAC;QAC/C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU;IACV,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAA2B;IACzD,4BAA4B;IAC5B,EAAE,EAAE,OAAO;IACX,eAAe,EAAE,OAAO;IACxB,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAC7B,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAChC,EAAE,EAAE,OAAO;IACX,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAC7B,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAE7B,eAAe;IACf,OAAO,EAAE,OAAO;IAChB,kBAAkB,EAAE,OAAO;IAC3B,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAChC,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAEhC,kBAAkB;IAClB,iBAAiB,EAAE,OAAO;IAC1B,4BAA4B,EAAE,OAAO;IAErC,+BAA+B;IAC/B,SAAS,EAAE,SAAS;IACpB,oBAAoB,EAAE,SAAS;IAC/B,cAAc,EAAE,SAAS;IACzB,yBAAyB,EAAE,SAAS;IACpC,cAAc,EAAE,SAAS;IACzB,yBAAyB,EAAE,SAAS;IAEpC,gBAAgB;IAChB,mBAAmB,EAAE,OAAO;IAC5B,QAAQ,EAAE,OAAO;IACjB,mBAAmB,EAAE,OAAO;IAC5B,mBAAmB,EAAE,OAAO;IAC5B,mBAAmB,EAAE,OAAO;IAC5B,sBAAsB,EAAE,OAAO;IAC/B,iCAAiC,EAAE,OAA
O;IAC1C,iCAAiC,EAAE,OAAO;IAC1C,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,2BAA2B,EAAE,OAAO;IACpC,sCAAsC,EAAE,OAAO;IAC/C,yBAAyB,EAAE,OAAO;IAClC,oCAAoC,EAAE,OAAO;IAC7C,oCAAoC,EAAE,OAAO;IAE7C,qBAAqB;IACrB,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,qBAAqB,EAAE,OAAO;IAC9B,oBAAoB,EAAE,OAAO;IAC7B,oBAAoB,EAAE,OAAO;IAC7B,sBAAsB,EAAE,OAAO;IAC/B,2BAA2B,EAAE,OAAO;IAEpC,aAAa;IACb,OAAO,EAAE,KAAK;IACd,YAAY,EAAE,KAAK;IACnB,YAAY,EAAE,KAAK;IACnB,WAAW,EAAE,MAAM;IACnB,gBAAgB,EAAE,MAAM;IACxB,gBAAgB,EAAE,MAAM;IAExB,iBAAiB;IACjB,eAAe,EAAE,MAAM;IACvB,oBAAoB,EAAE,MAAM;IAC5B,oBAAoB,EAAE,MAAM;IAC5B,oBAAoB,EAAE,MAAM;IAC5B,mBAAmB,EAAE,MAAM;IAC3B,wBAAwB,EAAE,MAAM;IAChC,wBAAwB,EAAE,KAAK;IAC/B,6BAA6B,EAAE,KAAK;IACpC,oBAAoB,EAAE,KAAK;IAE3B,UAAU;IACV,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;CAC9B,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,0BAA0B;IAC1B,MAAM,SAAS,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAC7C,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;QAC5B,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,+BAA+B;IAC/B,OAAO,cAAc,CAAC,KAAK,CAAC,CAAC;AAC/B,CAAC"}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core Tiktoken BPE Implementation
|
|
3
|
+
*
|
|
4
|
+
* This is an EXACT port of the CoreBPE struct from tiktoken-rs.
|
|
5
|
+
* Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
|
|
6
|
+
*
|
|
7
|
+
* Provides the main tokenization API:
|
|
8
|
+
* - encode_ordinary(text) - Encode without special tokens
|
|
9
|
+
* - encode(text, allowed_special) - Encode with optional special tokens
|
|
10
|
+
* - encode_with_special_tokens(text) - Encode with all special tokens
|
|
11
|
+
* - decode(tokens) - Decode tokens to text
|
|
12
|
+
* - decode_bytes(tokens) - Decode tokens to raw bytes
|
|
13
|
+
*/
|
|
14
|
+
import { type Vocabulary, type Rank } from "./byte-pair-encoding.js";
|
|
15
|
+
/**
 * Error raised when decoding encounters a token that cannot be decoded.
 *
 * The offending token rank is exposed on the instance so callers can report
 * or inspect it.
 */
export declare class DecodeKeyError extends Error {
    /** The token rank that could not be decoded. */
    readonly token: Rank;
    /**
     * @param token - The token rank that could not be decoded
     */
    constructor(token: Rank);
}
|
|
22
|
+
/**
 * CoreBPE - the tokenizer class.
 *
 * Declares the BPE encode/decode API. The accompanying documentation states
 * that the implementation is intended to match the behavior of tiktoken-rs;
 * the implementation itself is not part of this declaration file.
 */
export declare class CoreBPE {
    /** Vocabulary: byte sequences → ranks. */
    private readonly encoder;
    /** Special tokens: strings → ranks. */
    private readonly specialTokensEncoder;
    /** Reverse vocabulary: ranks → byte sequences. */
    private readonly decoder;
    /** Reverse special tokens: ranks → byte sequences. */
    private readonly specialTokensDecoder;
    /** Compiled regex used to split text into pieces. */
    private readonly regex;
    /** Compiled regex used to locate special tokens. */
    private readonly specialRegex;
    /** Text encoder used for string → bytes conversion. */
    private readonly textEncoder;
    /** Text decoder used for bytes → string conversion. */
    private readonly textDecoder;
    /**
     * Construct a CoreBPE instance.
     *
     * @param encoder - Vocabulary mapping byte sequences to ranks
     * @param specialTokensEncoder - Map from special token strings to ranks
     * @param pattern - Regex pattern (as a string) for splitting text into pieces
     */
    constructor(encoder: Vocabulary, specialTokensEncoder: Map<string, Rank>, pattern: string);
    /**
     * Decode token ranks into raw bytes.
     *
     * The resulting bytes are not guaranteed to form valid UTF-8.
     *
     * @param tokens - Token ranks to decode
     * @returns The decoded bytes
     * @throws DecodeKeyError if a token is not found
     */
    decodeBytes(tokens: Rank[]): Uint8Array;
    /**
     * Decode token ranks into a string.
     *
     * @param tokens - Token ranks to decode
     * @returns The decoded string
     * @throws DecodeKeyError if a token is not found
     */
    decode(tokens: Rank[]): string;
    /**
     * Encode text without any special-token handling.
     *
     * Special token strings occurring in the text are treated as regular text.
     *
     * @param text - Text to encode
     * @returns The token ranks
     */
    encodeOrdinary(text: string): Rank[];
    /**
     * Encode text, honoring the given set of allowed special tokens.
     *
     * @param text - Text to encode
     * @param allowedSpecial - Special token strings that are allowed
     * @returns A tuple `[tokens, lastPieceTokenLen]`
     */
    encode(text: string, allowedSpecial: Set<string>): [Rank[], number];
    /**
     * Encode text with every special token allowed.
     *
     * @param text - Text to encode
     * @returns The token ranks
     */
    encodeWithSpecialTokens(text: string): Rank[];
    /**
     * List the special tokens.
     *
     * @returns The set of special token strings
     */
    getSpecialTokens(): Set<string>;
    /**
     * Vocabulary size, not counting special tokens.
     */
    get vocabSize(): number;
    /**
     * Total vocabulary size, counting special tokens.
     */
    get totalVocabSize(): number;
    /**
     * Whether the given token rank is a special token.
     */
    isSpecialToken(token: Rank): boolean;
    /**
     * Byte representation of a token, or undefined when there is none to return.
     */
    getTokenBytes(token: Rank): Uint8Array | undefined;
    /**
     * Rank of a byte sequence, or undefined when there is none to return.
     */
    getRank(bytes: Uint8Array): Rank | undefined;
    /**
     * Rank of a special token string, or undefined when there is none to return.
     */
    getSpecialTokenRank(token: string): Rank | undefined;
}
|
|
126
|
+
//# sourceMappingURL=tiktoken.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tiktoken.d.ts","sourceRoot":"","sources":["../../src/core/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EACL,KAAK,UAAU,EAEf,KAAK,IAAI,EAIV,MAAM,yBAAyB,CAAC;AAEjC;;GAEG;AACH,qBAAa,cAAe,SAAQ,KAAK;IACvC,SAAgB,KAAK,EAAE,IAAI,CAAC;gBAEhB,KAAK,EAAE,IAAI;CAKxB;AASD;;;;;GAKG;AACH,qBAAa,OAAO;IAClB,qDAAqD;IACrD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAa;IAErC,kDAAkD;IAClD,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAoB;IAEzD,yDAAyD;IACzD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAoB;IAE5C,6DAA6D;IAC7D,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAwB;IAE7D,wCAAwC;IACxC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAE/B,gDAAgD;IAChD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAgB;IAE7C,mDAAmD;IACnD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmC;IAE/D,mDAAmD;IACnD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmC;IAE/D;;;;;;OAMG;gBAED,OAAO,EAAE,UAAU,EACnB,oBAAoB,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,EACvC,OAAO,EAAE,MAAM;IAyCjB;;;;;;;;OAQG;IACH,WAAW,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,UAAU;IA2BvC;;;;;;OAMG;IACH,MAAM,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM;IAK9B;;;;;;;OAOG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,EAAE;IA0BpC;;;;;;OAMG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,cAAc,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,MAAM,CAAC;IA4EnE;;;;;OAKG;IACH,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,EAAE;IAK7C;;;;OAIG;IACH,gBAAgB,IAAI,GAAG,CAAC,MAAM,CAAC;IAI/B;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,IAAI,cAAc,IAAI,MAAM,CAE3B;IAED;;OAEG;IACH,cAAc,CAAC,KAAK,EAAE,IAAI,GAAG,OAAO;IAIpC;;OAEG;IACH,aAAa,CAAC,KAAK,EAAE,IAAI,GAAG,UAAU,GAAG,SAAS;IAIlD;;OAEG;IACH,OAAO,CAAC,KAAK,EAAE,UAAU,GAAG,IAAI,GAAG,SAAS;IAK5C;;OAEG;IACH,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS;CAGrD"}
|