@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
package/dist/tiktoken.js
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* High-Level Tiktoken API
|
|
3
|
+
*
|
|
4
|
+
* Provides easy-to-use tokenization APIs with automatic vocabulary loading.
|
|
5
|
+
*
|
|
6
|
+
* This module is the main entry point for most users. It provides:
|
|
7
|
+
* - Lazy loading of vocabularies
|
|
8
|
+
* - Caching of tokenizer instances
|
|
9
|
+
* - Support for both sync and async operations
|
|
10
|
+
* - Model name resolution
|
|
11
|
+
*/
|
|
12
|
+
import { CoreBPE, getEncodingDefinition, loadVocabularyFromUrl, loadVocabularyFromString, getVocabularyFromCache, isVocabularyCached, getTokenizerForModel, } from "./core/index.js";
|
|
13
|
+
/**
|
|
14
|
+
* Tiktoken encoding instance
|
|
15
|
+
*
|
|
16
|
+
* This class wraps CoreBPE and provides:
|
|
17
|
+
* - Lazy initialization
|
|
18
|
+
* - Convenient API matching tiktoken-rs
|
|
19
|
+
* - Error handling for unloaded vocabularies
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Tiktoken encoding instance
 *
 * This class wraps CoreBPE and provides:
 * - Lazy initialization
 * - Convenient API matching tiktoken-rs
 * - Error handling for unloaded vocabularies
 */
export class Tiktoken {
    /** The encoding name (e.g., "cl100k_base", "o200k_base") */
    name;
    /** The encoding definition (regex pattern + special tokens) */
    definition;
    /** The underlying CoreBPE instance (lazy loaded) */
    coreBPE = null;
    /** Promise for the in-flight async initialization (null when idle) */
    initPromise = null;
    /** Whether the vocabulary has been loaded */
    isLoaded = false;
    /**
     * Create a new Tiktoken instance
     *
     * Note: The vocabulary is NOT loaded until you call load() or
     * loadFromString(), unless it is already in the shared vocabulary cache.
     *
     * @param encodingName - The encoding name (e.g., "cl100k_base", "o200k_base")
     * @throws Error if the encoding name is unknown
     */
    constructor(encodingName) {
        const definition = getEncodingDefinition(encodingName);
        if (!definition) {
            throw new Error(`Unknown encoding: ${encodingName}`);
        }
        this.name = encodingName;
        this.definition = definition;
        // If the vocabulary was already fetched elsewhere, become usable
        // synchronously without an explicit load() call.
        if (isVocabularyCached(encodingName)) {
            this.initFromCache();
        }
    }
    /**
     * Initialize from cached vocabulary (no-op when nothing is cached)
     */
    initFromCache() {
        const vocab = getVocabularyFromCache(this.name);
        if (vocab) {
            this.initCoreBPE(vocab.encoder, vocab.decoder);
        }
    }
    /**
     * Initialize the CoreBPE instance
     *
     * Note: the decoder is provided by the vocab loader but CoreBPE builds
     * its own from the encoder; _decoder is kept for API consistency with
     * the vocabulary-loading functions.
     */
    initCoreBPE(encoder, _decoder) {
        // Build the special-tokens map from the encoding definition.
        const specialTokensEncoder = new Map();
        for (const [token, rank] of Object.entries(this.definition.specialTokens)) {
            specialTokensEncoder.set(token, rank);
        }
        this.coreBPE = new CoreBPE(encoder, specialTokensEncoder, this.definition.pattern);
        this.isLoaded = true;
    }
    /**
     * Load the vocabulary from URL
     *
     * Concurrent callers share a single in-flight request. On failure the
     * cached promise is cleared so a later call can retry — the previous
     * implementation cached the rejected promise permanently, which made
     * the instance unloadable after one transient network error.
     *
     * @returns Promise that resolves when loaded
     */
    async load() {
        if (this.isLoaded) {
            return;
        }
        if (this.initPromise) {
            return this.initPromise;
        }
        this.initPromise = (async () => {
            try {
                const vocab = await loadVocabularyFromUrl(this.name);
                this.initCoreBPE(vocab.encoder, vocab.decoder); // decoder used internally in initCoreBPE
            }
            catch (err) {
                // Allow a retry after a transient failure (e.g. network error).
                this.initPromise = null;
                throw err;
            }
        })();
        return this.initPromise;
    }
    /**
     * Load the vocabulary from a string (for embedded vocabularies)
     *
     * @param content - The vocabulary file content
     */
    loadFromString(content) {
        if (this.isLoaded) {
            return;
        }
        const vocab = loadVocabularyFromString(this.name, content);
        this.initCoreBPE(vocab.encoder, vocab.decoder);
    }
    /**
     * Ensure the vocabulary is loaded
     *
     * @returns The underlying CoreBPE instance
     * @throws Error when the vocabulary has not been loaded yet
     */
    ensureLoaded() {
        if (!this.coreBPE) {
            throw new Error(`Vocabulary not loaded for ${this.name}. Call load() first or use the async API.`);
        }
        return this.coreBPE;
    }
    /**
     * Check if the vocabulary is loaded
     */
    get loaded() {
        return this.isLoaded;
    }
    // =========================================================================
    // Encoding Methods
    // =========================================================================
    /**
     * Encode text into token IDs without handling special tokens
     *
     * @param text - Text to encode
     * @returns Array of token IDs
     */
    encodeOrdinary(text) {
        return this.ensureLoaded().encodeOrdinary(text);
    }
    /**
     * Encode text into token IDs
     *
     * @param text - Text to encode
     * @param allowedSpecial - Special tokens to allow: "all", a Set of token
     *   strings, or undefined for none (default)
     * @returns Array of token IDs
     */
    encode(text, allowedSpecial) {
        const bpe = this.ensureLoaded();
        if (allowedSpecial === "all") {
            return bpe.encodeWithSpecialTokens(text);
        }
        const allowed = allowedSpecial ?? new Set();
        // CoreBPE.encode returns a tuple; only the token array is exposed here.
        return bpe.encode(text, allowed)[0];
    }
    /**
     * Encode text with all special tokens allowed
     *
     * @param text - Text to encode
     * @returns Array of token IDs
     */
    encodeWithSpecialTokens(text) {
        return this.ensureLoaded().encodeWithSpecialTokens(text);
    }
    // =========================================================================
    // Decoding Methods
    // =========================================================================
    /**
     * Decode token IDs back to text
     *
     * @param tokens - Array of token IDs
     * @returns Decoded text
     */
    decode(tokens) {
        return this.ensureLoaded().decode(tokens);
    }
    /**
     * Decode token IDs to raw bytes
     *
     * @param tokens - Array of token IDs
     * @returns Decoded bytes
     */
    decodeBytes(tokens) {
        return this.ensureLoaded().decodeBytes(tokens);
    }
    // =========================================================================
    // Utility Methods
    // =========================================================================
    /**
     * Count tokens in text (without returning the tokens)
     *
     * @param text - Text to count
     * @returns Number of tokens
     */
    countTokens(text) {
        return this.encodeOrdinary(text).length;
    }
    /**
     * Get the vocabulary size (excluding special tokens)
     */
    get vocabSize() {
        return this.ensureLoaded().vocabSize;
    }
    /**
     * Get the total vocabulary size (including special tokens)
     */
    get totalVocabSize() {
        return this.ensureLoaded().totalVocabSize;
    }
    /**
     * Get all special tokens
     */
    getSpecialTokens() {
        return this.ensureLoaded().getSpecialTokens();
    }
    /**
     * Check if a token is a special token
     */
    isSpecialToken(token) {
        return this.ensureLoaded().isSpecialToken(token);
    }
    /**
     * Get the definition for this encoding
     */
    getDefinition() {
        return this.definition;
    }
}
|
|
220
|
+
// =========================================================================
|
|
221
|
+
// Factory Functions
|
|
222
|
+
// =========================================================================
|
|
223
|
+
/**
|
|
224
|
+
* Cache of Tiktoken instances
|
|
225
|
+
*/
|
|
226
|
+
/**
 * Cache of Tiktoken instances, keyed by encoding name.
 */
const tiktokenCache = new Map();
/**
 * Get or create a Tiktoken instance for an encoding
 *
 * @param encodingName - Encoding name
 * @returns Tiktoken instance (may not be loaded yet)
 */
export function getEncoding(encodingName) {
    const cached = tiktokenCache.get(encodingName);
    if (cached) {
        return cached;
    }
    const created = new Tiktoken(encodingName);
    tiktokenCache.set(encodingName, created);
    return created;
}
|
|
241
|
+
/**
|
|
242
|
+
* Get or create a Tiktoken instance for a model
|
|
243
|
+
*
|
|
244
|
+
* @param modelName - Model name
|
|
245
|
+
* @returns Tiktoken instance (may not be loaded yet)
|
|
246
|
+
* @throws Error if no encoding is found for the model
|
|
247
|
+
*/
|
|
248
|
+
/**
 * Get or create a Tiktoken instance for a model
 *
 * @param modelName - Model name
 * @returns Tiktoken instance (may not be loaded yet)
 * @throws Error if the resolved encoding name is unknown
 */
export function getEncodingForModel(modelName) {
    const resolved = getTokenizerForModel(modelName);
    // Unknown models fall back to o200k_base (most modern models use it).
    return getEncoding(resolved || "o200k_base");
}
|
|
256
|
+
/**
|
|
257
|
+
* Get a loaded Tiktoken instance for an encoding (async)
|
|
258
|
+
*
|
|
259
|
+
* @param encodingName - Encoding name
|
|
260
|
+
* @returns Promise resolving to a loaded Tiktoken instance
|
|
261
|
+
*/
|
|
262
|
+
/**
 * Get a loaded Tiktoken instance for an encoding (async)
 *
 * @param encodingName - Encoding name
 * @returns Promise resolving to a loaded Tiktoken instance
 */
export async function getEncodingAsync(encodingName) {
    const encoder = getEncoding(encodingName);
    await encoder.load();
    return encoder;
}
|
|
267
|
+
/**
|
|
268
|
+
* Get a loaded Tiktoken instance for a model (async)
|
|
269
|
+
*
|
|
270
|
+
* @param modelName - Model name
|
|
271
|
+
* @returns Promise resolving to a loaded Tiktoken instance
|
|
272
|
+
*/
|
|
273
|
+
/**
 * Get a loaded Tiktoken instance for a model (async)
 *
 * @param modelName - Model name
 * @returns Promise resolving to a loaded Tiktoken instance
 */
export async function getEncodingForModelAsync(modelName) {
    const encoder = getEncodingForModel(modelName);
    await encoder.load();
    return encoder;
}
|
|
278
|
+
/**
|
|
279
|
+
* Clear the Tiktoken instance cache
|
|
280
|
+
*/
|
|
281
|
+
/**
 * Clear the Tiktoken instance cache
 *
 * Existing references stay valid; only the shared lookup table is emptied,
 * so subsequent getEncoding() calls construct fresh instances.
 */
export function clearTiktokenCache() {
    tiktokenCache.clear();
}
|
|
284
|
+
// =========================================================================
|
|
285
|
+
// Convenience Functions
|
|
286
|
+
// =========================================================================
|
|
287
|
+
/**
|
|
288
|
+
* Encode text using a specific encoding (async)
|
|
289
|
+
*
|
|
290
|
+
* @param text - Text to encode
|
|
291
|
+
* @param encodingName - Encoding name
|
|
292
|
+
* @returns Promise resolving to token IDs
|
|
293
|
+
*/
|
|
294
|
+
/**
 * Encode text using a specific encoding (async)
 *
 * @param text - Text to encode
 * @param encodingName - Encoding name (defaults to "o200k_base")
 * @returns Promise resolving to token IDs
 */
export async function encodeAsync(text, encodingName = "o200k_base") {
    const encoder = await getEncodingAsync(encodingName);
    return encoder.encodeOrdinary(text);
}
|
|
298
|
+
/**
|
|
299
|
+
* Decode tokens using a specific encoding (async)
|
|
300
|
+
*
|
|
301
|
+
* @param tokens - Token IDs
|
|
302
|
+
* @param encodingName - Encoding name
|
|
303
|
+
* @returns Promise resolving to decoded text
|
|
304
|
+
*/
|
|
305
|
+
/**
 * Decode tokens using a specific encoding (async)
 *
 * @param tokens - Token IDs
 * @param encodingName - Encoding name (defaults to "o200k_base")
 * @returns Promise resolving to decoded text
 */
export async function decodeAsync(tokens, encodingName = "o200k_base") {
    const encoder = await getEncodingAsync(encodingName);
    return encoder.decode(tokens);
}
|
|
309
|
+
/**
|
|
310
|
+
* Count tokens in text (async)
|
|
311
|
+
*
|
|
312
|
+
* @param text - Text to count
|
|
313
|
+
* @param encodingName - Encoding name
|
|
314
|
+
* @returns Promise resolving to token count
|
|
315
|
+
*/
|
|
316
|
+
/**
 * Count tokens in text (async)
 *
 * @param text - Text to count
 * @param encodingName - Encoding name (defaults to "o200k_base")
 * @returns Promise resolving to token count
 */
export async function countTokensAsync(text, encodingName = "o200k_base") {
    const encoder = await getEncodingAsync(encodingName);
    return encoder.countTokens(text);
}
|
|
320
|
+
/**
|
|
321
|
+
* Count tokens for a model (async)
|
|
322
|
+
*
|
|
323
|
+
* @param text - Text to count
|
|
324
|
+
* @param modelName - Model name
|
|
325
|
+
* @returns Promise resolving to token count
|
|
326
|
+
*/
|
|
327
|
+
/**
 * Count tokens for a model (async)
 *
 * @param text - Text to count
 * @param modelName - Model name
 * @returns Promise resolving to token count
 */
export async function countTokensForModelAsync(text, modelName) {
    const encoder = await getEncodingForModelAsync(modelName);
    return encoder.countTokens(text);
}
|
|
331
|
+
//# sourceMappingURL=tiktoken.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tiktoken.js","sourceRoot":"","sources":["../src/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACL,OAAO,EAIP,qBAAqB,EAErB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,kBAAkB,EAClB,oBAAoB,GACrB,MAAM,iBAAiB,CAAC;AAEzB;;;;;;;GAOG;AACH,MAAM,OAAO,QAAQ;IACnB,wBAAwB;IACR,IAAI,CAAS;IAE7B,8BAA8B;IACb,UAAU,CAAqB;IAEhD,oDAAoD;IAC5C,OAAO,GAAmB,IAAI,CAAC;IAEvC,uCAAuC;IAC/B,WAAW,GAAyB,IAAI,CAAC;IAEjD,6CAA6C;IACrC,QAAQ,GAAG,KAAK,CAAC;IAEzB;;;;;;;OAOG;IACH,YAAY,YAAoB;QAC9B,MAAM,UAAU,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;QACvD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,qBAAqB,YAAY,EAAE,CAAC,CAAC;QACvD,CAAC;QAED,IAAI,CAAC,IAAI,GAAG,YAAY,CAAC;QACzB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAE7B,wCAAwC;QACxC,IAAI,kBAAkB,CAAC,YAAY,CAAC,EAAE,CAAC;YACrC,IAAI,CAAC,aAAa,EAAE,CAAC;QACvB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,MAAM,KAAK,GAAG,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,OAAmB,EAAE,QAA2B;QAClE,oFAAoF;QACpF,6EAA6E;QAE7E,4BAA4B;QAC5B,MAAM,oBAAoB,GAAG,IAAI,GAAG,EAAgB,CAAC;QACrD,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;YAC1E,oBAAoB,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QACxC,CAAC;QAED,IAAI,CAAC,OAAO,GAAG,IAAI,OAAO,CACxB,OAAO,EACP,oBAAoB,EACpB,IAAI,CAAC,UAAU,CAAC,OAAO,CACxB,CAAC;QACF,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,OAAO;QACT,CAAC;QAED,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QAED,IAAI,CAAC,WAAW,GAAG,CAAC,KAAK,IAAI,EAAE;YAC7B,MAAM,KAAK,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACrD,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,yCAAyC;QAC3F,CAAC,CAAC,EAAE,CAAC;QAEL,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;OAIG;IACH,cAAc,CAAC,OAAe;QAC5B,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,OAAO;QACT,CAAC;QAED,MAAM,KAAK,GAAG,wBAAwB,CAAC,IAA
I,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC3D,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACjD,CAAC;IAED;;OAEG;IACK,YAAY;QAClB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CACb,6BAA6B,IAAI,CAAC,IAAI,2CAA2C,CAClF,CAAC;QACJ,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED,4EAA4E;IAC5E,mBAAmB;IACnB,4EAA4E;IAE5E;;;;;OAKG;IACH,cAAc,CAAC,IAAY;QACzB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAClD,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY,EAAE,cAAoC;QACvD,MAAM,GAAG,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAEhC,IAAI,cAAc,KAAK,KAAK,EAAE,CAAC;YAC7B,OAAO,GAAG,CAAC,uBAAuB,CAAC,IAAI,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,OAAO,GAAG,cAAc,IAAI,IAAI,GAAG,EAAU,CAAC;QACpD,OAAO,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC;IAED;;;;;OAKG;IACH,uBAAuB,CAAC,IAAY;QAClC,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC3D,CAAC;IAED,4EAA4E;IAC5E,mBAAmB;IACnB,4EAA4E;IAE5E;;;;;OAKG;IACH,MAAM,CAAC,MAAc;QACnB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAC5C,CAAC;IAED;;;;;OAKG;IACH,WAAW,CAAC,MAAc;QACxB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IACjD,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACH,WAAW,CAAC,IAAY;QACtB,OAAO,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,SAAS,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,IAAI,cAAc;QAChB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,gBAAgB,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,KAAW;QACxB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACH,aAAa;QACX,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;CACF;AAED,4EAA4E;AAC5E,oBAAoB;AACpB,4EAA4E;AAE5E;;GAEG;AACH,MAAM,aAAa,GAA0B,IAAI,GAAG,EAAE,CAAC;AAEvD;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAAC,YAAoB;IAC9C,IAAI,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAE/C,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,QAAQ,GAAG,IAAI,QAAQ,CAAC,
YAAY,CAAC,CAAC;QACtC,aAAa,CAAC,GAAG,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IAC5C,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiB;IACnD,MAAM,QAAQ,GAAG,oBAAoB,CAAC,SAAS,CAAC,CAAC;IACjD,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,yEAAyE;QACzE,OAAO,WAAW,CAAC,YAAY,CAAC,CAAC;IACnC,CAAC;IACD,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;AAC/B,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,YAAoB;IAEpB,MAAM,QAAQ,GAAG,WAAW,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtB,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC5C,SAAiB;IAEjB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtB,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,aAAa,CAAC,KAAK,EAAE,CAAC;AACxB,CAAC;AAED,4EAA4E;AAC5E,wBAAwB;AACxB,4EAA4E;AAE5E;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAY,EACZ,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;AACvC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,MAAc,EACd,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AACjC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,IAAY,EACZ,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC5C,IAAY,EACZ,SAAiB;IAEjB,MAAM,QAAQ,GAAG,MAAM,wBAAwB,CAAC,SAAS,CAAC,CAAC;IAC3D,OAAO,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC"}
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
/**
 * Main Tokenizer Implementation
 * Provides high-level tokenization APIs for AI applications
 */
import type { Tokenizer, EncodingName, ChatMessage, TokenCountOptions, MaxTokensOptions, TokenEstimation } from "./types.js";
/**
 * Get a tokenizer for a specific encoding
 *
 * @param encodingName - Encoding name (cl100k_base, o200k_base, p50k_base)
 * @returns Tokenizer instance
 *
 * @example
 * ```typescript
 * const tokenizer = getEncoding("o200k_base");
 * const tokens = tokenizer.encode("Hello, world!");
 * console.log(tokens.length); // ~4 tokens
 * ```
 */
export declare function getEncoding(encodingName: EncodingName): Tokenizer;
/**
 * Get a tokenizer for a specific model
 *
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @returns Tokenizer instance configured for the model
 *
 * @example
 * ```typescript
 * const tokenizer = getEncodingForModelName("gpt-4o");
 * const count = tokenizer.countTokens("Hello, world!");
 * console.log(count); // ~4 tokens
 * ```
 */
export declare function getEncodingForModelName(model: string): Tokenizer;
/**
 * Count tokens in text
 *
 * @param text - Input text
 * @param options - Token counting options
 * @returns Token count
 *
 * @example
 * ```typescript
 * // Count with default encoding (o200k_base)
 * const count = countTokens("Hello, world!");
 *
 * // Count with specific model
 * const count = countTokens("Hello, world!", { model: "gpt-4o" });
 *
 * // Count with specific encoding
 * const count = countTokens("Hello, world!", { encoding: "cl100k_base" });
 * ```
 */
export declare function countTokens(text: string, options?: TokenCountOptions): number;
/**
 * Count tokens in chat messages (includes message overhead)
 *
 * Chat messages have additional tokens for:
 * - Role markers: <|im_start|>role\n ... <|im_end|>
 * - Separator tokens between messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name for accurate counting
 * @returns Total token count including overhead
 *
 * @example
 * ```typescript
 * const messages = [
 *   { role: "system", content: "You are helpful." },
 *   { role: "user", content: "Hello!" }
 * ];
 * const count = countChatTokens(messages, "gpt-4o");
 * ```
 */
export declare function countChatTokens(messages: ChatMessage[], model?: string): number;
/**
 * Count tokens in a prompt with system message
 * Convenience function for common AI playground use case
 *
 * @param systemPrompt - System prompt content
 * @param userPrompt - User prompt content
 * @param model - Model name
 * @returns Total token count
 */
export declare function countPromptTokens(systemPrompt: string, userPrompt: string, model?: string): number;
/**
 * Estimate a safe max_tokens value to avoid truncation
 *
 * This function helps prevent the common issue where max_tokens is set too low,
 * causing the API to return an empty response with finish_reason: "length".
 *
 * @param promptText - The full prompt text (or use countChatTokens for messages)
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @param options - Estimation options
 * @returns Recommended max_tokens value
 *
 * @example
 * ```typescript
 * const prompt = "Write a story about a robot.";
 * const maxTokens = estimateMaxTokens(prompt, "gpt-5-nano", {
 *   desiredOutputTokens: 500,
 *   safetyMargin: 0.15
 * });
 * // Returns a value that ensures the response won't be truncated
 * ```
 */
export declare function estimateMaxTokens(promptText: string, model: string, options?: MaxTokensOptions): number;
/**
 * Get detailed token estimation with warnings
 *
 * @param promptText - Full prompt text
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 *
 * @example
 * ```typescript
 * const estimation = getTokenEstimation(longPrompt, "gpt-4o", {
 *   desiredOutputTokens: 2000
 * });
 *
 * if (!estimation.fitsInContext) {
 *   console.error(estimation.warning);
 * }
 *
 * const response = await openai.chat.completions.create({
 *   model: "gpt-4o",
 *   messages: [...],
 *   max_tokens: estimation.recommendedMaxTokens
 * });
 * ```
 */
export declare function getTokenEstimation(promptText: string, model: string, options?: MaxTokensOptions): TokenEstimation;
/**
 * Get token estimation for chat messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 */
export declare function getChatTokenEstimation(messages: ChatMessage[], model: string, options?: MaxTokensOptions): TokenEstimation;
/**
 * Check if text will fit within model context
 *
 * @param text - Input text
 * @param model - Model name
 * @param reservedOutputTokens - Tokens to reserve for output
 * @returns True if text fits within context
 */
export declare function fitsInContext(text: string, model: string, reservedOutputTokens?: number): boolean;
/**
 * Truncate text to fit within a token limit
 *
 * @param text - Input text
 * @param maxTokens - Maximum tokens allowed
 * @param model - Model name for accurate counting
 * @returns Truncated text
 *
 * @example
 * ```typescript
 * const truncated = truncateToTokenLimit(longText, 1000, "gpt-4o");
 * console.log(countTokens(truncated, { model: "gpt-4o" })); // <= 1000
 * ```
 */
export declare function truncateToTokenLimit(text: string, maxTokens: number, model?: string): string;
/**
 * Split text into chunks that fit within token limits
 *
 * @param text - Input text
 * @param maxTokensPerChunk - Maximum tokens per chunk
 * @param overlap - Number of characters to overlap between chunks
 * @param model - Model name
 * @returns Array of text chunks
 */
export declare function splitIntoChunks(text: string, maxTokensPerChunk: number, overlap?: number, model?: string): string[];
/**
 * Clear the tokenizer cache
 * Useful for testing or memory management
 */
export declare function clearTokenizerCache(): void;
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EACV,SAAS,EACT,YAAY,EACZ,WAAW,EACX,iBAAiB,EACjB,gBAAgB,EAChB,eAAe,EAChB,MAAM,YAAY,CAAC;AA6CpB;;;;;;;;;;;;GAYG;AACH,wBAAgB,WAAW,CAAC,YAAY,EAAE,YAAY,GAAG,SAAS,CAEjE;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAGhE;AAMD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,MAAM,CAO7E;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,WAAW,EAAE,EACvB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAiCR;AAED;;;;;;;;GAQG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,MAAM,EACpB,UAAU,EAAE,MAAM,EAClB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAUR;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,iBAAiB,CAC/B,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,MAAM,CA0BR;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,kBAAkB,CAChC,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,eAAe,CAmDjB;AAED;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,EAAE,WAAW,EAAE,EACvB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,eAAe,CAiDjB;AAMD;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAC3B,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE,MAAM,EACb,oBAAoB,GAAE,MAAa,GAClC,OAAO,CAKT;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,EACjB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAyCR;AAED;;;;;;;;GAQG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,OAAO,GAAE,MAAY,EACrB,KAAK,GAAE,MAAiB,GACvB,MAAM,EAAE,CAoDV;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,IAAI,IAAI,CAE1C"}
|