@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,331 @@
1
+ /**
2
+ * High-Level Tiktoken API
3
+ *
4
+ * Provides easy-to-use tokenization APIs with automatic vocabulary loading.
5
+ *
6
+ * This module is the main entry point for most users. It provides:
7
+ * - Lazy loading of vocabularies
8
+ * - Caching of tokenizer instances
9
+ * - Support for both sync and async operations
10
+ * - Model name resolution
11
+ */
12
+ import { CoreBPE, getEncodingDefinition, loadVocabularyFromUrl, loadVocabularyFromString, getVocabularyFromCache, isVocabularyCached, getTokenizerForModel, } from "./core/index.js";
13
/**
 * Tiktoken encoding instance
 *
 * This class wraps CoreBPE and provides:
 * - Lazy initialization
 * - Convenient API matching tiktoken-rs
 * - Error handling for unloaded vocabularies
 */
export class Tiktoken {
    /** The encoding name */
    name;
    /** The encoding definition */
    definition;
    /** The underlying CoreBPE instance (lazy loaded) */
    coreBPE = null;
    /** In-flight async initialization, or null when no load is running */
    initPromise = null;
    /** Whether the vocabulary has been loaded */
    isLoaded = false;
    /**
     * Create a new Tiktoken instance
     *
     * Note: The vocabulary is NOT loaded until you call load() or one of the
     * encoding methods with a loaded vocabulary.
     *
     * @param encodingName - The encoding name (e.g., "cl100k_base", "o200k_base")
     * @throws Error if the encoding name has no registered definition
     */
    constructor(encodingName) {
        const definition = getEncodingDefinition(encodingName);
        if (!definition) {
            throw new Error(`Unknown encoding: ${encodingName}`);
        }
        this.name = encodingName;
        this.definition = definition;
        // Check if vocabulary is already cached; if so, initialize eagerly
        if (isVocabularyCached(encodingName)) {
            this.initFromCache();
        }
    }
    /**
     * Initialize from cached vocabulary (no-op when the cache has no entry)
     */
    initFromCache() {
        const vocab = getVocabularyFromCache(this.name);
        if (vocab) {
            this.initCoreBPE(vocab.encoder, vocab.decoder);
        }
    }
    /**
     * Initialize the CoreBPE instance and mark the vocabulary as loaded
     *
     * @param encoder - Encoder mapping produced by the vocabulary loader
     * @param _decoder - Unused: CoreBPE builds its own decoder from the
     *   encoder. The parameter is kept for API consistency with the
     *   vocabulary-loading functions, which return both.
     */
    initCoreBPE(encoder, _decoder) {
        // Build the special-tokens map from the encoding definition
        const specialTokensEncoder = new Map();
        for (const [token, rank] of Object.entries(this.definition.specialTokens)) {
            specialTokensEncoder.set(token, rank);
        }
        this.coreBPE = new CoreBPE(encoder, specialTokensEncoder, this.definition.pattern);
        this.isLoaded = true;
    }
    /**
     * Load the vocabulary from URL
     *
     * Concurrent callers share a single in-flight request. If the load
     * fails, the in-flight promise is discarded so a later call can retry
     * rather than receiving the same cached rejection forever.
     *
     * @returns Promise that resolves when loaded
     */
    async load() {
        if (this.isLoaded) {
            return;
        }
        if (this.initPromise) {
            return this.initPromise;
        }
        this.initPromise = (async () => {
            try {
                const vocab = await loadVocabularyFromUrl(this.name);
                this.initCoreBPE(vocab.encoder, vocab.decoder); // decoder used internally in initCoreBPE
            }
            catch (err) {
                // Bug fix: previously a failed load left the rejected promise
                // cached in initPromise, so every subsequent load() failed
                // permanently. Clear it to allow retries.
                this.initPromise = null;
                throw err;
            }
        })();
        return this.initPromise;
    }
    /**
     * Load the vocabulary from a string (for embedded vocabularies)
     *
     * No-op when the vocabulary is already loaded.
     *
     * @param content - The vocabulary file content
     */
    loadFromString(content) {
        if (this.isLoaded) {
            return;
        }
        const vocab = loadVocabularyFromString(this.name, content);
        this.initCoreBPE(vocab.encoder, vocab.decoder);
    }
    /**
     * Ensure the vocabulary is loaded
     *
     * @returns The initialized CoreBPE instance
     * @throws Error when no vocabulary has been loaded yet
     */
    ensureLoaded() {
        if (!this.coreBPE) {
            throw new Error(`Vocabulary not loaded for ${this.name}. Call load() first or use the async API.`);
        }
        return this.coreBPE;
    }
    /**
     * Check if the vocabulary is loaded
     */
    get loaded() {
        return this.isLoaded;
    }
    // =========================================================================
    // Encoding Methods
    // =========================================================================
    /**
     * Encode text into token IDs without handling special tokens
     *
     * @param text - Text to encode
     * @returns Array of token IDs
     */
    encodeOrdinary(text) {
        return this.ensureLoaded().encodeOrdinary(text);
    }
    /**
     * Encode text into token IDs
     *
     * @param text - Text to encode
     * @param allowedSpecial - Special tokens to allow: the string "all",
     *   a Set of special-token strings, or undefined (none allowed)
     * @returns Array of token IDs
     */
    encode(text, allowedSpecial) {
        const bpe = this.ensureLoaded();
        if (allowedSpecial === "all") {
            return bpe.encodeWithSpecialTokens(text);
        }
        const allowed = allowedSpecial ?? new Set();
        // CoreBPE.encode returns a tuple; the token IDs are its first element
        return bpe.encode(text, allowed)[0];
    }
    /**
     * Encode text with all special tokens allowed
     *
     * @param text - Text to encode
     * @returns Array of token IDs
     */
    encodeWithSpecialTokens(text) {
        return this.ensureLoaded().encodeWithSpecialTokens(text);
    }
    // =========================================================================
    // Decoding Methods
    // =========================================================================
    /**
     * Decode token IDs back to text
     *
     * @param tokens - Array of token IDs
     * @returns Decoded text
     */
    decode(tokens) {
        return this.ensureLoaded().decode(tokens);
    }
    /**
     * Decode token IDs to raw bytes
     *
     * @param tokens - Array of token IDs
     * @returns Decoded bytes
     */
    decodeBytes(tokens) {
        return this.ensureLoaded().decodeBytes(tokens);
    }
    // =========================================================================
    // Utility Methods
    // =========================================================================
    /**
     * Count tokens in text (without returning the tokens)
     *
     * Special tokens are not interpreted (uses encodeOrdinary).
     *
     * @param text - Text to count
     * @returns Number of tokens
     */
    countTokens(text) {
        return this.encodeOrdinary(text).length;
    }
    /**
     * Get the vocabulary size (excluding special tokens)
     */
    get vocabSize() {
        return this.ensureLoaded().vocabSize;
    }
    /**
     * Get the total vocabulary size (including special tokens)
     */
    get totalVocabSize() {
        return this.ensureLoaded().totalVocabSize;
    }
    /**
     * Get all special tokens
     */
    getSpecialTokens() {
        return this.ensureLoaded().getSpecialTokens();
    }
    /**
     * Check if a token is a special token
     */
    isSpecialToken(token) {
        return this.ensureLoaded().isSpecialToken(token);
    }
    /**
     * Get the definition for this encoding
     */
    getDefinition() {
        return this.definition;
    }
}
220
// =========================================================================
// Factory Functions
// =========================================================================
/**
 * Cache of Tiktoken instances, keyed by encoding name.
 * Populated by getEncoding(); emptied by clearTiktokenCache().
 */
const tiktokenCache = new Map();
227
/**
 * Get or create a Tiktoken instance for an encoding
 *
 * Instances are cached per encoding name, so repeated calls with the
 * same name return the same object.
 *
 * @param encodingName - Encoding name
 * @returns Tiktoken instance (may not be loaded yet)
 */
export function getEncoding(encodingName) {
    const cached = tiktokenCache.get(encodingName);
    if (cached) {
        return cached;
    }
    const created = new Tiktoken(encodingName);
    tiktokenCache.set(encodingName, created);
    return created;
}
241
/**
 * Get or create a Tiktoken instance for a model
 *
 * Unknown model names do NOT throw: when no encoding mapping exists for
 * the model, this falls back to "o200k_base" (the encoding used by most
 * modern models). The previous doc claimed an Error was thrown, which
 * contradicted the implementation.
 *
 * @param modelName - Model name
 * @returns Tiktoken instance (may not be loaded yet)
 */
export function getEncodingForModel(modelName) {
    const encoding = getTokenizerForModel(modelName);
    if (!encoding) {
        // Default to o200k_base for unknown models (most modern models use this)
        return getEncoding("o200k_base");
    }
    return getEncoding(encoding);
}
256
/**
 * Get a loaded Tiktoken instance for an encoding (async)
 *
 * Resolves the cached (or freshly created) instance and awaits its
 * vocabulary load before returning it.
 *
 * @param encodingName - Encoding name
 * @returns Promise resolving to a loaded Tiktoken instance
 */
export async function getEncodingAsync(encodingName) {
    const tiktoken = getEncoding(encodingName);
    await tiktoken.load();
    return tiktoken;
}
267
/**
 * Get a loaded Tiktoken instance for a model (async)
 *
 * Resolves the model's encoding and awaits its vocabulary load before
 * returning the instance.
 *
 * @param modelName - Model name
 * @returns Promise resolving to a loaded Tiktoken instance
 */
export async function getEncodingForModelAsync(modelName) {
    const tiktoken = getEncodingForModel(modelName);
    await tiktoken.load();
    return tiktoken;
}
278
/**
 * Clear the Tiktoken instance cache
 *
 * After this, getEncoding()/getEncodingForModel() construct fresh
 * Tiktoken instances. Note: this clears only the instance cache here;
 * it does not touch the vocabulary cache used by the vocab loader.
 */
export function clearTiktokenCache() {
    tiktokenCache.clear();
}
284
// =========================================================================
// Convenience Functions
// =========================================================================
/**
 * Encode text using a specific encoding (async)
 *
 * Special tokens are not interpreted (delegates to encodeOrdinary).
 *
 * @param text - Text to encode
 * @param encodingName - Encoding name
 * @returns Promise resolving to token IDs
 */
export async function encodeAsync(text, encodingName = "o200k_base") {
    const encoder = await getEncodingAsync(encodingName);
    return encoder.encodeOrdinary(text);
}
298
/**
 * Decode tokens using a specific encoding (async)
 *
 * @param tokens - Token IDs
 * @param encodingName - Encoding name
 * @returns Promise resolving to decoded text
 */
export async function decodeAsync(tokens, encodingName = "o200k_base") {
    return (await getEncodingAsync(encodingName)).decode(tokens);
}
309
/**
 * Count tokens in text (async)
 *
 * Loads the encoding's vocabulary on first use, then counts via
 * ordinary encoding (special tokens are not interpreted).
 *
 * @param text - Text to count
 * @param encodingName - Encoding name
 * @returns Promise resolving to token count
 */
export async function countTokensAsync(text, encodingName = "o200k_base") {
    const encoder = await getEncodingAsync(encodingName);
    return encoder.countTokens(text);
}
320
/**
 * Count tokens for a model (async)
 *
 * Resolves the model's encoding (falling back per getEncodingForModel),
 * loads its vocabulary if needed, then counts tokens.
 *
 * @param text - Text to count
 * @param modelName - Model name
 * @returns Promise resolving to token count
 */
export async function countTokensForModelAsync(text, modelName) {
    return (await getEncodingForModelAsync(modelName)).countTokens(text);
}
331
+ //# sourceMappingURL=tiktoken.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tiktoken.js","sourceRoot":"","sources":["../src/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACL,OAAO,EAIP,qBAAqB,EAErB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,kBAAkB,EAClB,oBAAoB,GACrB,MAAM,iBAAiB,CAAC;AAEzB;;;;;;;GAOG;AACH,MAAM,OAAO,QAAQ;IACnB,wBAAwB;IACR,IAAI,CAAS;IAE7B,8BAA8B;IACb,UAAU,CAAqB;IAEhD,oDAAoD;IAC5C,OAAO,GAAmB,IAAI,CAAC;IAEvC,uCAAuC;IAC/B,WAAW,GAAyB,IAAI,CAAC;IAEjD,6CAA6C;IACrC,QAAQ,GAAG,KAAK,CAAC;IAEzB;;;;;;;OAOG;IACH,YAAY,YAAoB;QAC9B,MAAM,UAAU,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;QACvD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,qBAAqB,YAAY,EAAE,CAAC,CAAC;QACvD,CAAC;QAED,IAAI,CAAC,IAAI,GAAG,YAAY,CAAC;QACzB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAE7B,wCAAwC;QACxC,IAAI,kBAAkB,CAAC,YAAY,CAAC,EAAE,CAAC;YACrC,IAAI,CAAC,aAAa,EAAE,CAAC;QACvB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,MAAM,KAAK,GAAG,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,OAAmB,EAAE,QAA2B;QAClE,oFAAoF;QACpF,6EAA6E;QAE7E,4BAA4B;QAC5B,MAAM,oBAAoB,GAAG,IAAI,GAAG,EAAgB,CAAC;QACrD,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;YAC1E,oBAAoB,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QACxC,CAAC;QAED,IAAI,CAAC,OAAO,GAAG,IAAI,OAAO,CACxB,OAAO,EACP,oBAAoB,EACpB,IAAI,CAAC,UAAU,CAAC,OAAO,CACxB,CAAC;QACF,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,OAAO;QACT,CAAC;QAED,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QAED,IAAI,CAAC,WAAW,GAAG,CAAC,KAAK,IAAI,EAAE;YAC7B,MAAM,KAAK,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACrD,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,yCAAyC;QAC3F,CAAC,CAAC,EAAE,CAAC;QAEL,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;OAIG;IACH,cAAc,CAAC,OAAe;QAC5B,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,OAAO;QACT,CAAC;QAED,MAAM,KAAK,GAAG,wBAAwB,CAAC,I
AAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC3D,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACjD,CAAC;IAED;;OAEG;IACK,YAAY;QAClB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CACb,6BAA6B,IAAI,CAAC,IAAI,2CAA2C,CAClF,CAAC;QACJ,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED,4EAA4E;IAC5E,mBAAmB;IACnB,4EAA4E;IAE5E;;;;;OAKG;IACH,cAAc,CAAC,IAAY;QACzB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAClD,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY,EAAE,cAAoC;QACvD,MAAM,GAAG,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAEhC,IAAI,cAAc,KAAK,KAAK,EAAE,CAAC;YAC7B,OAAO,GAAG,CAAC,uBAAuB,CAAC,IAAI,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,OAAO,GAAG,cAAc,IAAI,IAAI,GAAG,EAAU,CAAC;QACpD,OAAO,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC;IAED;;;;;OAKG;IACH,uBAAuB,CAAC,IAAY;QAClC,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC3D,CAAC;IAED,4EAA4E;IAC5E,mBAAmB;IACnB,4EAA4E;IAE5E;;;;;OAKG;IACH,MAAM,CAAC,MAAc;QACnB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAC5C,CAAC;IAED;;;;;OAKG;IACH,WAAW,CAAC,MAAc;QACxB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IACjD,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACH,WAAW,CAAC,IAAY;QACtB,OAAO,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,SAAS,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,IAAI,cAAc;QAChB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,gBAAgB,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,KAAW;QACxB,OAAO,IAAI,CAAC,YAAY,EAAE,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACH,aAAa;QACX,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;CACF;AAED,4EAA4E;AAC5E,oBAAoB;AACpB,4EAA4E;AAE5E;;GAEG;AACH,MAAM,aAAa,GAA0B,IAAI,GAAG,EAAE,CAAC;AAEvD;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAAC,YAAoB;IAC9C,IAAI,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAE/C,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,QAAQ,GAAG,IAAI,QAAQ,CAA
C,YAAY,CAAC,CAAC;QACtC,aAAa,CAAC,GAAG,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IAC5C,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiB;IACnD,MAAM,QAAQ,GAAG,oBAAoB,CAAC,SAAS,CAAC,CAAC;IACjD,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,yEAAyE;QACzE,OAAO,WAAW,CAAC,YAAY,CAAC,CAAC;IACnC,CAAC;IACD,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;AAC/B,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,YAAoB;IAEpB,MAAM,QAAQ,GAAG,WAAW,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtB,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC5C,SAAiB;IAEjB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtB,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,aAAa,CAAC,KAAK,EAAE,CAAC;AACxB,CAAC;AAED,4EAA4E;AAC5E,wBAAwB;AACxB,4EAA4E;AAE5E;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAY,EACZ,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;AACvC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,MAAc,EACd,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AACjC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,IAAY,EACZ,eAAuB,YAAY;IAEnC,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,YAAY,CAAC,CAAC;IACtD,OAAO,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAC5C,IAAY,EACZ,SAAiB;IAEjB,MAAM,QAAQ,GAAG,MAAM,wBAAwB,CAAC,SAAS,CAAC,CAAC;IAC3D,OAAO,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC"}
@@ -0,0 +1,181 @@
1
/**
 * Main Tokenizer Implementation
 * Provides high-level tokenization APIs for AI applications
 *
 * NOTE: This is a declaration (.d.ts) file — it describes the public
 * signatures only; the implementations live in tokenizer.js.
 */
import type { Tokenizer, EncodingName, ChatMessage, TokenCountOptions, MaxTokensOptions, TokenEstimation } from "./types.js";
/**
 * Get a tokenizer for a specific encoding
 *
 * @param encodingName - Encoding name (cl100k_base, o200k_base, p50k_base)
 * @returns Tokenizer instance
 *
 * @example
 * ```typescript
 * const tokenizer = getEncoding("o200k_base");
 * const tokens = tokenizer.encode("Hello, world!");
 * console.log(tokens.length); // ~4 tokens
 * ```
 */
export declare function getEncoding(encodingName: EncodingName): Tokenizer;
/**
 * Get a tokenizer for a specific model
 *
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @returns Tokenizer instance configured for the model
 *
 * @example
 * ```typescript
 * const tokenizer = getEncodingForModelName("gpt-4o");
 * const count = tokenizer.countTokens("Hello, world!");
 * console.log(count); // ~4 tokens
 * ```
 */
export declare function getEncodingForModelName(model: string): Tokenizer;
/**
 * Count tokens in text
 *
 * @param text - Input text
 * @param options - Token counting options (model or encoding selection)
 * @returns Token count
 *
 * @example
 * ```typescript
 * // Count with default encoding (o200k_base)
 * const count = countTokens("Hello, world!");
 *
 * // Count with specific model
 * const count = countTokens("Hello, world!", { model: "gpt-4o" });
 *
 * // Count with specific encoding
 * const count = countTokens("Hello, world!", { encoding: "cl100k_base" });
 * ```
 */
export declare function countTokens(text: string, options?: TokenCountOptions): number;
/**
 * Count tokens in chat messages (includes message overhead)
 *
 * Chat messages have additional tokens for:
 * - Role markers: <|im_start|>role\n ... <|im_end|>
 * - Separator tokens between messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name for accurate counting
 * @returns Total token count including overhead
 *
 * @example
 * ```typescript
 * const messages = [
 *   { role: "system", content: "You are helpful." },
 *   { role: "user", content: "Hello!" }
 * ];
 * const count = countChatTokens(messages, "gpt-4o");
 * ```
 */
export declare function countChatTokens(messages: ChatMessage[], model?: string): number;
/**
 * Count tokens in a prompt with system message
 * Convenience function for common AI playground use case
 *
 * @param systemPrompt - System prompt content
 * @param userPrompt - User prompt content
 * @param model - Model name
 * @returns Total token count
 */
export declare function countPromptTokens(systemPrompt: string, userPrompt: string, model?: string): number;
/**
 * Estimate a safe max_tokens value to avoid truncation
 *
 * This function helps prevent the common issue where max_tokens is set too low,
 * causing the API to return an empty response with finish_reason: "length".
 *
 * @param promptText - The full prompt text (or use countChatTokens for messages)
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @param options - Estimation options
 * @returns Recommended max_tokens value
 *
 * @example
 * ```typescript
 * const prompt = "Write a story about a robot.";
 * const maxTokens = estimateMaxTokens(prompt, "gpt-5-nano", {
 *   desiredOutputTokens: 500,
 *   safetyMargin: 0.15
 * });
 * // Returns a value that ensures the response won't be truncated
 * ```
 */
export declare function estimateMaxTokens(promptText: string, model: string, options?: MaxTokensOptions): number;
/**
 * Get detailed token estimation with warnings
 *
 * @param promptText - Full prompt text
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 *
 * @example
 * ```typescript
 * const estimation = getTokenEstimation(longPrompt, "gpt-4o", {
 *   desiredOutputTokens: 2000
 * });
 *
 * if (!estimation.fitsInContext) {
 *   console.error(estimation.warning);
 * }
 *
 * const response = await openai.chat.completions.create({
 *   model: "gpt-4o",
 *   messages: [...],
 *   max_tokens: estimation.recommendedMaxTokens
 * });
 * ```
 */
export declare function getTokenEstimation(promptText: string, model: string, options?: MaxTokensOptions): TokenEstimation;
/**
 * Get token estimation for chat messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 */
export declare function getChatTokenEstimation(messages: ChatMessage[], model: string, options?: MaxTokensOptions): TokenEstimation;
/**
 * Check if text will fit within model context
 *
 * @param text - Input text
 * @param model - Model name
 * @param reservedOutputTokens - Tokens to reserve for output
 * @returns True if text fits within context
 */
export declare function fitsInContext(text: string, model: string, reservedOutputTokens?: number): boolean;
/**
 * Truncate text to fit within a token limit
 *
 * @param text - Input text
 * @param maxTokens - Maximum tokens allowed
 * @param model - Model name for accurate counting
 * @returns Truncated text
 *
 * @example
 * ```typescript
 * const truncated = truncateToTokenLimit(longText, 1000, "gpt-4o");
 * console.log(countTokens(truncated, { model: "gpt-4o" })); // <= 1000
 * ```
 */
export declare function truncateToTokenLimit(text: string, maxTokens: number, model?: string): string;
/**
 * Split text into chunks that fit within token limits
 *
 * @param text - Input text
 * @param maxTokensPerChunk - Maximum tokens per chunk
 * @param overlap - Number of characters to overlap between chunks
 *   (NOTE(review): unit is characters per this doc, not tokens — confirm
 *   against the tokenizer.js implementation)
 * @param model - Model name
 * @returns Array of text chunks
 */
export declare function splitIntoChunks(text: string, maxTokensPerChunk: number, overlap?: number, model?: string): string[];
/**
 * Clear the tokenizer cache
 * Useful for testing or memory management
 */
export declare function clearTokenizerCache(): void;
181
+ //# sourceMappingURL=tokenizer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EACV,SAAS,EACT,YAAY,EACZ,WAAW,EACX,iBAAiB,EACjB,gBAAgB,EAChB,eAAe,EAChB,MAAM,YAAY,CAAC;AA6CpB;;;;;;;;;;;;GAYG;AACH,wBAAgB,WAAW,CAAC,YAAY,EAAE,YAAY,GAAG,SAAS,CAEjE;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAGhE;AAMD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,MAAM,CAO7E;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,WAAW,EAAE,EACvB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAiCR;AAED;;;;;;;;GAQG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,MAAM,EACpB,UAAU,EAAE,MAAM,EAClB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAUR;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,iBAAiB,CAC/B,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,MAAM,CA0BR;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,kBAAkB,CAChC,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,eAAe,CAmDjB;AAED;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,EAAE,WAAW,EAAE,EACvB,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,gBAAgB,GACzB,eAAe,CAiDjB;AAMD;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAC3B,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE,MAAM,EACb,oBAAoB,GAAE,MAAa,GAClC,OAAO,CAKT;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,EACjB,KAAK,GAAE,MAAiB,GACvB,MAAM,CAyCR;AAED;;;;;;;;GAQG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,OAAO,GAAE,MAAY,EACrB,KAAK,GAAE,MAAiB,GACvB,MAAM,EAAE,CAoDV;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,IAAI,IAAI,CAE1C"}