@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0

package/dist/tokenizer.js
ADDED
@@ -0,0 +1,436 @@
/**
 * Main Tokenizer Implementation
 * Provides high-level tokenization APIs for AI applications
 */
import { BPETokenizer } from "./bpe.js";
import { getModelConfig, getEncodingForModel, getModelContextLimit, getModelMaxOutputTokens, } from "./models.js";
// =============================================================================
// Tokenizer Cache
// =============================================================================
/**
 * Cache of tokenizer instances by encoding name
 */
const tokenizerCache = new Map();
/**
 * Get or create a tokenizer for the specified encoding
 *
 * @param encoding - Encoding name
 * @returns Tokenizer instance
 */
function getOrCreateTokenizer(encoding) {
    let tokenizer = tokenizerCache.get(encoding);
    if (!tokenizer) {
        const bpe = new BPETokenizer(encoding);
        tokenizer = {
            encodingName: encoding,
            encode: (text) => bpe.encode(text),
            decode: (tokens) => bpe.decode(tokens),
            countTokens: (text) => bpe.countTokens(text),
        };
        tokenizerCache.set(encoding, tokenizer);
    }
    return tokenizer;
}
// =============================================================================
// Factory Functions
// =============================================================================
/**
 * Get a tokenizer for a specific encoding
 *
 * @param encodingName - Encoding name (cl100k_base, o200k_base, p50k_base)
 * @returns Tokenizer instance
 *
 * @example
 * ```typescript
 * const tokenizer = getEncoding("o200k_base");
 * const tokens = tokenizer.encode("Hello, world!");
 * console.log(tokens.length); // ~4 tokens
 * ```
 */
export function getEncoding(encodingName) {
    return getOrCreateTokenizer(encodingName);
}
/**
 * Get a tokenizer for a specific model
 *
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @returns Tokenizer instance configured for the model
 *
 * @example
 * ```typescript
 * const tokenizer = getEncodingForModelName("gpt-4o");
 * const count = tokenizer.countTokens("Hello, world!");
 * console.log(count); // ~4 tokens
 * ```
 */
export function getEncodingForModelName(model) {
    const encoding = getEncodingForModel(model);
    return getOrCreateTokenizer(encoding);
}
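
The two factory functions above share the module-level cache, so repeated lookups are cheap. A minimal usage sketch (not part of the package diff), assuming these exports are re-exported from the package root in dist/index.js; the comments describe expected behaviour rather than verified output:

    import { getEncoding, getEncodingForModelName } from "@hyvmind/tiktoken-ts";

    const byEncoding = getEncoding("o200k_base");
    const byModel = getEncodingForModelName("gpt-4o"); // gpt-4o maps to o200k_base

    const ids = byEncoding.encode("Hello, world!"); // token IDs (number[])
    const text = byEncoding.decode(ids); // BPE decoding should round-trip plain text
    console.log(text); // "Hello, world!"
    console.log(byModel.encodingName); // "o200k_base"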
// =============================================================================
// Token Counting Functions
// =============================================================================
/**
 * Count tokens in text
 *
 * @param text - Input text
 * @param options - Token counting options
 * @returns Token count
 *
 * @example
 * ```typescript
 * // Count with default encoding (o200k_base)
 * const count = countTokens("Hello, world!");
 *
 * // Count with specific model
 * const count = countTokens("Hello, world!", { model: "gpt-4o" });
 *
 * // Count with specific encoding
 * const count = countTokens("Hello, world!", { encoding: "cl100k_base" });
 * ```
 */
export function countTokens(text, options) {
    const encoding = options?.encoding ??
        (options?.model ? getEncodingForModel(options.model) : "o200k_base");
    const tokenizer = getOrCreateTokenizer(encoding);
    return tokenizer.countTokens(text);
}
/**
 * Count tokens in chat messages (includes message overhead)
 *
 * Chat messages have additional tokens for:
 * - Role markers: <|im_start|>role\n ... <|im_end|>
 * - Separator tokens between messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name for accurate counting
 * @returns Total token count including overhead
 *
 * @example
 * ```typescript
 * const messages = [
 *   { role: "system", content: "You are helpful." },
 *   { role: "user", content: "Hello!" }
 * ];
 * const count = countChatTokens(messages, "gpt-4o");
 * ```
 */
export function countChatTokens(messages, model = "gpt-4o") {
    const encoding = getEncodingForModel(model);
    const tokenizer = getOrCreateTokenizer(encoding);
    let totalTokens = 0;
    // Token overhead per message varies by model
    // For GPT-4o family: 3 tokens per message (role + separators)
    // For GPT-3.5/4: 4 tokens per message
    const config = getModelConfig(model);
    const tokensPerMessage = config?.family === "gpt-3.5" || config?.family === "gpt-4" ? 4 : 3;
    for (const message of messages) {
        // Message overhead
        totalTokens += tokensPerMessage;
        // Role token
        totalTokens += 1;
        // Content tokens
        totalTokens += tokenizer.countTokens(message.content);
        // Name field (if present) adds extra tokens
        if (message.name) {
            totalTokens += tokenizer.countTokens(message.name) + 1;
        }
    }
    // Assistant priming (3 tokens for "<|im_start|>assistant\n")
    totalTokens += 3;
    return totalTokens;
}
/**
 * Count tokens in a prompt with system message
 * Convenience function for common AI playground use case
 *
 * @param systemPrompt - System prompt content
 * @param userPrompt - User prompt content
 * @param model - Model name
 * @returns Total token count
 */
export function countPromptTokens(systemPrompt, userPrompt, model = "gpt-4o") {
    const messages = [];
    if (systemPrompt) {
        messages.push({ role: "system", content: systemPrompt });
    }
    messages.push({ role: "user", content: userPrompt });
    return countChatTokens(messages, model);
}
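
Since countPromptTokens just builds a two-message array and delegates to countChatTokens, both calls below should agree. A sketch (again not part of the diff), assuming the root re-exports the helpers and the ChatMessage type from dist/types:

    import { countChatTokens, countPromptTokens, type ChatMessage } from "@hyvmind/tiktoken-ts";

    const messages: ChatMessage[] = [
        { role: "system", content: "You are helpful." },
        { role: "user", content: "Hello!" },
    ];

    // Content tokens + per-message overhead (3 or 4) + 3 tokens of assistant priming.
    const viaMessages = countChatTokens(messages, "gpt-4o");
    const viaPrompts = countPromptTokens("You are helpful.", "Hello!", "gpt-4o");
    console.log(viaMessages === viaPrompts); // true: identical message arrays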
// =============================================================================
// Max Tokens Estimation
// =============================================================================
/**
 * Default options for max tokens estimation
 */
const DEFAULT_MAX_TOKENS_OPTIONS = {
    desiredOutputTokens: 1000,
    safetyMargin: 0.1,
    minOutputTokens: 100,
    maxOutputTokensCap: 16384,
};
/**
 * Estimate a safe max_tokens value to avoid truncation
 *
 * This function helps prevent the common issue where max_tokens is set too low,
 * causing the API to return an empty response with finish_reason: "length".
 *
 * @param promptText - The full prompt text (or use countChatTokens for messages)
 * @param model - Model name (e.g., "gpt-4o", "gpt-5-nano")
 * @param options - Estimation options
 * @returns Recommended max_tokens value
 *
 * @example
 * ```typescript
 * const prompt = "Write a story about a robot.";
 * const maxTokens = estimateMaxTokens(prompt, "gpt-5-nano", {
 *   desiredOutputTokens: 500,
 *   safetyMargin: 0.15
 * });
 * // Returns a value that ensures the response won't be truncated
 * ```
 */
export function estimateMaxTokens(promptText, model, options) {
    const opts = { ...DEFAULT_MAX_TOKENS_OPTIONS, ...options };
    // Get model configuration
    const contextLimit = getModelContextLimit(model);
    const modelMaxOutput = getModelMaxOutputTokens(model);
    // Count prompt tokens
    const promptTokens = countTokens(promptText, { model });
    // Calculate available tokens (context - prompt - safety buffer)
    const safetyBuffer = Math.ceil(promptTokens * opts.safetyMargin);
    const availableTokens = contextLimit - promptTokens - safetyBuffer;
    // Apply constraints
    let recommendedMaxTokens = Math.min(opts.desiredOutputTokens, availableTokens, modelMaxOutput, opts.maxOutputTokensCap);
    // Ensure minimum
    recommendedMaxTokens = Math.max(recommendedMaxTokens, opts.minOutputTokens);
    return Math.round(recommendedMaxTokens);
}
/**
 * Get detailed token estimation with warnings
 *
 * @param promptText - Full prompt text
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 *
 * @example
 * ```typescript
 * const estimation = getTokenEstimation(longPrompt, "gpt-4o", {
 *   desiredOutputTokens: 2000
 * });
 *
 * if (!estimation.fitsInContext) {
 *   console.error(estimation.warning);
 * }
 *
 * const response = await openai.chat.completions.create({
 *   model: "gpt-4o",
 *   messages: [...],
 *   max_tokens: estimation.recommendedMaxTokens
 * });
 * ```
 */
export function getTokenEstimation(promptText, model, options) {
    const opts = { ...DEFAULT_MAX_TOKENS_OPTIONS, ...options };
    // Get model configuration
    const contextLimit = getModelContextLimit(model);
    const modelMaxOutput = getModelMaxOutputTokens(model);
    // Count prompt tokens
    const promptTokens = countTokens(promptText, { model });
    // Calculate safety buffer
    const safetyBuffer = Math.ceil(promptTokens * opts.safetyMargin);
    // Calculate available tokens for output
    const availableOutputTokens = Math.max(0, contextLimit - promptTokens - safetyBuffer);
    // Check if prompt fits in context
    const fitsInContext = promptTokens < contextLimit - opts.minOutputTokens;
    // Calculate recommended max_tokens
    let recommendedMaxTokens = Math.min(opts.desiredOutputTokens, availableOutputTokens, modelMaxOutput, opts.maxOutputTokensCap);
    recommendedMaxTokens = Math.max(recommendedMaxTokens, opts.minOutputTokens);
    // Generate warning if needed
    let warning;
    if (!fitsInContext) {
        warning = `Prompt (${promptTokens} tokens) exceeds context limit (${contextLimit}) minus minimum output (${opts.minOutputTokens}). Response may be truncated or fail.`;
    }
    else if (availableOutputTokens < opts.desiredOutputTokens) {
        warning = `Desired output (${opts.desiredOutputTokens} tokens) reduced to ${recommendedMaxTokens} due to context constraints.`;
    }
    else if (recommendedMaxTokens === opts.minOutputTokens) {
        warning = `Output limited to minimum (${opts.minOutputTokens} tokens). Consider reducing prompt size.`;
    }
    return {
        promptTokens,
        recommendedMaxTokens: Math.round(recommendedMaxTokens),
        contextLimit,
        fitsInContext,
        availableOutputTokens,
        warning,
    };
}
/**
 * Get token estimation for chat messages
 *
 * @param messages - Array of chat messages
 * @param model - Model name
 * @param options - Estimation options
 * @returns Detailed token estimation
 */
export function getChatTokenEstimation(messages, model, options) {
    const opts = { ...DEFAULT_MAX_TOKENS_OPTIONS, ...options };
    // Get model configuration
    const contextLimit = getModelContextLimit(model);
    const modelMaxOutput = getModelMaxOutputTokens(model);
    // Count prompt tokens (with chat overhead)
    const promptTokens = countChatTokens(messages, model);
    // Calculate safety buffer
    const safetyBuffer = Math.ceil(promptTokens * opts.safetyMargin);
    // Calculate available tokens for output
    const availableOutputTokens = Math.max(0, contextLimit - promptTokens - safetyBuffer);
    // Check if messages fit in context
    const fitsInContext = promptTokens < contextLimit - opts.minOutputTokens;
    // Calculate recommended max_tokens
    let recommendedMaxTokens = Math.min(opts.desiredOutputTokens, availableOutputTokens, modelMaxOutput, opts.maxOutputTokensCap);
    recommendedMaxTokens = Math.max(recommendedMaxTokens, opts.minOutputTokens);
    // Generate warning if needed
    let warning;
    if (!fitsInContext) {
        warning = `Messages (${promptTokens} tokens) exceed context limit (${contextLimit}) minus minimum output (${opts.minOutputTokens}). Response may be truncated or fail.`;
    }
    else if (availableOutputTokens < opts.desiredOutputTokens) {
        warning = `Desired output (${opts.desiredOutputTokens} tokens) reduced to ${recommendedMaxTokens} due to context constraints.`;
    }
    return {
        promptTokens,
        recommendedMaxTokens: Math.round(recommendedMaxTokens),
        contextLimit,
        fitsInContext,
        availableOutputTokens,
        warning,
    };
}
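
Both estimators return the same TokenEstimation shape, designed to be fed straight into a completion request. A sketch of the intended flow (not part of the diff), mirroring the @example in the JSDoc above and assuming root re-exports:

    import { getChatTokenEstimation } from "@hyvmind/tiktoken-ts";

    const estimation = getChatTokenEstimation(
        [{ role: "user", content: "Summarize the attached report." }],
        "gpt-4o",
        { desiredOutputTokens: 2000, safetyMargin: 0.15 },
    );

    if (!estimation.fitsInContext) {
        // The prompt already eats into the minimum output budget; trim it first.
        throw new Error(estimation.warning);
    }
    if (estimation.warning) {
        console.warn(estimation.warning); // e.g. desired output clamped to fit
    }
    // estimation.recommendedMaxTokens is the value to pass as max_tokens.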
// =============================================================================
// Utility Functions
// =============================================================================
/**
 * Check if text will fit within model context
 *
 * @param text - Input text
 * @param model - Model name
 * @param reservedOutputTokens - Tokens to reserve for output
 * @returns True if text fits within context
 */
export function fitsInContext(text, model, reservedOutputTokens = 1000) {
    const tokens = countTokens(text, { model });
    const contextLimit = getModelContextLimit(model);
    return tokens + reservedOutputTokens <= contextLimit;
}
/**
 * Truncate text to fit within a token limit
 *
 * @param text - Input text
 * @param maxTokens - Maximum tokens allowed
 * @param model - Model name for accurate counting
 * @returns Truncated text
 *
 * @example
 * ```typescript
 * const truncated = truncateToTokenLimit(longText, 1000, "gpt-4o");
 * console.log(countTokens(truncated, { model: "gpt-4o" })); // <= 1000
 * ```
 */
export function truncateToTokenLimit(text, maxTokens, model = "gpt-4o") {
    const encoding = getEncodingForModel(model);
    const tokenizer = getOrCreateTokenizer(encoding);
    const currentTokens = tokenizer.countTokens(text);
    if (currentTokens <= maxTokens) {
        return text;
    }
    // Binary search for the right truncation point
    let low = 0;
    let high = text.length;
    let result = "";
    while (low < high) {
        const mid = Math.floor((low + high + 1) / 2);
        const truncated = text.slice(0, mid);
        const tokens = tokenizer.countTokens(truncated);
        if (tokens <= maxTokens) {
            result = truncated;
            low = mid;
        }
        else {
            high = mid - 1;
        }
    }
    // Add ellipsis if truncated
    if (result.length < text.length) {
        // Remove last few characters to make room for ellipsis
        while (result.length > 0 &&
            tokenizer.countTokens(result + "...") > maxTokens) {
            result = result.slice(0, -1);
        }
        result += "...";
    }
    return result;
}
/**
 * Split text into chunks that fit within token limits
 *
 * @param text - Input text
 * @param maxTokensPerChunk - Maximum tokens per chunk
 * @param overlap - Number of characters to overlap between chunks
 * @param model - Model name
 * @returns Array of text chunks
 */
export function splitIntoChunks(text, maxTokensPerChunk, overlap = 200, model = "gpt-4o") {
    // If text fits in one chunk, return it directly
    const totalTokens = countTokens(text, { model });
    if (totalTokens <= maxTokensPerChunk) {
        return [text.trim()].filter((s) => s.length > 0);
    }
    const chunks = [];
    let remaining = text;
    while (remaining.length > 0) {
        // Find the maximum chunk that fits
        let chunkEnd = remaining.length;
        let chunk = remaining.slice(0, chunkEnd);
        while (chunkEnd > 0 && countTokens(chunk, { model }) > maxTokensPerChunk) {
            // Try to break at a sentence or paragraph boundary
            const breakPoints = ["\n\n", "\n", ". ", "! ", "? ", ", ", " "];
            let foundBreak = false;
            for (const bp of breakPoints) {
                const lastBreak = chunk.lastIndexOf(bp, chunkEnd - 1);
                if (lastBreak > 0) {
                    chunkEnd = lastBreak + bp.length;
                    chunk = remaining.slice(0, chunkEnd);
                    if (countTokens(chunk, { model }) <= maxTokensPerChunk) {
                        foundBreak = true;
                        break;
                    }
                }
            }
            if (!foundBreak) {
                // No good break point, just truncate
                chunkEnd = Math.floor(chunkEnd * 0.8);
                chunk = remaining.slice(0, chunkEnd);
            }
        }
        const trimmedChunk = chunk.trim();
        if (trimmedChunk.length > 0) {
            chunks.push(trimmedChunk);
        }
        // Move position forward, accounting for overlap
        // Ensure we always make progress (minimum advance is half of chunk or 1)
        const minAdvance = Math.max(Math.floor(chunkEnd / 2), 1);
        const advance = Math.max(chunkEnd - overlap, minAdvance);
        remaining = remaining.slice(advance);
    }
    return chunks;
}
/**
 * Clear the tokenizer cache
 * Useful for testing or memory management
 */
export function clearTokenizerCache() {
    tokenizerCache.clear();
}
//# sourceMappingURL=tokenizer.js.map

package/dist/tokenizer.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAUH,OAAO,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AACxC,OAAO,EACL,cAAc,EACd,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,GACxB,MAAM,aAAa,CAAC;AAErB,gFAAgF;AAChF,kBAAkB;AAClB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,cAAc,GAAiC,IAAI,GAAG,EAAE,CAAC;AAE/D;;;;;GAKG;AACH,SAAS,oBAAoB,CAAC,QAAsB;IAClD,IAAI,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE7C,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,QAAQ,CAAC,CAAC;QACvC,SAAS,GAAG;YACV,YAAY,EAAE,QAAQ;YACtB,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;YAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;YAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;SACrD,CAAC;QACF,cAAc,CAAC,GAAG,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,gFAAgF;AAChF,oBAAoB;AACpB,gFAAgF;AAEhF;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,WAAW,CAAC,YAA0B;IACpD,OAAO,oBAAoB,CAAC,YAAY,CAAC,CAAC;AAC5C,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAa;IACnD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAC5C,OAAO,oBAAoB,CAAC,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,gFAAgF;AAChF,2BAA2B;AAC3B,gFAAgF;AAEhF;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY,EAAE,OAA2B;IACnE,MAAM,QAAQ,GACZ,OAAO,EAAE,QAAQ;QACjB,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,mBAAmB,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC;IACvE,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAEjD,OAAO,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;AACrC,CAAC;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,eAAe,CAC7B,QAAuB,EACvB,QAAgB,QAAQ;IAExB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAC5C,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAEjD,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,6CAA6C;IAC7C,8DAA8D;IAC9D,sCAAsC;IACtC,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;IACrC,MAAM,gBAAgB,GACpB,MAAM,EAAE,MAAM,KAAK,SAAS,IAAI,MAAM,EAAE,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAErE,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,mBAAmB;QACnB,WAAW,IAAI,gBAAgB,CAAC;QAEhC,aAAa;QACb,WAAW,IAAI,CAAC,CAAC;QAEjB,iBAAiB;QACjB,WAAW,IAAI,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEtD,4CAA4C;QAC5C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,WAAW,IAAI,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,6DAA6D;IAC7D,WAAW,IAAI,CAAC,CAAC;IAEjB,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,iBAAiB,CAC/B,YAAoB,EACpB,UAAkB,EAClB,QAAgB,QAAQ;IAExB,MAAM,QAAQ,GAAkB,EAAE,CAAC;IAEnC,IAAI,YAAY,EAAE,CAAC;QACjB,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC;IAC3D,CAAC;IAED,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;IAErD,OAAO,eAAe,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;AAC1C,CAAC;AAED,gFAAgF;AAChF,wBAAwB;AACxB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,0BAA0B,GAA+B;IAC7D,mBAAmB,EAAE,IAAI;IACzB,YAAY,EAAE,GAAG;IACjB,eAAe,EAAE,GAAG;IACpB,kBAAkB,EAAE,KAAK;CAC1B,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,iBAAiB,CAC/B,UAAkB,EAClB,KAAa,EACb,OAA0B;IAE1B,MAAM,IAAI,GAAG,EAAE,GAAG,0BAA0B,EAAE,GAAG,OAAO,EAAE,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,YAAY,GAAG,oBAAoB,CAAC,KAAK,CAAC,CAAC;IACjD,MAAM,cAAc,GAAG,uBAAuB,CAAC,KAAK,CAAC,CAAC;IAEtD,sBAAsB;IACtB,MAAM,YAAY,GAAG,WAAW,CAAC,UAAU,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IAExD,gEAAgE;IAChE,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;IACjE,MAAM,eAAe,GAAG,YAAY,GAAG,YAAY,GAAG,YAAY,CAAC;IAEnE,oBAAoB;IACpB,IAAI,oBAAoB,GAAG,IAAI,CAAC,GAAG,CACjC,IAAI,CAAC,mBAAmB,EACxB,eAAe,EACf,cAAc,EACd,IAAI,CAAC,kBAAkB,CACxB,CAAC;IAEF,iBAAiB;IACjB,oBAAoB,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB,EAAE,IAAI,CAAC,eAAe,CAAC,CAAC;
IAE5E,OAAO,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;AAC1C,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,UAAU,kBAAkB,CAChC,UAAkB,EAClB,KAAa,EACb,OAA0B;IAE1B,MAAM,IAAI,GAAG,EAAE,GAAG,0BAA0B,EAAE,GAAG,OAAO,EAAE,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,YAAY,GAAG,oBAAoB,CAAC,KAAK,CAAC,CAAC;IACjD,MAAM,cAAc,GAAG,uBAAuB,CAAC,KAAK,CAAC,CAAC;IAEtD,sBAAsB;IACtB,MAAM,YAAY,GAAG,WAAW,CAAC,UAAU,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IAExD,0BAA0B;IAC1B,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;IAEjE,wCAAwC;IACxC,MAAM,qBAAqB,GAAG,IAAI,CAAC,GAAG,CACpC,CAAC,EACD,YAAY,GAAG,YAAY,GAAG,YAAY,CAC3C,CAAC;IAEF,kCAAkC;IAClC,MAAM,aAAa,GAAG,YAAY,GAAG,YAAY,GAAG,IAAI,CAAC,eAAe,CAAC;IAEzE,mCAAmC;IACnC,IAAI,oBAAoB,GAAG,IAAI,CAAC,GAAG,CACjC,IAAI,CAAC,mBAAmB,EACxB,qBAAqB,EACrB,cAAc,EACd,IAAI,CAAC,kBAAkB,CACxB,CAAC;IAEF,oBAAoB,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB,EAAE,IAAI,CAAC,eAAe,CAAC,CAAC;IAE5E,6BAA6B;IAC7B,IAAI,OAA2B,CAAC;IAEhC,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,WAAW,YAAY,mCAAmC,YAAY,2BAA2B,IAAI,CAAC,eAAe,uCAAuC,CAAC;IACzK,CAAC;SAAM,IAAI,qBAAqB,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;QAC5D,OAAO,GAAG,mBAAmB,IAAI,CAAC,mBAAmB,uBAAuB,oBAAoB,8BAA8B,CAAC;IACjI,CAAC;SAAM,IAAI,oBAAoB,KAAK,IAAI,CAAC,eAAe,EAAE,CAAC;QACzD,OAAO,GAAG,8BAA8B,IAAI,CAAC,eAAe,0CAA0C,CAAC;IACzG,CAAC;IAED,OAAO;QACL,YAAY;QACZ,oBAAoB,EAAE,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC;QACtD,YAAY;QACZ,aAAa;QACb,qBAAqB;QACrB,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,sBAAsB,CACpC,QAAuB,EACvB,KAAa,EACb,OAA0B;IAE1B,MAAM,IAAI,GAAG,EAAE,GAAG,0BAA0B,EAAE,GAAG,OAAO,EAAE,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,YAAY,GAAG,oBAAoB,CAAC,KAAK,CAAC,CAAC;IACjD,MAAM,cAAc,GAAG,uBAAuB,CAAC,KAAK,CAAC,CAAC;IAEtD,2CAA2C;IAC3C,MAAM,YAAY,GAAG,eAAe,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;IAEtD,0BAA0B;IAC1B,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;IAEjE,wCAAwC;IACxC,MAAM,qBAAqB,GAAG,IAAI,CAAC,GAAG,CACpC,CAAC,EACD,YAAY,GAAG,YAAY,GAAG,YAAY,CAC3C,CAAC;IAEF,mCAAmC;IACnC,MAAM,aAAa,GAAG,YAAY,GAAG,YAAY,GAAG,IAAI,CAAC,eAAe,CAAC;IAEzE,mCAAmC;IACnC,IAAI,oBAAoB,GAAG,IAAI,CAAC,GAAG,CACjC,IAAI,CAAC,mBAAmB,EACxB,qBAAqB,EACrB,cAAc,EACd,IAAI,CAAC,kBAAkB,CACxB,CAAC;IAEF,oBAAoB,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB,EAAE,IAAI,CAAC,eAAe,CAAC,CAAC;IAE5E,6BAA6B;IAC7B,IAAI,OAA2B,CAAC;IAEhC,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,aAAa,YAAY,kCAAkC,YAAY,2BAA2B,IAAI,CAAC,eAAe,uCAAuC,CAAC;IAC1K,CAAC;SAAM,IAAI,qBAAqB,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;QAC5D,OAAO,GAAG,mBAAmB,IAAI,CAAC,mBAAmB,uBAAuB,oBAAoB,8BAA8B,CAAC;IACjI,CAAC;IAED,OAAO;QACL,YAAY;QACZ,oBAAoB,EAAE,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC;QACtD,YAAY;QACZ,aAAa;QACb,qBAAqB;QACrB,OAAO;KACR,CAAC;AACJ,CAAC;AAED,gFAAgF;AAChF,oBAAoB;AACpB,gFAAgF;AAEhF;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAC3B,IAAY,EACZ,KAAa,EACb,uBAA+B,IAAI;IAEnC,MAAM,MAAM,GAAG,WAAW,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IAC5C,MAAM,YAAY,GAAG,oBAAoB,CAAC,KAAK,CAAC,CAAC;IAEjD,OAAO,MAAM,GAAG,oBAAoB,IAAI,YAAY,CAAC;AACvD,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,SAAiB,EACjB,QAAgB,QAAQ;IAExB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAC5C,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAEjD,MAAM,aAAa,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAElD,IAAI,aAAa,IAAI,SAAS,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,+CAA+C;IAC/C,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;IACvB,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,OAAO,GAAG,GAAG,IAAI,EAAE,CAAC;QAClB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,SAAS,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAEhD,IAAI,MAAM,IAAI,SAAS,E
AAE,CAAC;YACxB,MAAM,GAAG,SAAS,CAAC;YACnB,GAAG,GAAG,GAAG,CAAC;QACZ,CAAC;aAAM,CAAC;YACN,IAAI,GAAG,GAAG,GAAG,CAAC,CAAC;QACjB,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAChC,uDAAuD;QACvD,OACE,MAAM,CAAC,MAAM,GAAG,CAAC;YACjB,SAAS,CAAC,WAAW,CAAC,MAAM,GAAG,KAAK,CAAC,GAAG,SAAS,EACjD,CAAC;YACD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC;QACD,MAAM,IAAI,KAAK,CAAC;IAClB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,iBAAyB,EACzB,UAAkB,GAAG,EACrB,QAAgB,QAAQ;IAExB,gDAAgD;IAChD,MAAM,WAAW,GAAG,WAAW,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IACjD,IAAI,WAAW,IAAI,iBAAiB,EAAE,CAAC;QACrC,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACnD,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,SAAS,GAAG,IAAI,CAAC;IAErB,OAAO,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,mCAAmC;QACnC,IAAI,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC;QAChC,IAAI,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAEzC,OAAO,QAAQ,GAAG,CAAC,IAAI,WAAW,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,GAAG,iBAAiB,EAAE,CAAC;YACzE,mDAAmD;YACnD,MAAM,WAAW,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YAEhE,IAAI,UAAU,GAAG,KAAK,CAAC;YACvB,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;gBAC7B,MAAM,SAAS,GAAG,KAAK,CAAC,WAAW,CAAC,EAAE,EAAE,QAAQ,GAAG,CAAC,CAAC,CAAC;gBACtD,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;oBAClB,QAAQ,GAAG,SAAS,GAAG,EAAE,CAAC,MAAM,CAAC;oBACjC,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;oBACrC,IAAI,WAAW,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,IAAI,iBAAiB,EAAE,CAAC;wBACvD,UAAU,GAAG,IAAI,CAAC;wBAClB,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAED,IAAI,CAAC,UAAU,EAAE,CAAC;gBAChB,qCAAqC;gBACrC,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC;gBACtC,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;QAED,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;QAClC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC5B,CAAC;QAED,gDAAgD;QAChD,yEAAyE;QACzE,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACzD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,GAAG,OAAO,EAAE,UAAU,CAAC,CAAC;QACzD,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,mBAAmB;IACjC,cAAc,CAAC,KAAK,EAAE,CAAC;AACzB,CAAC"}
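
A sketch tying the context-guard utilities together for long inputs (not part of the diff); loadDraft is a hypothetical stand-in for wherever the text comes from, and the helpers are assumed re-exported from the package root:

    import { fitsInContext, truncateToTokenLimit, countTokens } from "@hyvmind/tiktoken-ts";

    declare function loadDraft(): string; // hypothetical source of long input

    const draft = loadDraft();
    // fitsInContext reserves 1000 output tokens unless told otherwise.
    if (!fitsInContext(draft, "gpt-4o")) {
        // Binary-search truncation; appends "..." whenever text was cut.
        const trimmed = truncateToTokenLimit(draft, 4000, "gpt-4o");
        console.log(countTokens(trimmed, { model: "gpt-4o" })); // <= 4000
    }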

package/dist/types.d.ts
ADDED
@@ -0,0 +1,127 @@
/**
 * Tokenizer Types
 * Core interfaces for the tiktoken-ts package
 */
/**
 * Encoding names supported by the tokenizer
 * - r50k_base: GPT-2, GPT-3 davinci/curie/babbage/ada
 * - p50k_base: Codex models, text-davinci-002/003
 * - p50k_edit: Edit models (text-davinci-edit-001, code-davinci-edit-001)
 * - cl100k_base: GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 * - o200k_base: GPT-4o, GPT-4.1, GPT-5, o-series models
 * - o200k_harmony: gpt-oss models
 * - claude_estimation: Claude models (estimation only, errs on over-counting for safety)
 */
export type EncodingName = "r50k_base" | "p50k_base" | "p50k_edit" | "cl100k_base" | "o200k_base" | "o200k_harmony" | "claude_estimation" | "gpt2";
/**
 * Model family classification for encoding selection
 */
export type ModelFamily = "gpt-5" | "gpt-4.1" | "gpt-4o" | "gpt-4" | "gpt-3.5" | "o-series" | "codex" | "embedding" | "deepseek" | "gemini" | "claude" | "unknown";
/**
 * Special token types used in chat format
 */
export interface SpecialTokens {
    readonly endOfText: string;
    readonly imStart: string;
    readonly imEnd: string;
    readonly imSep: string;
    readonly toolCall?: string;
    readonly toolResult?: string;
}
/**
 * Configuration for an encoding
 */
export interface EncodingConfig {
    readonly name: EncodingName;
    readonly patternSplit: RegExp;
    readonly specialTokens: SpecialTokens;
    readonly vocabSize: number;
    readonly averageCharsPerToken: number;
}
/**
 * Core tokenizer interface
 */
export interface Tokenizer {
    /**
     * Encode text into token IDs
     * @param text - Input text to encode
     * @returns Array of token IDs
     */
    encode(text: string): number[];
    /**
     * Decode token IDs back to text
     * @param tokens - Array of token IDs
     * @returns Decoded text
     */
    decode(tokens: number[]): string;
    /**
     * Count the number of tokens in text
     * @param text - Input text
     * @returns Token count
     */
    countTokens(text: string): number;
    /**
     * Get the encoding name
     */
    readonly encodingName: EncodingName;
}
/**
 * Model context configuration
 */
export interface ModelConfig {
    readonly name: string;
    readonly encoding: EncodingName;
    readonly contextLimit: number;
    readonly maxOutputTokens: number;
    readonly family: ModelFamily;
}
/**
 * Token estimation result
 */
export interface TokenEstimation {
    /** Estimated prompt tokens */
    promptTokens: number;
    /** Recommended max_tokens value */
    recommendedMaxTokens: number;
    /** Model context limit */
    contextLimit: number;
    /** Whether the prompt fits in context */
    fitsInContext: boolean;
    /** Available tokens for output */
    availableOutputTokens: number;
    /** Warning message if any */
    warning?: string;
}
/**
 * Chat message for token counting
 */
export interface ChatMessage {
    role: "system" | "user" | "assistant" | "tool";
    content: string;
    name?: string;
}
/**
 * Token counting options
 */
export interface TokenCountOptions {
    /** Model name for accurate counting */
    model?: string;
    /** Encoding name override */
    encoding?: EncodingName;
    /** Include special tokens in count */
    includeSpecialTokens?: boolean;
}
/**
 * Max tokens estimation options
 */
export interface MaxTokensOptions {
    /** Desired output token count */
    desiredOutputTokens?: number;
    /** Safety margin percentage (0.0 to 1.0) */
    safetyMargin?: number;
    /** Minimum output tokens to reserve */
    minOutputTokens?: number;
    /** Maximum output tokens cap */
    maxOutputTokensCap?: number;
}
//# sourceMappingURL=types.d.ts.map

package/dist/types.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;;;;;;;;GASG;AACH,MAAM,MAAM,YAAY,GACpB,WAAW,GACX,WAAW,GACX,WAAW,GACX,aAAa,GACb,YAAY,GACZ,eAAe,GACf,mBAAmB,GACnB,MAAM,CAAC;AAEX;;GAEG;AACH,MAAM,MAAM,WAAW,GACnB,OAAO,GACP,SAAS,GACT,QAAQ,GACR,OAAO,GACP,SAAS,GACT,UAAU,GACV,OAAO,GACP,WAAW,GACX,UAAU,GACV,QAAQ,GACR,QAAQ,GACR,SAAS,CAAC;AAEd;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC;IAC5B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,aAAa,EAAE,aAAa,CAAC;IACtC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,oBAAoB,EAAE,MAAM,CAAC;CACvC;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB;;;;OAIG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAE/B;;;;OAIG;IACH,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;IAEjC;;;;OAIG;IACH,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;IAElC;;OAEG;IACH,QAAQ,CAAC,YAAY,EAAE,YAAY,CAAC;CACrC;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,YAAY,CAAC;IAChC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,MAAM,EAAE,WAAW,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,oBAAoB,EAAE,MAAM,CAAC;IAC7B,0BAA0B;IAC1B,YAAY,EAAE,MAAM,CAAC;IACrB,yCAAyC;IACzC,aAAa,EAAE,OAAO,CAAC;IACvB,kCAAkC;IAClC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,6BAA6B;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,GAAG,MAAM,CAAC;IAC/C,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,uCAAuC;IACvC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,6BAA6B;IAC7B,QAAQ,CAAC,EAAE,YAAY,CAAC;IACxB,sCAAsC;IACtC,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,iCAAiC;IACjC,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,4CAA4C;IAC5C,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uCAAuC;IACvC,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,gCAAgC;IAChC,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B"}

package/dist/types.js
ADDED
@@ -0,0 +1,6 @@
/**
 * Tokenizer Types
 * Core interfaces for the tiktoken-ts package
 */
export {};
//# sourceMappingURL=types.js.map

package/dist/types.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;GAGG"}
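
Finally, a sketch of splitIntoChunks from dist/tokenizer.js (not part of the diff), e.g. for batching a long document for embeddings. Note that the overlap argument is measured in characters, not tokens, per its JSDoc; the input is a hypothetical stand-in:

    import { splitIntoChunks, countTokens } from "@hyvmind/tiktoken-ts";

    declare const longDocument: string; // hypothetical long input

    // 512-token chunks, with roughly 200 characters of overlap between neighbours.
    const chunks = splitIntoChunks(longDocument, 512, 200, "gpt-4o");

    for (const chunk of chunks) {
        // Each chunk respects the per-chunk token budget.
        console.assert(countTokens(chunk, { model: "gpt-4o" }) <= 512);
    }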