@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,299 @@
1
/**
 * Model to Encoding Mappings
 *
 * EXACT mappings from tiktoken-rs for model name to encoding.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
 */
/**
 * Ordered model-prefix -> tokenizer rules.
 *
 * Scanned front to back, so the more specific prefixes must appear
 * before the generic ones (e.g. "gpt-4o-" before "gpt-4-").
 * Mirrors tiktoken-rs MODEL_PREFIX_TO_TOKENIZER.
 */
const TOKENIZER_BY_MODEL_PREFIX = [
    // Reasoning models
    ["o1-", "o200k_base"],
    ["o3-", "o200k_base"],
    ["o4-", "o200k_base"],
    // Chat models
    ["gpt-5-", "o200k_base"],
    ["gpt-4.5-", "o200k_base"],
    ["gpt-4.1-", "o200k_base"],
    ["chatgpt-4o-", "o200k_base"],
    ["gpt-4o-", "o200k_base"],
    ["gpt-4-", "cl100k_base"],
    ["gpt-3.5-turbo-", "cl100k_base"],
    ["gpt-35-turbo-", "cl100k_base"], // Azure deployment name
    ["gpt-oss-", "o200k_harmony"],
    // Fine-tuned models
    ["ft:gpt-4o", "o200k_base"],
    ["ft:gpt-4", "cl100k_base"],
    ["ft:gpt-3.5-turbo", "cl100k_base"],
    ["ft:davinci-002", "cl100k_base"],
    ["ft:babbage-002", "cl100k_base"],
];
/**
 * Exact model name -> tokenizer lookup.
 *
 * Mirrors tiktoken-rs MODEL_TO_TOKENIZER.
 */
const TOKENIZER_BY_MODEL = new Map([
    // Reasoning models
    ["o1", "o200k_base"],
    ["o3", "o200k_base"],
    ["o4", "o200k_base"],
    // Chat models
    ["gpt-5", "o200k_base"],
    ["gpt-4.1", "o200k_base"],
    ["chatgpt-4o-latest", "o200k_base"],
    ["gpt-4o", "o200k_base"],
    ["gpt-4", "cl100k_base"],
    ["gpt-3.5-turbo", "cl100k_base"],
    ["gpt-3.5", "cl100k_base"],
    ["gpt-35-turbo", "cl100k_base"], // Azure deployment name
    // Base models
    ["davinci-002", "cl100k_base"],
    ["babbage-002", "cl100k_base"],
    // Embeddings
    ["text-embedding-ada-002", "cl100k_base"],
    ["text-embedding-3-small", "cl100k_base"],
    ["text-embedding-3-large", "cl100k_base"],
    // DEPRECATED MODELS - Text
    ["text-davinci-003", "p50k_base"],
    ["text-davinci-002", "p50k_base"],
    ["text-davinci-001", "r50k_base"],
    ["text-curie-001", "r50k_base"],
    ["text-babbage-001", "r50k_base"],
    ["text-ada-001", "r50k_base"],
    ["davinci", "r50k_base"],
    ["curie", "r50k_base"],
    ["babbage", "r50k_base"],
    ["ada", "r50k_base"],
    // DEPRECATED MODELS - Code
    ["code-davinci-002", "p50k_base"],
    ["code-davinci-001", "p50k_base"],
    ["code-cushman-002", "p50k_base"],
    ["code-cushman-001", "p50k_base"],
    ["davinci-codex", "p50k_base"],
    ["cushman-codex", "p50k_base"],
    // DEPRECATED MODELS - Edit
    ["text-davinci-edit-001", "p50k_edit"],
    ["code-davinci-edit-001", "p50k_edit"],
    // DEPRECATED MODELS - Old Embeddings
    ["text-similarity-davinci-001", "r50k_base"],
    ["text-similarity-curie-001", "r50k_base"],
    ["text-similarity-babbage-001", "r50k_base"],
    ["text-similarity-ada-001", "r50k_base"],
    ["text-search-davinci-doc-001", "r50k_base"],
    ["text-search-curie-doc-001", "r50k_base"],
    ["text-search-babbage-doc-001", "r50k_base"],
    ["text-search-ada-doc-001", "r50k_base"],
    ["code-search-babbage-code-001", "r50k_base"],
    ["code-search-ada-code-001", "r50k_base"],
    // Open source
    ["gpt2", "gpt2"],
    ["gpt-2", "gpt2"],
]);
/**
 * Get the tokenizer/encoding for a model name.
 *
 * Matches the logic of tiktoken-rs get_tokenizer(): an exact-name hit
 * wins, otherwise the ordered prefix rules are scanned.
 *
 * @param modelName - The model name
 * @returns The tokenizer name, or undefined if not found
 */
export function getTokenizerForModel(modelName) {
    // Exact name beats any prefix rule.
    const direct = TOKENIZER_BY_MODEL.get(modelName);
    if (direct) {
        return direct;
    }
    // First matching prefix rule wins; undefined when none applies.
    const rule = TOKENIZER_BY_MODEL_PREFIX.find(([prefix]) => modelName.startsWith(prefix));
    return rule?.[1];
}
120
/**
 * Context size mapping from tiktoken-rs model.rs
 *
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
 */
// Ordered prefix -> context-window-size rules. Scanned front to back,
// first match wins, so specific prefixes (e.g. "gpt-4-32k") precede the
// generic ones (e.g. "gpt-4"). The order replicates the original
// if-chain from tiktoken-rs exactly.
const CONTEXT_SIZE_BY_PREFIX = [
    ["gpt-5", 400_000],
    ["gpt-oss", 131_072],
    ["o1", 200_000],
    ["o3", 200_000],
    ["o4", 200_000],
    ["gpt-4.1", 1_047_576],
    ["gpt-4o", 128_000],
    ["gpt-4-turbo-", 128_000],
    ["gpt-4-0125", 128_000],
    ["gpt-4-1106", 128_000],
    ["gpt-4-32k", 32_768],
    ["gpt-4", 8192],
    ["gpt-3.5-turbo-0125", 16_385],
    ["gpt-3.5-turbo-1106", 16_385],
    ["gpt-3.5-turbo-16k", 16_385],
    ["gpt-3.5-turbo", 16_385],
    ["text-davinci-002", 4097],
    ["text-davinci-003", 4097],
    ["ada", 2049],
    ["babbage", 2049],
    ["curie", 2049],
    ["code-cushman-001", 2048],
    ["code-davinci-002", 8001],
    ["davinci", 2049],
    ["text-ada-001", 2049],
    ["text-babbage-001", 2049],
    ["text-curie-001", 2049],
    ["text-embedding-ada-002", 8192],
];
/**
 * Get the context window size (in tokens) for a model name.
 *
 * Fine-tuned names of the form "ft:<base>[:org[:suffix]]" are resolved
 * via their base model. Unknown models fall back to 4096.
 *
 * @param model - The model name
 * @returns The context size in tokens
 */
export function getContextSize(model) {
    // Fine-tuned models: strip "ft:" and everything after the base name.
    if (model.startsWith("ft:")) {
        const remainder = model.slice(3);
        const baseModel = remainder.split(":")[0] ?? remainder;
        return getContextSize(baseModel);
    }
    // First matching prefix rule wins (most specific first).
    for (const [prefix, size] of CONTEXT_SIZE_BY_PREFIX) {
        if (model.startsWith(prefix)) {
            return size;
        }
    }
    // Default when no rule applies.
    return 4096;
}
206
/**
 * Extended context limits including specific model versions
 * from tiktoken-rs model.rs get_context_size match statement
 *
 * Assembled per model family below and merged into one exported table;
 * keys and values are identical to the upstream match arms.
 */
// o-series reasoning models
const O_SERIES_SIZES = {
    o1: 200_000,
    "o1-2024-12-17": 200_000,
    "o1-mini": 128_000,
    "o1-mini-2024-09-12": 128_000,
    "o1-preview": 128_000,
    "o1-preview-2024-09-12": 128_000,
    o3: 200_000,
    "o3-mini": 200_000,
    "o3-mini-2025-01-31": 200_000,
    "o4-mini": 200_000,
    "o4-mini-2025-04-16": 200_000,
};
// GPT-5 series
const GPT5_SIZES = {
    "gpt-5": 400_000,
    "gpt-5-2025-03-22": 400_000,
    "gpt-5-mini": 200_000,
    "gpt-5-mini-2025-03-22": 200_000,
    "gpt-5-turbo": 400_000,
    "gpt-5-turbo-2025-06-09": 400_000,
    "gpt-5-nano": 200_000,
    "gpt-5-nano-2025-09-14": 200_000,
};
// GPT-4.5 preview
const GPT45_SIZES = {
    "gpt-4.5-preview": 128_000,
    "gpt-4.5-preview-2025-02-27": 128_000,
};
// GPT-4.1 series (1M context!)
const GPT41_SIZES = {
    "gpt-4.1": 1_047_576,
    "gpt-4.1-2025-04-14": 1_047_576,
    "gpt-4.1-mini": 1_047_576,
    "gpt-4.1-mini-2025-04-14": 1_047_576,
    "gpt-4.1-nano": 1_047_576,
    "gpt-4.1-nano-2025-04-14": 1_047_576,
};
// GPT-4o series
const GPT4O_SIZES = {
    "chatgpt-4o-latest": 128_000,
    "gpt-4o": 128_000,
    "gpt-4o-2024-05-13": 128_000,
    "gpt-4o-2024-08-06": 128_000,
    "gpt-4o-2024-11-20": 128_000,
    "gpt-4o-audio-preview": 128_000,
    "gpt-4o-audio-preview-2024-10-01": 128_000,
    "gpt-4o-audio-preview-2024-12-17": 128_000,
    "gpt-4o-mini": 128_000,
    "gpt-4o-mini-2024-07-18": 128_000,
    "gpt-4o-mini-audio-preview": 128_000,
    "gpt-4o-mini-audio-preview-2024-12-17": 128_000,
    "gpt-4o-realtime-preview": 128_000,
    "gpt-4o-realtime-preview-2024-10-01": 128_000,
    "gpt-4o-realtime-preview-2024-12-17": 128_000,
};
// GPT-4 turbo series and GPT-4 base
const GPT4_SIZES = {
    "gpt-4-turbo": 128_000,
    "gpt-4-turbo-2024-04-09": 128_000,
    "gpt-4-turbo-preview": 128_000,
    "gpt-4-0125-preview": 128_000,
    "gpt-4-1106-preview": 128_000,
    "gpt-4-vision-preview": 128_000,
    "gpt-4-1106-vision-preview": 128_000,
    "gpt-4": 8_192,
    "gpt-4-0613": 8_192,
    "gpt-4-0314": 8_192,
    "gpt-4-32k": 32_768,
    "gpt-4-32k-0613": 32_768,
    "gpt-4-32k-0314": 32_768,
};
// GPT-3.5 series
const GPT35_SIZES = {
    "gpt-3.5-turbo": 16_385,
    "gpt-3.5-turbo-0125": 16_385,
    "gpt-3.5-turbo-1106": 16_385,
    "gpt-3.5-turbo-0613": 16_385,
    "gpt-3.5-turbo-16k": 16_385,
    "gpt-3.5-turbo-16k-0613": 16_385,
    "gpt-3.5-turbo-instruct": 4_096,
    "gpt-3.5-turbo-instruct-0914": 4_096,
    "gpt-3.5-turbo-0301": 4_096,
};
// gpt-oss
const GPT_OSS_SIZES = {
    "gpt-oss": 131_072,
    "gpt-oss-2025-03-01": 131_072,
};
export const EXACT_CONTEXT_SIZES = {
    ...O_SERIES_SIZES,
    ...GPT5_SIZES,
    ...GPT45_SIZES,
    ...GPT41_SIZES,
    ...GPT4O_SIZES,
    ...GPT4_SIZES,
    ...GPT35_SIZES,
    ...GPT_OSS_SIZES,
};
287
/**
 * Get context size with exact match support.
 *
 * Prefers an exact entry from EXACT_CONTEXT_SIZES and falls back to
 * prefix-based matching via getContextSize().
 *
 * @param model - The model name
 * @returns The context size in tokens
 */
export function getExactContextSize(model) {
    // Own-property guard: a plain `EXACT_CONTEXT_SIZES[model]` would also
    // resolve names inherited from Object.prototype (e.g. "constructor",
    // "toString"), returning a function instead of a number. Restrict the
    // exact match to the table's own keys.
    if (Object.prototype.hasOwnProperty.call(EXACT_CONTEXT_SIZES, model)) {
        return EXACT_CONTEXT_SIZES[model];
    }
    // Fall back to prefix matching
    return getContextSize(model);
}
299
+ //# sourceMappingURL=model-to-encoding.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"model-to-encoding.js","sourceRoot":"","sources":["../../src/core/model-to-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAcH;;;;;GAKG;AACH,MAAM,yBAAyB,GAAmC;IAChE,mBAAmB;IACnB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,CAAC,KAAK,EAAE,YAAY,CAAC;IACrB,cAAc;IACd,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,UAAU,EAAE,YAAY,CAAC;IAC1B,CAAC,UAAU,EAAE,YAAY,CAAC;IAC1B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,QAAQ,EAAE,aAAa,CAAC;IACzB,CAAC,gBAAgB,EAAE,aAAa,CAAC;IACjC,CAAC,eAAe,EAAE,aAAa,CAAC,EAAE,wBAAwB;IAC1D,CAAC,UAAU,EAAE,eAAe,CAAC;IAC7B,oBAAoB;IACpB,CAAC,WAAW,EAAE,YAAY,CAAC;IAC3B,CAAC,UAAU,EAAE,aAAa,CAAC;IAC3B,CAAC,kBAAkB,EAAE,aAAa,CAAC;IACnC,CAAC,gBAAgB,EAAE,aAAa,CAAC;IACjC,CAAC,gBAAgB,EAAE,aAAa,CAAC;CAClC,CAAC;AAEF;;;;GAIG;AACH,MAAM,kBAAkB,GAA+B,IAAI,GAAG,CAAC;IAC7D,mBAAmB;IACnB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,CAAC,IAAI,EAAE,YAAY,CAAC;IACpB,cAAc;IACd,CAAC,OAAO,EAAE,YAAY,CAAC;IACvB,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,mBAAmB,EAAE,YAAY,CAAC;IACnC,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,OAAO,EAAE,aAAa,CAAC;IACxB,CAAC,eAAe,EAAE,aAAa,CAAC;IAChC,CAAC,SAAS,EAAE,aAAa,CAAC;IAC1B,CAAC,cAAc,EAAE,aAAa,CAAC,EAAE,wBAAwB;IACzD,cAAc;IACd,CAAC,aAAa,EAAE,aAAa,CAAC;IAC9B,CAAC,aAAa,EAAE,aAAa,CAAC;IAC9B,aAAa;IACb,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,CAAC,wBAAwB,EAAE,aAAa,CAAC;IACzC,2BAA2B;IAC3B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,gBAAgB,EAAE,WAAW,CAAC;IAC/B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,cAAc,EAAE,WAAW,CAAC;IAC7B,CAAC,SAAS,EAAE,WAAW,CAAC;IACxB,CAAC,OAAO,EAAE,WAAW,CAAC;IACtB,CAAC,SAAS,EAAE,WAAW,CAAC;IACxB,CAAC,KAAK,EAAE,WAAW,CAAC;IACpB,2BAA2B;IAC3B,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,kBAAkB,EAAE,WAAW,CAAC;IACjC,CAAC,eAAe,EAAE,WAAW,CAAC;IAC9B,CAAC,eAAe,EAAE,WAAW,CAAC;IAC9B,2BAA2B;IAC3B,CAAC,uBAAuB,EAAE,WAAW,CAAC;IACtC,CAAC,uBAAuB,EAAE,W
AAW,CAAC;IACtC,qCAAqC;IACrC,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,2BAA2B,EAAE,WAAW,CAAC;IAC1C,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,yBAAyB,EAAE,WAAW,CAAC;IACxC,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,2BAA2B,EAAE,WAAW,CAAC;IAC1C,CAAC,6BAA6B,EAAE,WAAW,CAAC;IAC5C,CAAC,yBAAyB,EAAE,WAAW,CAAC;IACxC,CAAC,8BAA8B,EAAE,WAAW,CAAC;IAC7C,CAAC,0BAA0B,EAAE,WAAW,CAAC;IACzC,cAAc;IACd,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,OAAO,EAAE,MAAM,CAAC;CAClB,CAAC,CAAC;AAEH;;;;;;;GAOG;AACH,MAAM,UAAU,oBAAoB,CAClC,SAAiB;IAEjB,2BAA2B;IAC3B,MAAM,UAAU,GAAG,kBAAkB,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IACrD,IAAI,UAAU,EAAE,CAAC;QACf,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,6BAA6B;IAC7B,KAAK,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,IAAI,yBAAyB,EAAE,CAAC;QAC5D,IAAI,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YACjC,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,2BAA2B;IAC3B,IAAI,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACxC,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;IAC9B,CAAC;IAED,uCAAuC;IACvC,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC9B,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC;QACtB,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC;QACtB,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,EACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC/B,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACrC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACnC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACnC,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC9B,OAAO,IAAI,CAA
C;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,oBAAoB,CAAC,EAAE,CAAC;QAC3C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,oBAAoB,CAAC,EAAE,CAAC;QAC3C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,mBAAmB,CAAC,EAAE,CAAC;QAC1C,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,eAAe,CAAC,EAAE,CAAC;QACtC,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC;QACpC,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EACpC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC;QACvB,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC;QAC3B,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,EACzB,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IACE,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC;QAChC,KAAK,CAAC,UAAU,CAAC,kBAAkB,CAAC;QACpC,KAAK,CAAC,UAAU,CAAC,gBAAgB,CAAC,EAClC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,KAAK,CAAC,UAAU,CAAC,wBAAwB,CAAC,EAAE,CAAC;QAC/C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU;IACV,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAA2B;IACzD,4BAA4B;IAC5B,EAAE,EAAE,OAAO;IACX,eAAe,EAAE,OAAO;IACxB,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAC7B,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAChC,EAAE,EAAE,OAAO;IACX,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAC7B,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;IAE7B,eAAe;IACf,OAAO,EAAE,OAAO;IAChB,kBAAkB,EAAE,OAAO;IAC3B,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAChC,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,YAAY,EAAE,OAAO;IACrB,uBAAuB,EAAE,OAAO;IAEhC,kBAAkB;IAClB,iBAAiB,EAAE,OAAO;IAC1B,4BAA4B,EAAE,OAAO;IAErC,+BAA+B;IAC/B,SAAS,EAAE,SAAS;IACpB,oBAAoB,EAAE,SAAS;IAC/B,cAAc,EAAE,SAAS;IACzB,yBAAyB,EAAE,SAAS;IACpC,cAAc,EAAE,SAAS;IACzB,yBAAyB,EAAE,SAAS;IAEpC,gBAAgB;IAChB,mBAAmB,EAAE,OAAO;IAC5B,QAAQ,EAAE,OAAO;IACjB,mBAAmB,EAAE,OAAO;IAC5B,mBAAmB,EAAE,OAAO;IAC5B,mBAAmB,EAAE,OAAO;IAC5B,sBAAsB,EAAE,OAAO;IAC/B,iCAAiC,EAAE,O
AAO;IAC1C,iCAAiC,EAAE,OAAO;IAC1C,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,2BAA2B,EAAE,OAAO;IACpC,sCAAsC,EAAE,OAAO;IAC/C,yBAAyB,EAAE,OAAO;IAClC,oCAAoC,EAAE,OAAO;IAC7C,oCAAoC,EAAE,OAAO;IAE7C,qBAAqB;IACrB,aAAa,EAAE,OAAO;IACtB,wBAAwB,EAAE,OAAO;IACjC,qBAAqB,EAAE,OAAO;IAC9B,oBAAoB,EAAE,OAAO;IAC7B,oBAAoB,EAAE,OAAO;IAC7B,sBAAsB,EAAE,OAAO;IAC/B,2BAA2B,EAAE,OAAO;IAEpC,aAAa;IACb,OAAO,EAAE,KAAK;IACd,YAAY,EAAE,KAAK;IACnB,YAAY,EAAE,KAAK;IACnB,WAAW,EAAE,MAAM;IACnB,gBAAgB,EAAE,MAAM;IACxB,gBAAgB,EAAE,MAAM;IAExB,iBAAiB;IACjB,eAAe,EAAE,MAAM;IACvB,oBAAoB,EAAE,MAAM;IAC5B,oBAAoB,EAAE,MAAM;IAC5B,oBAAoB,EAAE,MAAM;IAC5B,mBAAmB,EAAE,MAAM;IAC3B,wBAAwB,EAAE,MAAM;IAChC,wBAAwB,EAAE,KAAK;IAC/B,6BAA6B,EAAE,KAAK;IACpC,oBAAoB,EAAE,KAAK;IAE3B,UAAU;IACV,SAAS,EAAE,OAAO;IAClB,oBAAoB,EAAE,OAAO;CAC9B,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,0BAA0B;IAC1B,MAAM,SAAS,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAC7C,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;QAC5B,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,+BAA+B;IAC/B,OAAO,cAAc,CAAC,KAAK,CAAC,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Core Tiktoken BPE Implementation
3
+ *
4
+ * This is an EXACT port of the CoreBPE struct from tiktoken-rs.
5
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
6
+ *
7
+ * Provides the main tokenization API:
8
+ * - encode_ordinary(text) - Encode without special tokens
9
+ * - encode(text, allowed_special) - Encode with optional special tokens
10
+ * - encode_with_special_tokens(text) - Encode with all special tokens
11
+ * - decode(tokens) - Decode tokens to text
12
+ * - decode_bytes(tokens) - Decode tokens to raw bytes
13
+ */
14
+ import { type Vocabulary, type Rank } from "./byte-pair-encoding.js";
15
+ /**
16
+ * Error thrown when a token cannot be decoded
17
+ */
18
+ export declare class DecodeKeyError extends Error {
19
+ readonly token: Rank;
20
+ constructor(token: Rank);
21
+ }
22
+ /**
23
+ * CoreBPE - The main tokenizer class
24
+ *
25
+ * This class implements the complete BPE tokenization algorithm,
26
+ * matching the behavior of tiktoken-rs exactly.
27
+ */
28
+ export declare class CoreBPE {
29
+ /** The vocabulary mapping byte sequences to ranks */
30
+ private readonly encoder;
31
+ /** The special tokens mapping strings to ranks */
32
+ private readonly specialTokensEncoder;
33
+ /** Reverse vocabulary mapping ranks to byte sequences */
34
+ private readonly decoder;
35
+ /** Reverse special tokens mapping ranks to byte sequences */
36
+ private readonly specialTokensDecoder;
37
+ /** Compiled regex for splitting text */
38
+ private readonly regex;
39
+ /** Compiled regex for finding special tokens */
40
+ private readonly specialRegex;
41
+ /** Text encoder for converting strings to bytes */
42
+ private readonly textEncoder;
43
+ /** Text decoder for converting bytes to strings */
44
+ private readonly textDecoder;
45
+ /**
46
+ * Create a new CoreBPE instance
47
+ *
48
+ * @param encoder - Vocabulary mapping byte sequences to ranks
49
+ * @param specialTokensEncoder - Special tokens mapping strings to ranks
50
+ * @param pattern - Regex pattern for splitting text into pieces
51
+ */
52
+ constructor(encoder: Vocabulary, specialTokensEncoder: Map<string, Rank>, pattern: string);
53
+ /**
54
+ * Decode tokens to raw bytes
55
+ *
56
+ * The bytes are not guaranteed to be valid UTF-8.
57
+ *
58
+ * @param tokens - Array of token ranks to decode
59
+ * @returns Decoded bytes
60
+ * @throws DecodeKeyError if a token is not found
61
+ */
62
+ decodeBytes(tokens: Rank[]): Uint8Array;
63
+ /**
64
+ * Decode tokens to a string
65
+ *
66
+ * @param tokens - Array of token ranks to decode
67
+ * @returns Decoded string
68
+ * @throws DecodeKeyError if a token is not found
69
+ */
70
+ decode(tokens: Rank[]): string;
71
+ /**
72
+ * Encode text without handling special tokens
73
+ *
74
+ * This is the core encoding logic. Special tokens are treated as regular text.
75
+ *
76
+ * @param text - Text to encode
77
+ * @returns Array of token ranks
78
+ */
79
+ encodeOrdinary(text: string): Rank[];
80
+ /**
81
+ * Encode text with special token handling
82
+ *
83
+ * @param text - Text to encode
84
+ * @param allowedSpecial - Set of special tokens that are allowed
85
+ * @returns Tuple of [tokens, lastPieceTokenLen]
86
+ */
87
+ encode(text: string, allowedSpecial: Set<string>): [Rank[], number];
88
+ /**
89
+ * Encode text with all special tokens allowed
90
+ *
91
+ * @param text - Text to encode
92
+ * @returns Array of token ranks
93
+ */
94
+ encodeWithSpecialTokens(text: string): Rank[];
95
+ /**
96
+ * Get all special tokens
97
+ *
98
+ * @returns Set of special token strings
99
+ */
100
+ getSpecialTokens(): Set<string>;
101
+ /**
102
+ * Get the vocabulary size (excluding special tokens)
103
+ */
104
+ get vocabSize(): number;
105
+ /**
106
+ * Get the total vocabulary size (including special tokens)
107
+ */
108
+ get totalVocabSize(): number;
109
+ /**
110
+ * Check if a token rank is a special token
111
+ */
112
+ isSpecialToken(token: Rank): boolean;
113
+ /**
114
+ * Get the byte representation of a token
115
+ */
116
+ getTokenBytes(token: Rank): Uint8Array | undefined;
117
+ /**
118
+ * Get the rank of a byte sequence
119
+ */
120
+ getRank(bytes: Uint8Array): Rank | undefined;
121
+ /**
122
+ * Get the rank of a special token
123
+ */
124
+ getSpecialTokenRank(token: string): Rank | undefined;
125
+ }
126
+ //# sourceMappingURL=tiktoken.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tiktoken.d.ts","sourceRoot":"","sources":["../../src/core/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EACL,KAAK,UAAU,EAEf,KAAK,IAAI,EAIV,MAAM,yBAAyB,CAAC;AAEjC;;GAEG;AACH,qBAAa,cAAe,SAAQ,KAAK;IACvC,SAAgB,KAAK,EAAE,IAAI,CAAC;gBAEhB,KAAK,EAAE,IAAI;CAKxB;AASD;;;;;GAKG;AACH,qBAAa,OAAO;IAClB,qDAAqD;IACrD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAa;IAErC,kDAAkD;IAClD,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAoB;IAEzD,yDAAyD;IACzD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAoB;IAE5C,6DAA6D;IAC7D,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAwB;IAE7D,wCAAwC;IACxC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAE/B,gDAAgD;IAChD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAgB;IAE7C,mDAAmD;IACnD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmC;IAE/D,mDAAmD;IACnD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmC;IAE/D;;;;;;OAMG;gBAED,OAAO,EAAE,UAAU,EACnB,oBAAoB,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,EACvC,OAAO,EAAE,MAAM;IAyCjB;;;;;;;;OAQG;IACH,WAAW,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,UAAU;IA2BvC;;;;;;OAMG;IACH,MAAM,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM;IAK9B;;;;;;;OAOG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,EAAE;IA0BpC;;;;;;OAMG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,cAAc,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,MAAM,CAAC;IA4EnE;;;;;OAKG;IACH,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,EAAE;IAK7C;;;;OAIG;IACH,gBAAgB,IAAI,GAAG,CAAC,MAAM,CAAC;IAI/B;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,IAAI,cAAc,IAAI,MAAM,CAE3B;IAED;;OAEG;IACH,cAAc,CAAC,KAAK,EAAE,IAAI,GAAG,OAAO;IAIpC;;OAEG;IACH,aAAa,CAAC,KAAK,EAAE,IAAI,GAAG,UAAU,GAAG,SAAS;IAIlD;;OAEG;IACH,OAAO,CAAC,KAAK,EAAE,UAAU,GAAG,IAAI,GAAG,SAAS;IAK5C;;OAEG;IACH,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS;CAGrD"}