@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,295 @@
1
+ /**
2
+ * Core Tiktoken BPE Implementation
3
+ *
4
+ * This is an EXACT port of the CoreBPE struct from tiktoken-rs.
5
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
6
+ *
7
+ * Provides the main tokenization API:
8
+ * - encodeOrdinary(text) - Encode without special tokens
9
+ * - encode(text, allowedSpecial) - Encode with optional special tokens
10
+ * - encodeWithSpecialTokens(text) - Encode with all special tokens
11
+ * - decode(tokens) - Decode tokens to text
12
+ * - decodeBytes(tokens) - Decode tokens to raw bytes
13
+ */
14
+ import { bytePairEncode, bytesToKey, keyToBytes, } from "./byte-pair-encoding.js";
15
/**
 * Raised while decoding when a token rank has no entry in any lookup table.
 *
 * The offending rank is kept on `token` so callers can inspect it.
 */
export class DecodeKeyError extends Error {
    /** The token rank that could not be decoded */
    token;
    constructor(token) {
        const message = `Invalid token for decoding: ${token}`;
        super(message);
        this.token = token;
        this.name = "DecodeKeyError";
    }
}
26
/**
 * Backslash-escape every regex metacharacter in `str`, so that the result
 * matches the input literally when used as a pattern.
 */
function escapeRegex(str) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return str.replace(metaCharacters, (ch) => `\\${ch}`);
}
32
/**
 * CoreBPE - The main tokenizer class
 *
 * Text is split into pieces by `regex`. A piece whose bytes are themselves a
 * vocabulary entry becomes a single rank; any other piece is handed to
 * `bytePairEncode`. Special tokens are only recognised by `encode` and
 * `encodeWithSpecialTokens`. Decoding uses reverse tables built once in the
 * constructor.
 */
export class CoreBPE {
    /** The vocabulary mapping byte-sequence keys (see `bytesToKey`) to ranks */
    encoder;
    /** The special tokens mapping strings to ranks */
    specialTokensEncoder;
    /** Reverse vocabulary mapping ranks to byte sequences */
    decoder;
    /** Reverse special tokens mapping ranks to byte sequences */
    specialTokensDecoder;
    /** Compiled regex for splitting text */
    regex;
    /** Compiled regex for finding special tokens, or null when there are none */
    specialRegex;
    /** Text encoder for converting strings to bytes */
    textEncoder;
    /** Text decoder for converting bytes to strings */
    textDecoder;
    /**
     * Create a new CoreBPE instance
     *
     * @param encoder - Vocabulary mapping byte sequences to ranks
     * @param specialTokensEncoder - Special tokens mapping strings to ranks
     * @param pattern - Regex pattern for splitting text into pieces
     * @throws Error if two vocabulary entries share the same rank
     */
    constructor(encoder, specialTokensEncoder, pattern) {
        this.encoder = encoder;
        this.specialTokensEncoder = specialTokensEncoder;
        this.textEncoder = new TextEncoder();
        this.textDecoder = new TextDecoder("utf-8", { fatal: false });
        // Build the decoder (reverse of encoder)
        this.decoder = new Map();
        for (const [key, rank] of encoder) {
            this.decoder.set(rank, keyToBytes(key));
        }
        // A duplicate rank overwrites an earlier decoder entry, so the sizes
        // only agree when every rank is unique.
        if (encoder.size !== this.decoder.size) {
            throw new Error("Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?");
        }
        // Build special tokens decoder
        this.specialTokensDecoder = new Map();
        for (const [token, rank] of specialTokensEncoder) {
            this.specialTokensDecoder.set(rank, this.textEncoder.encode(token));
        }
        // Compile the regex pattern
        // Note: JavaScript regex flags differ from Rust - we use 'gu' for global + unicode
        this.regex = new RegExp(pattern, "gu");
        // Build special tokens regex: an alternation of the escaped token strings
        if (specialTokensEncoder.size > 0) {
            const escapedTokens = Array.from(specialTokensEncoder.keys())
                .map(escapeRegex)
                .join("|");
            this.specialRegex = new RegExp(escapedTokens, "g");
        }
        else {
            this.specialRegex = null;
        }
    }
    /**
     * Decode tokens to raw bytes
     *
     * The bytes are not guaranteed to be valid UTF-8.
     *
     * @param tokens - Array of token ranks to decode
     * @returns Decoded bytes
     * @throws DecodeKeyError if a token is not found
     */
    decodeBytes(tokens) {
        const parts = [];
        let totalLength = 0;
        for (const token of tokens) {
            // Regular vocabulary first, special tokens as the fallback.
            const tokenBytes = this.decoder.get(token) ?? this.specialTokensDecoder.get(token);
            if (tokenBytes === undefined) {
                throw new DecodeKeyError(token);
            }
            parts.push(tokenBytes);
            totalLength += tokenBytes.length;
        }
        // Concatenate all parts into one buffer of the exact final size
        const result = new Uint8Array(totalLength);
        let offset = 0;
        for (const part of parts) {
            result.set(part, offset);
            offset += part.length;
        }
        return result;
    }
    /**
     * Decode tokens to a string
     *
     * Decoding is non-fatal: the TextDecoder is created with `fatal: false`,
     * so invalid UTF-8 does not throw.
     *
     * @param tokens - Array of token ranks to decode
     * @returns Decoded string
     * @throws DecodeKeyError if a token is not found
     */
    decode(tokens) {
        return this.textDecoder.decode(this.decodeBytes(tokens));
    }
    /**
     * Encode text without handling special tokens
     *
     * This is the core encoding logic. Special tokens are treated as regular text.
     *
     * @param text - Text to encode
     * @returns Array of token ranks
     */
    encodeOrdinary(text) {
        const tokens = [];
        appendPieceTokens(this, text, tokens, 0);
        return tokens;
    }
    /**
     * Encode text with special token handling
     *
     * Special tokens that appear in `text` but are not in `allowedSpecial`
     * are encoded as ordinary text.
     *
     * @param text - Text to encode
     * @param allowedSpecial - Set of special tokens that are allowed
     * @returns Tuple of [tokens, lastPieceTokenLen], where lastPieceTokenLen is
     *   the number of tokens produced by the final regex piece, or 0 when the
     *   text ended with a special token or produced no pieces
     */
    encode(text, allowedSpecial) {
        const tokens = [];
        if (!this.specialRegex || allowedSpecial.size === 0) {
            // No special tokens to handle. The last piece length is still
            // tracked so this path reports the same value as the general one.
            const lastLen = appendPieceTokens(this, text, tokens, 0);
            return [tokens, lastLen];
        }
        let start = 0;
        let lastPieceTokenLen = 0;
        while (start < text.length) {
            const nextSpecial = findAllowedSpecial(this.specialRegex, text, start, allowedSpecial);
            const end = nextSpecial ? nextSpecial.index : text.length;
            // Encode the text before the special token
            if (end > start) {
                lastPieceTokenLen = appendPieceTokens(this, text.slice(start, end), tokens, lastPieceTokenLen);
            }
            if (!nextSpecial) {
                break;
            }
            // Handle the special token
            const specialRank = this.specialTokensEncoder.get(nextSpecial.token);
            if (specialRank !== undefined) {
                tokens.push(specialRank);
                lastPieceTokenLen = 0;
            }
            start = nextSpecial.index + nextSpecial.token.length;
        }
        return [tokens, lastPieceTokenLen];
    }
    /**
     * Encode text with all special tokens allowed
     *
     * @param text - Text to encode
     * @returns Array of token ranks
     */
    encodeWithSpecialTokens(text) {
        const allowedSpecial = this.getSpecialTokens();
        return this.encode(text, allowedSpecial)[0];
    }
    /**
     * Get all special tokens
     *
     * @returns A new Set of special token strings on every call
     */
    getSpecialTokens() {
        return new Set(this.specialTokensEncoder.keys());
    }
    /**
     * Get the vocabulary size (excluding special tokens)
     */
    get vocabSize() {
        return this.encoder.size;
    }
    /**
     * Get the total vocabulary size (including special tokens)
     */
    get totalVocabSize() {
        return this.encoder.size + this.specialTokensEncoder.size;
    }
    /**
     * Check if a token rank is a special token
     */
    isSpecialToken(token) {
        return this.specialTokensDecoder.has(token);
    }
    /**
     * Get the byte representation of a token
     *
     * @returns The token's bytes, or undefined when the rank is unknown
     */
    getTokenBytes(token) {
        return this.decoder.get(token) ?? this.specialTokensDecoder.get(token);
    }
    /**
     * Get the rank of a byte sequence
     *
     * @returns The rank, or undefined when the bytes are not a vocabulary entry
     */
    getRank(bytes) {
        const key = bytesToKey(bytes);
        return this.encoder.get(key);
    }
    /**
     * Get the rank of a special token
     *
     * @returns The rank, or undefined when the string is not a special token
     */
    getSpecialTokenRank(token) {
        return this.specialTokensEncoder.get(token);
    }
}
/**
 * Encode every regex piece of `text` and append the resulting ranks to `tokens`.
 *
 * Shared by `CoreBPE.encodeOrdinary` and `CoreBPE.encode` so that both paths
 * split and encode text the same way.
 *
 * @param bpe - CoreBPE instance supplying `regex`, `textEncoder` and `encoder`
 * @param text - Text in which no special tokens are to be recognised
 * @param tokens - Output array; ranks are appended in order
 * @param lastPieceTokenLen - Value to return when `text` yields no pieces
 * @returns Number of tokens produced by the last piece of `text`
 */
function appendPieceTokens(bpe, text, tokens, lastPieceTokenLen) {
    // matchAll starts from the regex's lastIndex, so pin it to the beginning.
    bpe.regex.lastIndex = 0;
    // matchAll iterates its own copy of the regex and steps past empty
    // matches, so a pattern that can match "" cannot stall the loop.
    for (const match of text.matchAll(bpe.regex)) {
        const piece = match[0];
        if (piece.length === 0) {
            continue;
        }
        const pieceBytes = bpe.textEncoder.encode(piece);
        // Check if the piece is a single token
        const directRank = bpe.encoder.get(bytesToKey(pieceBytes));
        if (directRank !== undefined) {
            tokens.push(directRank);
            lastPieceTokenLen = 1;
            continue;
        }
        // Apply BPE
        const bpeTokens = bytePairEncode(pieceBytes, bpe.encoder);
        // Appended one by one: spreading the tokens of a very long piece into
        // push() can exceed the engine's argument limit.
        for (const rank of bpeTokens) {
            tokens.push(rank);
        }
        lastPieceTokenLen = bpeTokens.length;
    }
    return lastPieceTokenLen;
}
/**
 * Find the first special token at or after `from` that is in `allowedSpecial`.
 *
 * @param specialRegex - Global regex matching any special token
 * @param text - Text being encoded
 * @param from - Index at which to start searching
 * @param allowedSpecial - Set of special tokens that may be recognised
 * @returns `{ index, token }` for the match, or null when there is none
 */
function findAllowedSpecial(specialRegex, text, from, allowedSpecial) {
    let searchStart = from;
    while (searchStart < text.length) {
        specialRegex.lastIndex = searchStart;
        const match = specialRegex.exec(text);
        if (!match) {
            return null;
        }
        if (allowedSpecial.has(match[0])) {
            return { index: match.index, token: match[0] };
        }
        // Token not allowed: it stays ordinary text, so resume the search one
        // position after where it started.
        searchStart = match.index + 1;
    }
    return null;
}
295
+ //# sourceMappingURL=tiktoken.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tiktoken.js","sourceRoot":"","sources":["../../src/core/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAIL,cAAc,EACd,UAAU,EACV,UAAU,GACX,MAAM,yBAAyB,CAAC;AAEjC;;GAEG;AACH,MAAM,OAAO,cAAe,SAAQ,KAAK;IACvB,KAAK,CAAO;IAE5B,YAAY,KAAW;QACrB,KAAK,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;QAC9C,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;QAC7B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,GAAG,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AACpD,CAAC;AAED;;;;;GAKG;AACH,MAAM,OAAO,OAAO;IAClB,qDAAqD;IACpC,OAAO,CAAa;IAErC,kDAAkD;IACjC,oBAAoB,CAAoB;IAEzD,yDAAyD;IACxC,OAAO,CAAoB;IAE5C,6DAA6D;IAC5C,oBAAoB,CAAwB;IAE7D,wCAAwC;IACvB,KAAK,CAAS;IAE/B,gDAAgD;IAC/B,YAAY,CAAgB;IAE7C,mDAAmD;IAClC,WAAW,CAAmC;IAE/D,mDAAmD;IAClC,WAAW,CAAmC;IAE/D;;;;;;OAMG;IACH,YACE,OAAmB,EACnB,oBAAuC,EACvC,OAAe;QAEf,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,oBAAoB,GAAG,oBAAoB,CAAC;QACjD,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;QAE9D,yCAAyC;QACzC,IAAI,CAAC,OAAO,GAAG,IAAI,GAAG,EAAE,CAAC;QACzB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,OAAO,EAAE,CAAC;YAClC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,4CAA4C;QAC5C,IAAI,OAAO,CAAC,IAAI,KAAK,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YACvC,MAAM,IAAI,KAAK,CACb,qGAAqG,CACtG,CAAC;QACJ,CAAC;QAED,+BAA+B;QAC/B,IAAI,CAAC,oBAAoB,GAAG,IAAI,GAAG,EAAE,CAAC;QACtC,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,oBAAoB,EAAE,CAAC;YACjD,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACtE,CAAC;QAED,4BAA4B;QAC5B,mFAAmF;QACnF,IAAI,CAAC,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;QAEvC,6BAA6B;QAC7B,IAAI,oBAAoB,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YAClC,MAAM,aAAa,GAAG,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC;iBAC1D,GAAG,CAAC,WAAW,CAAC;iBAChB,IAAI,CAAC,GAAG,CAAC,CAAC;YACb,IAAI,CAAC,YAAY,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,IAAI,CAAC
,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;IAED;;;;;;;;OAQG;IACH,WAAW,CAAC,MAAc;QACxB,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,IAAI,WAAW,GAAG,CAAC,CAAC;QAEpB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,UAAU,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YACpD,CAAC;YACD,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,MAAM,IAAI,cAAc,CAAC,KAAK,CAAC,CAAC;YAClC,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACvB,WAAW,IAAI,UAAU,CAAC,MAAM,CAAC;QACnC,CAAC;QAED,wBAAwB;QACxB,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAC3C,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YACzB,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC;QACxB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,MAAc;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACvC,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IACxC,CAAC;IAED;;;;;;;OAOG;IACH,cAAc,CAAC,IAAY;QACzB,MAAM,MAAM,GAAW,EAAE,CAAC;QAE1B,oBAAoB;QACpB,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC;QAEzB,IAAI,KAAK,CAAC;QACV,OAAO,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACvB,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAClD,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;YAEnC,uCAAuC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,YAAY;gBACZ,MAAM,SAAS,GAAG,cAAc,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAC3D,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY,EAAE,cAA2B;QAC9C,IAAI,CAAC,IAAI,CAAC,YAAY,IAAI,cAAc,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YACpD,8BAA8B;YAC9B,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,MAAM,GAAW,EAAE,CAAC;QAC1B,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,iBAAiB,GAAG,CAAC
,CAAC;QAE1B,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,8BAA8B;YAC9B,IAAI,WAAW,GAA4C,IAAI,CAAC;YAChE,IAAI,WAAW,GAAG,KAAK,CAAC;YAExB,OAAO,WAAW,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjC,IAAI,CAAC,YAAY,CAAC,SAAS,GAAG,WAAW,CAAC;gBAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAE3C,IAAI,CAAC,KAAK,EAAE,CAAC;oBACX,MAAM;gBACR,CAAC;gBAED,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC9B,IAAI,cAAc,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,CAAC;oBACrC,WAAW,GAAG,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;oBAC1D,MAAM;gBACR,CAAC;gBAED,wCAAwC;gBACxC,WAAW,GAAG,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC;YAChC,CAAC;YAED,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;YAE1D,2CAA2C;YAC3C,IAAI,GAAG,GAAG,KAAK,EAAE,CAAC;gBAChB,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAEvC,oBAAoB;gBACpB,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC;gBAEzB,IAAI,KAAK,CAAC;gBACV,OAAO,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;oBACvB,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBAClD,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;oBAEnC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;wBAC7B,iBAAiB,GAAG,CAAC,CAAC;wBACtB,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;oBAC1B,CAAC;yBAAM,CAAC;wBACN,MAAM,SAAS,GAAG,cAAc,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;wBAC3D,iBAAiB,GAAG,SAAS,CAAC,MAAM,CAAC;wBACrC,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;oBAC5B,CAAC;gBACH,CAAC;YACH,CAAC;YAED,2BAA2B;YAC3B,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,WAAW,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBACrE,IAAI,WAAW,KAAK,SAAS,EAAE,CAAC;oBAC9B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBACzB,iBAAiB,GAAG,CAAC,CAAC;gBACxB,CAAC;gBACD,KAAK,GAAG,WAAW,CAAC,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC;YACvD,CAAC;iBAAM,CAAC;gBACN,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC;IACrC,CAAC;IAED;;;;;OAKG;IACH,uBAAuB,CAAC,IAAY;QAClC,MAAM,cAAc,GAAG
,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC/C,OAAO,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED;;;;OAIG;IACH,gBAAgB;QACd,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,IAAI,cAAc;QAChB,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;IAC5D,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,KAAW;QACxB,OAAO,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,KAAW;QACvB,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IACzE,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,KAAiB;QACvB,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC9B,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,mBAAmB,CAAC,KAAa;QAC/B,OAAO,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC9C,CAAC;CACF"}
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Vocabulary Loader
3
+ *
4
+ * Handles loading and parsing of .tiktoken vocabulary files.
5
+ * These files contain base64-encoded tokens with their ranks.
6
+ *
7
+ * Format: Each line is "base64_token rank"
8
+ * Example: "SGVsbG8= 12345" means "Hello" has rank 12345
9
+ */
10
+ import { type Vocabulary, type ReverseVocabulary, type Rank, keyToBytes } from "./byte-pair-encoding.js";
11
+ export { keyToBytes };
12
+ /**
13
+ * Base64 encode (works in both Node.js and browser)
+ *
+ * @param bytes - Raw bytes to encode
+ * @returns Base64 text for `bytes`
14
+ */
15
+ export declare function base64Encode(bytes: Uint8Array): string;
16
+ /**
17
+ * Parse a .tiktoken vocabulary file content
+ *
+ * Each non-empty line is expected to be "base64_token rank". Lines that do
+ * not fit (no space separator, undecodable base64, non-numeric rank) are
+ * skipped rather than reported.
18
+ *
19
+ * @param content - The raw content of the .tiktoken file
20
+ * @returns Vocabulary map and reverse vocabulary map
21
+ */
22
+ export declare function parseVocabulary(content: string): {
23
+ encoder: Vocabulary;
24
+ decoder: ReverseVocabulary;
25
+ };
26
+ /**
27
+ * Vocabulary URLs for each encoding
+ *
+ * Keyed by encoding name; each value is the URL of the .tiktoken vocabulary
+ * file used for that encoding.
28
+ */
29
+ export declare const VOCABULARY_URLS: Record<string, string>;
30
+ /**
31
+ * Load vocabulary from URL (with caching)
+ *
+ * The result is cached under `encodingName`; a later call for the same name
+ * resolves from the cache without fetching again.
32
+ *
33
+ * @param encodingName - Name of the encoding
34
+ * @returns Promise resolving to vocabulary maps
+ * @throws Error if `encodingName` has no URL in VOCABULARY_URLS, or if the
+ * HTTP response is not ok
35
+ */
36
+ export declare function loadVocabularyFromUrl(encodingName: string): Promise<{
37
+ encoder: Vocabulary;
38
+ decoder: ReverseVocabulary;
39
+ }>;
40
+ /**
41
+ * Load vocabulary from a string (for embedded vocabularies)
+ *
+ * If `encodingName` is already cached, the cached maps are returned and
+ * `content` is not parsed.
42
+ *
43
+ * @param encodingName - Name of the encoding (for caching)
44
+ * @param content - The vocabulary file content
45
+ * @returns Vocabulary maps
46
+ */
47
+ export declare function loadVocabularyFromString(encodingName: string, content: string): {
48
+ encoder: Vocabulary;
49
+ decoder: ReverseVocabulary;
50
+ };
51
+ /**
52
+ * Create special tokens map
+ *
+ * The encoder maps each token string to its rank; the decoder maps each rank
+ * to the token string's UTF-8 bytes.
53
+ *
54
+ * @param specialTokens - Map of special token strings to ranks
55
+ * @returns Maps for encoding and decoding special tokens
56
+ */
57
+ export declare function createSpecialTokenMaps(specialTokens: Record<string, Rank>): {
58
+ encoder: Map<string, Rank>;
59
+ decoder: Map<Rank, Uint8Array>;
60
+ };
61
+ /**
62
+ * Clear the vocabulary cache
63
+ * Useful for testing or memory management
+ *
+ * After this call every encoding is parsed again (and, for URL loads,
+ * fetched again) the next time it is requested.
64
+ */
65
+ export declare function clearVocabularyCache(): void;
66
+ /**
67
+ * Get vocabulary from cache (sync, returns undefined if not loaded)
+ *
+ * @param encodingName - Name the vocabulary was cached under
+ * @returns The cached vocabulary maps, or undefined when nothing is cached
+ * for `encodingName`
68
+ */
69
+ export declare function getVocabularyFromCache(encodingName: string): {
70
+ encoder: Vocabulary;
71
+ decoder: ReverseVocabulary;
72
+ } | undefined;
73
+ /**
74
+ * Check if a vocabulary is cached
+ *
+ * @param encodingName - Name the vocabulary would be cached under
+ * @returns true when the cache has an entry for `encodingName`
75
+ */
76
+ export declare function isVocabularyCached(encodingName: string): boolean;
77
+ //# sourceMappingURL=vocab-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vocab-loader.d.ts","sourceRoot":"","sources":["../../src/core/vocab-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,KAAK,UAAU,EACf,KAAK,iBAAiB,EACtB,KAAK,IAAI,EAET,UAAU,EACX,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EAAE,UAAU,EAAE,CAAC;AAsBtB;;GAEG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAWtD;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG;IAChD,OAAO,EAAE,UAAU,CAAC;IACpB,OAAO,EAAE,iBAAiB,CAAC;CAC5B,CAgCA;AAED;;GAEG;AACH,eAAO,MAAM,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAalD,CAAC;AAUF;;;;;GAKG;AACH,wBAAsB,qBAAqB,CACzC,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,CAAC,CA2B9D;AAED;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,YAAY,EAAE,MAAM,EACpB,OAAO,EAAE,MAAM,GACd;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,CAarD;AAED;;;;;GAKG;AACH,wBAAgB,sBAAsB,CAAC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,GAAG;IAC3E,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC3B,OAAO,EAAE,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;CAChC,CAYA;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,IAAI,IAAI,CAE3C;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CACpC,YAAY,EAAE,MAAM,GACnB;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,GAAG,SAAS,CAEjE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,YAAY,EAAE,MAAM,GAAG,OAAO,CAEhE"}
@@ -0,0 +1,176 @@
1
+ /**
2
+ * Vocabulary Loader
3
+ *
4
+ * Handles loading and parsing of .tiktoken vocabulary files.
5
+ * These files contain base64-encoded tokens with their ranks.
6
+ *
7
+ * Format: Each line is "base64_token rank"
8
+ * Example: "SGVsbG8= 12345" means "Hello" has rank 12345
9
+ */
10
+ import { bytesToKey, keyToBytes, } from "./byte-pair-encoding.js";
11
+ // Re-export for convenience
12
+ export { keyToBytes };
13
/**
 * Turn Base64 text into the bytes it encodes (Node.js and browser).
 */
function base64Decode(base64) {
    // Buffer is only consulted when there is no global atob (older Node.js;
    // should not be needed with Node 18+).
    if (typeof atob !== "function") {
        return new Uint8Array(Buffer.from(base64, "base64"));
    }
    // atob yields one character per decoded byte; the char code is the byte.
    const binaryString = atob(base64);
    return Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0));
}
31
/**
 * Turn bytes into Base64 text (Node.js and browser).
 */
export function base64Encode(bytes) {
    // Fallback for older Node.js without a global btoa
    if (typeof btoa !== "function") {
        return Buffer.from(bytes).toString("base64");
    }
    // btoa takes a "binary string": one character per byte.
    const chars = [];
    for (const byte of bytes) {
        chars.push(String.fromCharCode(byte));
    }
    return btoa(chars.join(""));
}
45
/**
 * Parse a .tiktoken vocabulary file content
 *
 * Each non-empty line is expected to be "base64_token rank". Lines that do
 * not fit that shape are skipped, so a partly damaged file still yields the
 * entries that could be read.
 *
 * @param content - The raw content of the .tiktoken file
 * @returns Vocabulary map and reverse vocabulary map
 */
export function parseVocabulary(content) {
    const encoder = new Map();
    const decoder = new Map();
    for (const line of content.split("\n")) {
        // trim() also removes the "\r" left behind by CRLF line endings
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        const spaceIndex = trimmed.indexOf(" ");
        if (spaceIndex === -1) {
            continue;
        }
        const base64Token = trimmed.slice(0, spaceIndex);
        const rankStr = trimmed.slice(spaceIndex + 1).trim();
        // parseInt alone would read "12abc" as 12 and accept "-5"; a rank has
        // to be a plain non-negative decimal integer.
        if (!/^\d+$/.test(rankStr)) {
            continue;
        }
        const rank = Number.parseInt(rankStr, 10);
        try {
            const tokenBytes = base64Decode(base64Token);
            encoder.set(bytesToKey(tokenBytes), rank);
            decoder.set(rank, tokenBytes);
        }
        catch {
            // Skip malformed lines (e.g. invalid base64)
            continue;
        }
    }
    return { encoder, decoder };
}
80
// p50k_edit uses the p50k_base vocabulary file and o200k_harmony uses the
// o200k_base one, so each shared URL is written once.
const P50K_BASE_VOCABULARY_URL = "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
const O200K_BASE_VOCABULARY_URL = "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken";
/**
 * Where to download the .tiktoken vocabulary file for each encoding name
 */
export const VOCABULARY_URLS = {
    r50k_base: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
    p50k_base: P50K_BASE_VOCABULARY_URL,
    p50k_edit: P50K_BASE_VOCABULARY_URL,
    cl100k_base: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
    o200k_base: O200K_BASE_VOCABULARY_URL,
    o200k_harmony: O200K_BASE_VOCABULARY_URL,
};
91
/**
 * Cache for loaded vocabularies, keyed by encoding name
 */
const vocabularyCache = new Map();
/**
 * Load vocabulary from URL (with caching)
 *
 * A cached vocabulary is returned without any network access. Otherwise the
 * file listed for `encodingName` in VOCABULARY_URLS is fetched, parsed and
 * stored in the cache.
 *
 * @param encodingName - Name of the encoding
 * @returns Promise resolving to vocabulary maps
 * @throws Error if `encodingName` is not a key of VOCABULARY_URLS, or if the
 *   HTTP response is not ok
 */
export async function loadVocabularyFromUrl(encodingName) {
    // Check cache first
    const cached = vocabularyCache.get(encodingName);
    if (cached) {
        return cached;
    }
    // Own-property lookup: names such as "constructor" or "toString" exist on
    // every plain object and must not be mistaken for a configured encoding.
    const isKnown = Object.prototype.hasOwnProperty.call(VOCABULARY_URLS, encodingName);
    const url = isKnown ? VOCABULARY_URLS[encodingName] : undefined;
    if (!url) {
        throw new Error(`Unknown encoding: ${encodingName}`);
    }
    // Fetch the vocabulary file
    const response = await fetch(url);
    if (!response.ok) {
        // statusText is empty over HTTP/2, so the numeric status is included
        // to keep the message informative.
        const reason = [response.status, response.statusText].filter(Boolean).join(" ");
        throw new Error(`Failed to fetch vocabulary for ${encodingName}: ${reason}`);
    }
    const content = await response.text();
    const vocab = parseVocabulary(content);
    // Cache the result
    vocabularyCache.set(encodingName, vocab);
    return vocab;
}
122
/**
 * Parse an embedded vocabulary once and serve it from the cache afterwards.
 *
 * When `encodingName` is already cached, the cached maps are returned and
 * `content` is ignored.
 *
 * @param encodingName - Name of the encoding (cache key)
 * @param content - The vocabulary file content
 * @returns Vocabulary maps
 */
export function loadVocabularyFromString(encodingName, content) {
    let vocab = vocabularyCache.get(encodingName);
    if (!vocab) {
        vocab = parseVocabulary(content);
        vocabularyCache.set(encodingName, vocab);
    }
    return vocab;
}
140
/**
 * Create special tokens map
 *
 * The encoder maps each token string to its rank; the decoder maps each rank
 * back to the token string's UTF-8 bytes.
 *
 * @param specialTokens - Map of special token strings to ranks
 * @returns Maps for encoding and decoding special tokens
 */
export function createSpecialTokenMaps(specialTokens) {
    const encoder = new Map();
    const decoder = new Map();
    // One TextEncoder serves every token; it holds no per-call state.
    const textEncoder = new TextEncoder();
    for (const [token, rank] of Object.entries(specialTokens)) {
        encoder.set(token, rank);
        // Convert string to bytes for decoder
        decoder.set(rank, textEncoder.encode(token));
    }
    return { encoder, decoder };
}
157
/**
 * Drop every cached vocabulary.
 *
 * Handy in tests, or to release the memory held by loaded vocabularies.
 */
export function clearVocabularyCache() {
    vocabularyCache.clear();
}
164
/**
 * Synchronous cache lookup.
 *
 * @param encodingName - Name the vocabulary was cached under
 * @returns The cached vocabulary maps, or undefined if none is loaded
 */
export function getVocabularyFromCache(encodingName) {
    const cachedVocabulary = vocabularyCache.get(encodingName);
    return cachedVocabulary;
}
170
/**
 * Report whether the cache holds a vocabulary for the given encoding.
 *
 * @param encodingName - Name the vocabulary would be cached under
 * @returns true when an entry exists for `encodingName`
 */
export function isVocabularyCached(encodingName) {
    const hasEntry = vocabularyCache.has(encodingName);
    return hasEntry;
}
176
+ //# sourceMappingURL=vocab-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vocab-loader.js","sourceRoot":"","sources":["../../src/core/vocab-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAIL,UAAU,EACV,UAAU,GACX,MAAM,yBAAyB,CAAC;AAEjC,4BAA4B;AAC5B,OAAO,EAAE,UAAU,EAAE,CAAC;AAEtB;;GAEG;AACH,SAAS,YAAY,CAAC,MAAc;IAClC,8CAA8C;IAC9C,0CAA0C;IAC1C,IAAI,OAAO,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACxC,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,kEAAkE;IAClE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;IAC7C,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,KAAiB;IAC5C,IAAI,OAAO,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAChD,CAAC;QACD,OAAO,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5B,CAAC;IAED,6BAA6B;IAC7B,OAAO,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAC/C,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,eAAe,CAAC,OAAe;IAI7C,MAAM,OAAO,GAAe,IAAI,GAAG,EAAE,CAAC;IACtC,MAAM,OAAO,GAAsB,IAAI,GAAG,EAAE,CAAC;IAE7C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACxC,IAAI,UAAU,KAAK,CAAC,CAAC;YAAE,SAAS;QAEhC,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,YAAY,CAAC,WAAW,CAAC,CAAC;YAC7C,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YAEnC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBACjB,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;gBACnC,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;gBACvB,OAAO,CAAC,GAA
G,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;YAChC,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;YACvB,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,eAAe,GAA2B;IACrD,SAAS,EACP,yEAAyE;IAC3E,SAAS,EACP,yEAAyE;IAC3E,SAAS,EACP,yEAAyE,EAAE,kBAAkB;IAC/F,WAAW,EACT,2EAA2E;IAC7E,UAAU,EACR,0EAA0E;IAC5E,aAAa,EACX,0EAA0E,EAAE,kBAAkB;CACjG,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAGjB,IAAI,GAAG,EAAE,CAAC;AAEd;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,YAAoB;IAEpB,oBAAoB;IACpB,MAAM,MAAM,GAAG,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IACjD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,GAAG,GAAG,eAAe,CAAC,YAAY,CAAC,CAAC;IAC1C,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,KAAK,CAAC,qBAAqB,YAAY,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,4BAA4B;IAC5B,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CACb,kCAAkC,YAAY,KAAK,QAAQ,CAAC,UAAU,EAAE,CACzE,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtC,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEvC,mBAAmB;IACnB,eAAe,CAAC,GAAG,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;IAEzC,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,YAAoB,EACpB,OAAe;IAEf,oBAAoB;IACpB,MAAM,MAAM,GAAG,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IACjD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEvC,mBAAmB;IACnB,eAAe,CAAC,GAAG,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;IAEzC,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,sBAAsB,CAAC,aAAmC;IAIxE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAgB,CAAC;IACxC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAoB,CAAC;IAE5C,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QACzB,sCAAsC;QACtC,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;IAC/C,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB;IAClC,eAAe,CAAC,KAAK,EAAE,CAAC;AAC1B,CAAC;AAED;;GA
EG;AACH,MAAM,UAAU,sBAAsB,CACpC,YAAoB;IAEpB,OAAO,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,YAAoB;IACrD,OAAO,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC"}
@@ -0,0 +1,43 @@
1
+ /**
2
+ * cl100k_base Encoding
3
+ * Used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002
4
+ *
5
+ * This encoding has:
6
+ * - 100,256 tokens
7
+ * - Better handling of code and programming languages
8
+ * - Improved whitespace handling
9
+ */
10
+ import type { Tokenizer, EncodingName } from "../types.js";
11
+ /**
12
+ * Encoding name constant for this module; typed as the EncodingName imported from ../types.js (the concrete value is not visible in this declaration file)
13
+ */
14
+ export declare const ENCODING_NAME: EncodingName;
15
+ /**
16
+ * Create a cl100k_base tokenizer instance; takes no arguments
17
+ *
18
+ * @returns Tokenizer instance (the Tokenizer type imported from ../types.js)
19
+ */
20
+ export declare function createCL100kTokenizer(): Tokenizer;
21
+ /**
22
+ * Pre-computed token counts for common patterns; declared as a string-keyed record of numbers (presumably pattern text mapped to its token count — confirm against the implementation)
23
+ * Used to improve estimation accuracy
24
+ */
25
+ export declare const COMMON_TOKEN_PATTERNS: Record<string, number>;
26
+ /**
27
+ * Average characters per token for cl100k_base (declared as the numeric literal 3.8)
28
+ * Based on empirical analysis of English text; NOTE(review): treat as approximate for non-English input
29
+ */
30
+ export declare const AVERAGE_CHARS_PER_TOKEN = 3.8;
31
+ /**
32
+ * Token estimation adjustments for different content types; declared as a string-keyed record of numbers, so the specific content-type keys are not visible in this type
33
+ */
34
+ export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
35
+ /**
36
+ * Get estimated token count with content-type awareness
37
+ *
38
+ * @param text - Input text
39
+ * @param contentType - Optional type of content, typed as a key of CONTENT_TYPE_MULTIPLIERS. NOTE(review): that constant is declared as Record<string, number>, so this parameter type is not narrowed to specific literal keys and any string type-checks.
40
+ * @returns Estimated token count
41
+ */
42
+ export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
43
+ //# sourceMappingURL=cl100k-base.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cl100k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/cl100k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA4B,CAAC;AAEzD;;;;GAIG;AACH,wBAAgB,qBAAqB,IAAI,SAAS,CASjD;AAED;;;GAGG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAmFxD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR"}