@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0

package/dist/core/tiktoken.js
@@ -0,0 +1,295 @@
```js
/**
 * Core Tiktoken BPE Implementation
 *
 * This is an EXACT port of the CoreBPE struct from tiktoken-rs.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
 *
 * Provides the main tokenization API:
 * - encode_ordinary(text) - Encode without special tokens
 * - encode(text, allowed_special) - Encode with optional special tokens
 * - encode_with_special_tokens(text) - Encode with all special tokens
 * - decode(tokens) - Decode tokens to text
 * - decode_bytes(tokens) - Decode tokens to raw bytes
 */
import { bytePairEncode, bytesToKey, keyToBytes, } from "./byte-pair-encoding.js";
/**
 * Error thrown when a token cannot be decoded
 */
export class DecodeKeyError extends Error {
    token;
    constructor(token) {
        super(`Invalid token for decoding: ${token}`);
        this.name = "DecodeKeyError";
        this.token = token;
    }
}
/**
 * Escape a string for use in a regex
 */
function escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/**
 * CoreBPE - The main tokenizer class
 *
 * This class implements the complete BPE tokenization algorithm,
 * matching the behavior of tiktoken-rs exactly.
 */
export class CoreBPE {
    /** The vocabulary mapping byte sequences to ranks */
    encoder;
    /** The special tokens mapping strings to ranks */
    specialTokensEncoder;
    /** Reverse vocabulary mapping ranks to byte sequences */
    decoder;
    /** Reverse special tokens mapping ranks to byte sequences */
    specialTokensDecoder;
    /** Compiled regex for splitting text */
    regex;
    /** Compiled regex for finding special tokens */
    specialRegex;
    /** Text encoder for converting strings to bytes */
    textEncoder;
    /** Text decoder for converting bytes to strings */
    textDecoder;
    /**
     * Create a new CoreBPE instance
     *
     * @param encoder - Vocabulary mapping byte sequences to ranks
     * @param specialTokensEncoder - Special tokens mapping strings to ranks
     * @param pattern - Regex pattern for splitting text into pieces
     */
    constructor(encoder, specialTokensEncoder, pattern) {
        this.encoder = encoder;
        this.specialTokensEncoder = specialTokensEncoder;
        this.textEncoder = new TextEncoder();
        this.textDecoder = new TextDecoder("utf-8", { fatal: false });
        // Build the decoder (reverse of encoder)
        this.decoder = new Map();
        for (const [key, rank] of encoder) {
            this.decoder.set(rank, keyToBytes(key));
        }
        // Verify encoder and decoder have same size
        if (encoder.size !== this.decoder.size) {
            throw new Error("Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?");
        }
        // Build special tokens decoder
        this.specialTokensDecoder = new Map();
        for (const [token, rank] of specialTokensEncoder) {
            this.specialTokensDecoder.set(rank, this.textEncoder.encode(token));
        }
        // Compile the regex pattern
        // Note: JavaScript regex flags differ from Rust - we use 'gu' for global + unicode
        this.regex = new RegExp(pattern, "gu");
        // Build special tokens regex
        if (specialTokensEncoder.size > 0) {
            const escapedTokens = Array.from(specialTokensEncoder.keys())
                .map(escapeRegex)
                .join("|");
            this.specialRegex = new RegExp(escapedTokens, "g");
        }
        else {
            this.specialRegex = null;
        }
    }
    /**
     * Decode tokens to raw bytes
     *
     * The bytes are not guaranteed to be valid UTF-8.
     *
     * @param tokens - Array of token ranks to decode
     * @returns Decoded bytes
     * @throws DecodeKeyError if a token is not found
     */
    decodeBytes(tokens) {
        const parts = [];
        let totalLength = 0;
        for (const token of tokens) {
            let tokenBytes = this.decoder.get(token);
            if (tokenBytes === undefined) {
                tokenBytes = this.specialTokensDecoder.get(token);
            }
            if (tokenBytes === undefined) {
                throw new DecodeKeyError(token);
            }
            parts.push(tokenBytes);
            totalLength += tokenBytes.length;
        }
        // Concatenate all parts
        const result = new Uint8Array(totalLength);
        let offset = 0;
        for (const part of parts) {
            result.set(part, offset);
            offset += part.length;
        }
        return result;
    }
    /**
     * Decode tokens to a string
     *
     * @param tokens - Array of token ranks to decode
     * @returns Decoded string
     * @throws DecodeKeyError if a token is not found
     */
    decode(tokens) {
        const bytes = this.decodeBytes(tokens);
        return this.textDecoder.decode(bytes);
    }
    /**
     * Encode text without handling special tokens
     *
     * This is the core encoding logic. Special tokens are treated as regular text.
     *
     * @param text - Text to encode
     * @returns Array of token ranks
     */
    encodeOrdinary(text) {
        const tokens = [];
        // Reset regex state
        this.regex.lastIndex = 0;
        let match;
        while ((match = this.regex.exec(text)) !== null) {
            const piece = match[0];
            const pieceBytes = this.textEncoder.encode(piece);
            const key = bytesToKey(pieceBytes);
            // Check if the piece is a single token
            const directRank = this.encoder.get(key);
            if (directRank !== undefined) {
                tokens.push(directRank);
            }
            else {
                // Apply BPE
                const bpeTokens = bytePairEncode(pieceBytes, this.encoder);
                tokens.push(...bpeTokens);
            }
        }
        return tokens;
    }
    /**
     * Encode text with special token handling
     *
     * @param text - Text to encode
     * @param allowedSpecial - Set of special tokens that are allowed
     * @returns Tuple of [tokens, lastPieceTokenLen]
     */
    encode(text, allowedSpecial) {
        if (!this.specialRegex || allowedSpecial.size === 0) {
            // No special tokens to handle
            return [this.encodeOrdinary(text), 0];
        }
        const tokens = [];
        let start = 0;
        let lastPieceTokenLen = 0;
        while (start < text.length) {
            // Find the next special token
            let nextSpecial = null;
            let searchStart = start;
            while (searchStart < text.length) {
                this.specialRegex.lastIndex = searchStart;
                const match = this.specialRegex.exec(text);
                if (!match) {
                    break;
                }
                const matchedToken = match[0];
                if (allowedSpecial.has(matchedToken)) {
                    nextSpecial = { index: match.index, token: matchedToken };
                    break;
                }
                // Token not allowed, continue searching
                searchStart = match.index + 1;
            }
            const end = nextSpecial ? nextSpecial.index : text.length;
            // Encode the text before the special token
            if (end > start) {
                const segment = text.slice(start, end);
                // Reset regex state
                this.regex.lastIndex = 0;
                let match;
                while ((match = this.regex.exec(segment)) !== null) {
                    const piece = match[0];
                    const pieceBytes = this.textEncoder.encode(piece);
                    const key = bytesToKey(pieceBytes);
                    const directRank = this.encoder.get(key);
                    if (directRank !== undefined) {
                        lastPieceTokenLen = 1;
                        tokens.push(directRank);
                    }
                    else {
                        const bpeTokens = bytePairEncode(pieceBytes, this.encoder);
                        lastPieceTokenLen = bpeTokens.length;
                        tokens.push(...bpeTokens);
                    }
                }
            }
            // Handle the special token
            if (nextSpecial) {
                const specialRank = this.specialTokensEncoder.get(nextSpecial.token);
                if (specialRank !== undefined) {
                    tokens.push(specialRank);
                    lastPieceTokenLen = 0;
                }
                start = nextSpecial.index + nextSpecial.token.length;
            }
            else {
                break;
            }
        }
        return [tokens, lastPieceTokenLen];
    }
    /**
     * Encode text with all special tokens allowed
     *
     * @param text - Text to encode
     * @returns Array of token ranks
     */
    encodeWithSpecialTokens(text) {
        const allowedSpecial = this.getSpecialTokens();
        return this.encode(text, allowedSpecial)[0];
    }
    /**
     * Get all special tokens
     *
     * @returns Set of special token strings
     */
    getSpecialTokens() {
        return new Set(this.specialTokensEncoder.keys());
    }
    /**
     * Get the vocabulary size (excluding special tokens)
     */
    get vocabSize() {
        return this.encoder.size;
    }
    /**
     * Get the total vocabulary size (including special tokens)
     */
    get totalVocabSize() {
        return this.encoder.size + this.specialTokensEncoder.size;
    }
    /**
     * Check if a token rank is a special token
     */
    isSpecialToken(token) {
        return this.specialTokensDecoder.has(token);
    }
    /**
     * Get the byte representation of a token
     */
    getTokenBytes(token) {
        return this.decoder.get(token) ?? this.specialTokensDecoder.get(token);
    }
    /**
     * Get the rank of a byte sequence
     */
    getRank(bytes) {
        const key = bytesToKey(bytes);
        return this.encoder.get(key);
    }
    /**
     * Get the rank of a special token
     */
    getSpecialTokenRank(token) {
        return this.specialTokensEncoder.get(token);
    }
}
//# sourceMappingURL=tiktoken.js.map
```
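A minimal usage sketch for the CoreBPE class above. This is illustrative only, not from the package's docs: the deep dist import specifiers are assumptions based on the file listing, and the three-token vocabulary plus the `\S+|\s+` split pattern are invented so that every piece is a direct vocabulary hit and `bytePairEncode` is never exercised.

```ts
// Hypothetical deep imports; actual entry points depend on package.json exports.
import { CoreBPE } from "@hyvmind/tiktoken-ts/dist/core/tiktoken.js";
import { bytesToKey } from "@hyvmind/tiktoken-ts/dist/core/byte-pair-encoding.js";

const bytes = (s: string) => new TextEncoder().encode(s);

// Toy vocabulary: each regex piece maps straight to a rank.
const encoder = new Map([
  [bytesToKey(bytes("hello")), 0],
  [bytesToKey(bytes(" ")), 1],
  [bytesToKey(bytes("world")), 2],
]);
const special = new Map([["<|end|>", 3]]);

// Deliberately simple split pattern for the demo.
const bpe = new CoreBPE(encoder, special, "\\S+|\\s+");

bpe.encodeOrdinary("hello world");                // [0, 1, 2]
bpe.decode([0, 1, 2]);                            // "hello world"
bpe.encode("hello<|end|>", new Set(["<|end|>"])); // [[0, 3], 0]
bpe.encodeWithSpecialTokens("hello<|end|>");      // [0, 3]
```

Note how `encode` returns the `[tokens, lastPieceTokenLen]` tuple from the code above: the special token resets `lastPieceTokenLen` to 0, matching the tiktoken-rs behavior.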
package/dist/core/tiktoken.js.map
@@ -0,0 +1 @@
```json
{"version":3,"file":"tiktoken.js","sourceRoot":"","sources":["../../src/core/tiktoken.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAIL,cAAc,EACd,UAAU,EACV,UAAU,GACX,MAAM,yBAAyB,CAAC;AAEjC;;GAEG;AACH,MAAM,OAAO,cAAe,SAAQ,KAAK;IACvB,KAAK,CAAO;IAE5B,YAAY,KAAW;QACrB,KAAK,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;QAC9C,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;QAC7B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,GAAG,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AACpD,CAAC;AAED;;;;;GAKG;AACH,MAAM,OAAO,OAAO;IAClB,qDAAqD;IACpC,OAAO,CAAa;IAErC,kDAAkD;IACjC,oBAAoB,CAAoB;IAEzD,yDAAyD;IACxC,OAAO,CAAoB;IAE5C,6DAA6D;IAC5C,oBAAoB,CAAwB;IAE7D,wCAAwC;IACvB,KAAK,CAAS;IAE/B,gDAAgD;IAC/B,YAAY,CAAgB;IAE7C,mDAAmD;IAClC,WAAW,CAAmC;IAE/D,mDAAmD;IAClC,WAAW,CAAmC;IAE/D;;;;;;OAMG;IACH,YACE,OAAmB,EACnB,oBAAuC,EACvC,OAAe;QAEf,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,oBAAoB,GAAG,oBAAoB,CAAC;QACjD,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;QAE9D,yCAAyC;QACzC,IAAI,CAAC,OAAO,GAAG,IAAI,GAAG,EAAE,CAAC;QACzB,KAAK,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,OAAO,EAAE,CAAC;YAClC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,4CAA4C;QAC5C,IAAI,OAAO,CAAC,IAAI,KAAK,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YACvC,MAAM,IAAI,KAAK,CACb,qGAAqG,CACtG,CAAC;QACJ,CAAC;QAED,+BAA+B;QAC/B,IAAI,CAAC,oBAAoB,GAAG,IAAI,GAAG,EAAE,CAAC;QACtC,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,oBAAoB,EAAE,CAAC;YACjD,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACtE,CAAC;QAED,4BAA4B;QAC5B,mFAAmF;QACnF,IAAI,CAAC,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;QAEvC,6BAA6B;QAC7B,IAAI,oBAAoB,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YAClC,MAAM,aAAa,GAAG,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC;iBAC1D,GAAG,CAAC,WAAW,CAAC;iBAChB,IAAI,CAAC,GAAG,CAAC,CAAC;YACb,IAAI,CAAC,YAAY,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;IAED;;;;;;;;OAQG;IACH,WAAW,CAAC,MAAc;QACxB,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,IAAI,WAAW,GAAG,CAAC,CAAC;QAEpB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,UAAU,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YACpD,CAAC;YACD,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,MAAM,IAAI,cAAc,CAAC,KAAK,CAAC,CAAC;YAClC,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACvB,WAAW,IAAI,UAAU,CAAC,MAAM,CAAC;QACnC,CAAC;QAED,wBAAwB;QACxB,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAC3C,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YACzB,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC;QACxB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,MAAc;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACvC,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IACxC,CAAC;IAED;;;;;;;OAOG;IACH,cAAc,CAAC,IAAY;QACzB,MAAM,MAAM,GAAW,EAAE,CAAC;QAE1B,oBAAoB;QACpB,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC;QAEzB,IAAI,KAAK,CAAC;QACV,OAAO,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACvB,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAClD,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;YAEnC,uCAAuC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,YAAY;gBACZ,MAAM,SAAS,GAAG,cAAc,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAC3D,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY,EAAE,cAA2B;QAC9C,IAAI,CAAC,IAAI,CAAC,YAAY,IAAI,cAAc,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YACpD,8BAA8B;YAC9B,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,MAAM,GAAW,EAAE,CAAC;QAC1B,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,iBAAiB,GAAG,CAAC,CAAC;QAE1B,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,8BAA8B;YAC9B,IAAI,WAAW,GAA4C,IAAI,CAAC;YAChE,IAAI,WAAW,GAAG,KAAK,CAAC;YAExB,OAAO,WAAW,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjC,IAAI,CAAC,YAAY,CAAC,SAAS,GAAG,WAAW,CAAC;gBAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAE3C,IAAI,CAAC,KAAK,EAAE,CAAC;oBACX,MAAM;gBACR,CAAC;gBAED,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC9B,IAAI,cAAc,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,CAAC;oBACrC,WAAW,GAAG,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;oBAC1D,MAAM;gBACR,CAAC;gBAED,wCAAwC;gBACxC,WAAW,GAAG,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC;YAChC,CAAC;YAED,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;YAE1D,2CAA2C;YAC3C,IAAI,GAAG,GAAG,KAAK,EAAE,CAAC;gBAChB,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAEvC,oBAAoB;gBACpB,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC;gBAEzB,IAAI,KAAK,CAAC;gBACV,OAAO,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;oBACvB,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBAClD,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;oBAEnC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACzC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;wBAC7B,iBAAiB,GAAG,CAAC,CAAC;wBACtB,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;oBAC1B,CAAC;yBAAM,CAAC;wBACN,MAAM,SAAS,GAAG,cAAc,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;wBAC3D,iBAAiB,GAAG,SAAS,CAAC,MAAM,CAAC;wBACrC,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;oBAC5B,CAAC;gBACH,CAAC;YACH,CAAC;YAED,2BAA2B;YAC3B,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,WAAW,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBACrE,IAAI,WAAW,KAAK,SAAS,EAAE,CAAC;oBAC9B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBACzB,iBAAiB,GAAG,CAAC,CAAC;gBACxB,CAAC;gBACD,KAAK,GAAG,WAAW,CAAC,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC;YACvD,CAAC;iBAAM,CAAC;gBACN,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC;IACrC,CAAC;IAED;;;;;OAKG;IACH,uBAAuB,CAAC,IAAY;QAClC,MAAM,cAAc,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC/C,OAAO,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED;;;;OAIG;IACH,gBAAgB;QACd,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,IAAI,cAAc;QAChB,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;IAC5D,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,KAAW;QACxB,OAAO,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,KAAW;QACvB,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IACzE,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,KAAiB;QACvB,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC9B,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,mBAAmB,CAAC,KAAa;QAC/B,OAAO,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC9C,CAAC;CACF"}
```
package/dist/core/vocab-loader.d.ts
@@ -0,0 +1,77 @@
```ts
/**
 * Vocabulary Loader
 *
 * Handles loading and parsing of .tiktoken vocabulary files.
 * These files contain base64-encoded tokens with their ranks.
 *
 * Format: Each line is "base64_token rank"
 * Example: "SGVsbG8= 12345" means "Hello" has rank 12345
 */
import { type Vocabulary, type ReverseVocabulary, type Rank, keyToBytes } from "./byte-pair-encoding.js";
export { keyToBytes };
/**
 * Base64 encode (works in both Node.js and browser)
 */
export declare function base64Encode(bytes: Uint8Array): string;
/**
 * Parse a .tiktoken vocabulary file content
 *
 * @param content - The raw content of the .tiktoken file
 * @returns Vocabulary map and reverse vocabulary map
 */
export declare function parseVocabulary(content: string): {
    encoder: Vocabulary;
    decoder: ReverseVocabulary;
};
/**
 * Vocabulary URLs for each encoding
 */
export declare const VOCABULARY_URLS: Record<string, string>;
/**
 * Load vocabulary from URL (with caching)
 *
 * @param encodingName - Name of the encoding
 * @returns Promise resolving to vocabulary maps
 */
export declare function loadVocabularyFromUrl(encodingName: string): Promise<{
    encoder: Vocabulary;
    decoder: ReverseVocabulary;
}>;
/**
 * Load vocabulary from a string (for embedded vocabularies)
 *
 * @param encodingName - Name of the encoding (for caching)
 * @param content - The vocabulary file content
 * @returns Vocabulary maps
 */
export declare function loadVocabularyFromString(encodingName: string, content: string): {
    encoder: Vocabulary;
    decoder: ReverseVocabulary;
};
/**
 * Create special tokens map
 *
 * @param specialTokens - Map of special token strings to ranks
 * @returns Maps for encoding and decoding special tokens
 */
export declare function createSpecialTokenMaps(specialTokens: Record<string, Rank>): {
    encoder: Map<string, Rank>;
    decoder: Map<Rank, Uint8Array>;
};
/**
 * Clear the vocabulary cache
 * Useful for testing or memory management
 */
export declare function clearVocabularyCache(): void;
/**
 * Get vocabulary from cache (sync, returns undefined if not loaded)
 */
export declare function getVocabularyFromCache(encodingName: string): {
    encoder: Vocabulary;
    decoder: ReverseVocabulary;
} | undefined;
/**
 * Check if a vocabulary is cached
 */
export declare function isVocabularyCached(encodingName: string): boolean;
//# sourceMappingURL=vocab-loader.d.ts.map
```
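A quick sketch of the special-token helper declared above. The import path is an assumption based on the dist layout; `<|endoftext|>` at rank 100257 is the familiar cl100k_base convention, but any string-to-rank mapping works.

```ts
// Hypothetical deep import; adjust to the package's actual exports.
import { createSpecialTokenMaps } from "@hyvmind/tiktoken-ts/dist/core/vocab-loader.js";

const { encoder, decoder } = createSpecialTokenMaps({ "<|endoftext|>": 100257 });
encoder.get("<|endoftext|>"); // 100257
decoder.get(100257);          // Uint8Array of the UTF-8 bytes of "<|endoftext|>"
```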
package/dist/core/vocab-loader.d.ts.map
@@ -0,0 +1 @@
```json
{"version":3,"file":"vocab-loader.d.ts","sourceRoot":"","sources":["../../src/core/vocab-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,KAAK,UAAU,EACf,KAAK,iBAAiB,EACtB,KAAK,IAAI,EAET,UAAU,EACX,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EAAE,UAAU,EAAE,CAAC;AAsBtB;;GAEG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAWtD;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG;IAChD,OAAO,EAAE,UAAU,CAAC;IACpB,OAAO,EAAE,iBAAiB,CAAC;CAC5B,CAgCA;AAED;;GAEG;AACH,eAAO,MAAM,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAalD,CAAC;AAUF;;;;;GAKG;AACH,wBAAsB,qBAAqB,CACzC,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,CAAC,CA2B9D;AAED;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,YAAY,EAAE,MAAM,EACpB,OAAO,EAAE,MAAM,GACd;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,CAarD;AAED;;;;;GAKG;AACH,wBAAgB,sBAAsB,CAAC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,GAAG;IAC3E,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC3B,OAAO,EAAE,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;CAChC,CAYA;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,IAAI,IAAI,CAE3C;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CACpC,YAAY,EAAE,MAAM,GACnB;IAAE,OAAO,EAAE,UAAU,CAAC;IAAC,OAAO,EAAE,iBAAiB,CAAA;CAAE,GAAG,SAAS,CAEjE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,YAAY,EAAE,MAAM,GAAG,OAAO,CAEhE"}
```
package/dist/core/vocab-loader.js
@@ -0,0 +1,176 @@
```js
/**
 * Vocabulary Loader
 *
 * Handles loading and parsing of .tiktoken vocabulary files.
 * These files contain base64-encoded tokens with their ranks.
 *
 * Format: Each line is "base64_token rank"
 * Example: "SGVsbG8= 12345" means "Hello" has rank 12345
 */
import { bytesToKey, keyToBytes, } from "./byte-pair-encoding.js";
// Re-export for convenience
export { keyToBytes };
/**
 * Base64 decode (works in both Node.js and browser)
 */
function base64Decode(base64) {
    // Use built-in atob for browser compatibility
    // Node.js 18+ also supports atob globally
    if (typeof atob === "function") {
        const binaryString = atob(base64);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }
        return bytes;
    }
    // Fallback for older Node.js (should not be needed with Node 18+)
    const buffer = Buffer.from(base64, "base64");
    return new Uint8Array(buffer);
}
/**
 * Base64 encode (works in both Node.js and browser)
 */
export function base64Encode(bytes) {
    if (typeof btoa === "function") {
        let binaryString = "";
        for (let i = 0; i < bytes.length; i++) {
            binaryString += String.fromCharCode(bytes[i]);
        }
        return btoa(binaryString);
    }
    // Fallback for older Node.js
    return Buffer.from(bytes).toString("base64");
}
/**
 * Parse a .tiktoken vocabulary file content
 *
 * @param content - The raw content of the .tiktoken file
 * @returns Vocabulary map and reverse vocabulary map
 */
export function parseVocabulary(content) {
    const encoder = new Map();
    const decoder = new Map();
    const lines = content.split("\n");
    for (const line of lines) {
        const trimmed = line.trim();
        if (!trimmed)
            continue;
        const spaceIndex = trimmed.indexOf(" ");
        if (spaceIndex === -1)
            continue;
        const base64Token = trimmed.slice(0, spaceIndex);
        const rankStr = trimmed.slice(spaceIndex + 1);
        try {
            const tokenBytes = base64Decode(base64Token);
            const rank = parseInt(rankStr, 10);
            if (!isNaN(rank)) {
                const key = bytesToKey(tokenBytes);
                encoder.set(key, rank);
                decoder.set(rank, tokenBytes);
            }
        }
        catch {
            // Skip malformed lines
            continue;
        }
    }
    return { encoder, decoder };
}
/**
 * Vocabulary URLs for each encoding
 */
export const VOCABULARY_URLS = {
    r50k_base: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
    p50k_base: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
    p50k_edit: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", // Uses same vocab
    cl100k_base: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
    o200k_base: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
    o200k_harmony: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", // Uses same vocab
};
/**
 * Cache for loaded vocabularies
 */
const vocabularyCache = new Map();
/**
 * Load vocabulary from URL (with caching)
 *
 * @param encodingName - Name of the encoding
 * @returns Promise resolving to vocabulary maps
 */
export async function loadVocabularyFromUrl(encodingName) {
    // Check cache first
    const cached = vocabularyCache.get(encodingName);
    if (cached) {
        return cached;
    }
    const url = VOCABULARY_URLS[encodingName];
    if (!url) {
        throw new Error(`Unknown encoding: ${encodingName}`);
    }
    // Fetch the vocabulary file
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`Failed to fetch vocabulary for ${encodingName}: ${response.statusText}`);
    }
    const content = await response.text();
    const vocab = parseVocabulary(content);
    // Cache the result
    vocabularyCache.set(encodingName, vocab);
    return vocab;
}
/**
 * Load vocabulary from a string (for embedded vocabularies)
 *
 * @param encodingName - Name of the encoding (for caching)
 * @param content - The vocabulary file content
 * @returns Vocabulary maps
 */
export function loadVocabularyFromString(encodingName, content) {
    // Check cache first
    const cached = vocabularyCache.get(encodingName);
    if (cached) {
        return cached;
    }
    const vocab = parseVocabulary(content);
    // Cache the result
    vocabularyCache.set(encodingName, vocab);
    return vocab;
}
/**
 * Create special tokens map
 *
 * @param specialTokens - Map of special token strings to ranks
 * @returns Maps for encoding and decoding special tokens
 */
export function createSpecialTokenMaps(specialTokens) {
    const encoder = new Map();
    const decoder = new Map();
    for (const [token, rank] of Object.entries(specialTokens)) {
        encoder.set(token, rank);
        // Convert string to bytes for decoder
        const textEncoder = new TextEncoder();
        decoder.set(rank, textEncoder.encode(token));
    }
    return { encoder, decoder };
}
/**
 * Clear the vocabulary cache
 * Useful for testing or memory management
 */
export function clearVocabularyCache() {
    vocabularyCache.clear();
}
/**
 * Get vocabulary from cache (sync, returns undefined if not loaded)
 */
export function getVocabularyFromCache(encodingName) {
    return vocabularyCache.get(encodingName);
}
/**
 * Check if a vocabulary is cached
 */
export function isVocabularyCached(encodingName) {
    return vocabularyCache.has(encodingName);
}
//# sourceMappingURL=vocab-loader.js.map
```
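The `base64_token rank` line format described in the header comment is easy to check by hand. A sketch (import path assumed) parsing a two-line vocabulary, where `SGVsbG8=` is the base64 of "Hello" and `IQ==` is "!":

```ts
// Hypothetical deep import based on the dist layout above.
import {
  parseVocabulary,
  loadVocabularyFromString,
} from "@hyvmind/tiktoken-ts/dist/core/vocab-loader.js";

const { encoder, decoder } = parseVocabulary("SGVsbG8= 12345\nIQ== 12346\n");
encoder.size;       // 2
decoder.get(12345); // Uint8Array(5) holding the UTF-8 bytes of "Hello"

// Same content through the caching entry point; "my-embedded-vocab" is an
// arbitrary cache key, and a second call with it returns the cached maps
// without re-parsing.
loadVocabularyFromString("my-embedded-vocab", "SGVsbG8= 12345\n");
```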
package/dist/core/vocab-loader.js.map
@@ -0,0 +1 @@
```json
{"version":3,"file":"vocab-loader.js","sourceRoot":"","sources":["../../src/core/vocab-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAIL,UAAU,EACV,UAAU,GACX,MAAM,yBAAyB,CAAC;AAEjC,4BAA4B;AAC5B,OAAO,EAAE,UAAU,EAAE,CAAC;AAEtB;;GAEG;AACH,SAAS,YAAY,CAAC,MAAc;IAClC,8CAA8C;IAC9C,0CAA0C;IAC1C,IAAI,OAAO,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACxC,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,kEAAkE;IAClE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;IAC7C,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,KAAiB;IAC5C,IAAI,OAAO,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAChD,CAAC;QACD,OAAO,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5B,CAAC;IAED,6BAA6B;IAC7B,OAAO,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAC/C,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,eAAe,CAAC,OAAe;IAI7C,MAAM,OAAO,GAAe,IAAI,GAAG,EAAE,CAAC;IACtC,MAAM,OAAO,GAAsB,IAAI,GAAG,EAAE,CAAC;IAE7C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACxC,IAAI,UAAU,KAAK,CAAC,CAAC;YAAE,SAAS;QAEhC,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,YAAY,CAAC,WAAW,CAAC,CAAC;YAC7C,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YAEnC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBACjB,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;gBACnC,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;gBACvB,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;YAChC,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;YACvB,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,eAAe,GAA2B;IACrD,SAAS,EACP,yEAAyE;IAC3E,SAAS,EACP,yEAAyE;IAC3E,SAAS,EACP,yEAAyE,EAAE,kBAAkB;IAC/F,WAAW,EACT,2EAA2E;IAC7E,UAAU,EACR,0EAA0E;IAC5E,aAAa,EACX,0EAA0E,EAAE,kBAAkB;CACjG,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAGjB,IAAI,GAAG,EAAE,CAAC;AAEd;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,YAAoB;IAEpB,oBAAoB;IACpB,MAAM,MAAM,GAAG,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IACjD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,GAAG,GAAG,eAAe,CAAC,YAAY,CAAC,CAAC;IAC1C,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,KAAK,CAAC,qBAAqB,YAAY,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,4BAA4B;IAC5B,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CACb,kCAAkC,YAAY,KAAK,QAAQ,CAAC,UAAU,EAAE,CACzE,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACtC,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEvC,mBAAmB;IACnB,eAAe,CAAC,GAAG,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;IAEzC,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,YAAoB,EACpB,OAAe;IAEf,oBAAoB;IACpB,MAAM,MAAM,GAAG,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IACjD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEvC,mBAAmB;IACnB,eAAe,CAAC,GAAG,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;IAEzC,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,sBAAsB,CAAC,aAAmC;IAIxE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAgB,CAAC;IACxC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAoB,CAAC;IAE5C,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QACzB,sCAAsC;QACtC,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;IAC/C,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB;IAClC,eAAe,CAAC,KAAK,EAAE,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,sBAAsB,CACpC,YAAoB;IAEpB,OAAO,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,YAAoB;IACrD,OAAO,eAAe,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC"}
```
package/dist/encodings/cl100k-base.d.ts
@@ -0,0 +1,43 @@
```ts
/**
 * cl100k_base Encoding
 * Used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 *
 * This encoding has:
 * - 100,256 tokens
 * - Better handling of code and programming languages
 * - Improved whitespace handling
 */
import type { Tokenizer, EncodingName } from "../types.js";
/**
 * Encoding name constant
 */
export declare const ENCODING_NAME: EncodingName;
/**
 * Create a cl100k_base tokenizer instance
 *
 * @returns Tokenizer instance
 */
export declare function createCL100kTokenizer(): Tokenizer;
/**
 * Pre-computed token counts for common patterns
 * Used to improve estimation accuracy
 */
export declare const COMMON_TOKEN_PATTERNS: Record<string, number>;
/**
 * Average characters per token for cl100k_base
 * Based on empirical analysis of English text
 */
export declare const AVERAGE_CHARS_PER_TOKEN = 3.8;
/**
 * Token estimation adjustments for different content types
 */
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
/**
 * Get estimated token count with content-type awareness
 *
 * @param text - Input text
 * @param contentType - Type of content
 * @returns Estimated token count
 */
export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
//# sourceMappingURL=cl100k-base.d.ts.map
```
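A sketch of the estimation helpers declared above. The import path is an assumption, and the `"code"` multiplier key is a guess: the actual key set of CONTENT_TYPE_MULTIPLIERS is not visible in this diff.

```ts
// Hypothetical deep import based on the dist layout.
import {
  AVERAGE_CHARS_PER_TOKEN,
  estimateTokensForContent,
} from "@hyvmind/tiktoken-ts/dist/encodings/cl100k-base.js";

// Character-count heuristic: roughly 3.8 characters per token for English text.
const text = "The quick brown fox jumps over the lazy dog.";
Math.ceil(text.length / AVERAGE_CHARS_PER_TOKEN); // ceil(44 / 3.8) = 12

// Content-aware estimate; "code" is a hypothetical multiplier key.
estimateTokensForContent("const x = 1;", "code");
```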
package/dist/encodings/cl100k-base.d.ts.map
@@ -0,0 +1 @@
```json
{"version":3,"file":"cl100k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/cl100k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA4B,CAAC;AAEzD;;;;GAIG;AACH,wBAAgB,qBAAqB,IAAI,SAAS,CASjD;AAED;;;GAGG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAmFxD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR"}
```