@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0

package/dist/core/byte-pair-encoding.d.ts
@@ -0,0 +1,49 @@
/**
 * Byte-Pair Encoding Core Algorithm
 *
 * This is an EXACT port of the BPE algorithm from tiktoken-rs.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
 *
 * The algorithm works as follows:
 * 1. Split input text into pieces using a regex pattern
 * 2. For each piece, convert to bytes and apply BPE merges
 * 3. Look up each merged byte sequence in the vocabulary to get token IDs
 */
/**
 * Token rank type (same as tiktoken-rs)
 */
export type Rank = number;
/**
 * Vocabulary type: maps byte sequences to ranks
 */
export type Vocabulary = Map<string, Rank>;
/**
 * Reverse vocabulary: maps ranks to byte sequences
 */
export type ReverseVocabulary = Map<Rank, Uint8Array>;
/**
 * Convert a byte array to a string key for Map lookup
 * This is necessary because JavaScript Maps use reference equality for arrays
 */
export declare function bytesToKey(bytes: Uint8Array): string;
/**
 * Convert a string key back to bytes
 */
export declare function keyToBytes(key: string): Uint8Array;
/**
 * Encode a byte sequence using BPE
 *
 * @param piece - The byte sequence to encode
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of token ranks
 */
export declare function bytePairEncode(piece: Uint8Array, ranks: Vocabulary): Rank[];
/**
 * Split a byte sequence into BPE tokens (returns byte slices, not ranks)
 *
 * @param piece - The byte sequence to split
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of byte slices
 */
export declare function bytePairSplit(piece: Uint8Array, ranks: Vocabulary): Uint8Array[];
//# sourceMappingURL=byte-pair-encoding.d.ts.map
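
A note on the key representation above: two Uint8Arrays with identical contents are still distinct Map keys in JavaScript, which is why the vocabulary is keyed by strings. A minimal round-trip sketch using only the functions declared in this file (the sample bytes are arbitrary):

import { bytesToKey, keyToBytes, type Vocabulary } from "./byte-pair-encoding.js";

// Reference equality: an equal-content Uint8Array is a *different* Map key.
const direct = new Map<Uint8Array, number>();
direct.set(new Uint8Array([104, 105]), 0);
direct.get(new Uint8Array([104, 105])); // undefined

// The string key sidesteps this: one character per byte (char codes 0-255).
const vocab: Vocabulary = new Map([[bytesToKey(new Uint8Array([104, 105])), 0]]);
vocab.get(bytesToKey(new Uint8Array([104, 105]))); // 0

// keyToBytes inverts bytesToKey exactly for byte values 0-255.
keyToBytes(bytesToKey(new Uint8Array([0, 255]))); // Uint8Array [0, 255]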

package/dist/core/byte-pair-encoding.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"byte-pair-encoding.d.ts","sourceRoot":"","sources":["../../src/core/byte-pair-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH;;GAEG;AACH,MAAM,MAAM,IAAI,GAAG,MAAM,CAAC;AAE1B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;AAE3C;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;AAEtD;;;GAGG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAQpD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,CAMlD;AA8ED;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,UAAU,GAAG,IAAI,EAAE,CA2B3E;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAC3B,KAAK,EAAE,UAAU,EACjB,KAAK,EAAE,UAAU,GAChB,UAAU,EAAE,CAed"}

package/dist/core/byte-pair-encoding.js
@@ -0,0 +1,154 @@
/**
 * Byte-Pair Encoding Core Algorithm
 *
 * This is an EXACT port of the BPE algorithm from tiktoken-rs.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
 *
 * The algorithm works as follows:
 * 1. Split input text into pieces using a regex pattern
 * 2. For each piece, convert to bytes and apply BPE merges
 * 3. Look up each merged byte sequence in the vocabulary to get token IDs
 */
/**
 * Convert a byte array to a string key for Map lookup
 * This is necessary because JavaScript Maps use reference equality for arrays
 */
export function bytesToKey(bytes) {
    // Use a more efficient encoding for the key
    // Each byte becomes a character in the string
    let key = "";
    for (let i = 0; i < bytes.length; i++) {
        key += String.fromCharCode(bytes[i]);
    }
    return key;
}
/**
 * Convert a string key back to bytes
 */
export function keyToBytes(key) {
    const bytes = new Uint8Array(key.length);
    for (let i = 0; i < key.length; i++) {
        bytes[i] = key.charCodeAt(i);
    }
    return bytes;
}
/**
 * Internal BPE merge algorithm
 *
 * This is the core of the BPE algorithm. It finds the lowest-ranked pair
 * of adjacent bytes and merges them, repeating until no more merges are possible.
 *
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @param piece - The byte sequence to process
 * @returns Array of (start_index, rank) tuples representing the merge points
 */
function _bytePairMerge(ranks, piece) {
    // This is a vector of (start, rank).
    // The rank is of the pair starting at position start.
    const parts = [];
    // Note that we hash bytes when indexing into `ranks`, not token pairs. As long as we train BPE
    // the way we currently do, this is equivalent. An easy way to break this would be to decouple
    // merge priority from token index or to prevent specific token merges.
    let minRank = [
        Number.MAX_SAFE_INTEGER,
        Number.MAX_SAFE_INTEGER,
    ];
    for (let i = 0; i < piece.length - 1; i++) {
        const pairKey = bytesToKey(piece.slice(i, i + 2));
        const rank = ranks.get(pairKey) ?? Number.MAX_SAFE_INTEGER;
        if (rank < minRank[0]) {
            minRank = [rank, i];
        }
        parts.push([i, rank]);
    }
    parts.push([piece.length - 1, Number.MAX_SAFE_INTEGER]);
    parts.push([piece.length, Number.MAX_SAFE_INTEGER]);
    const getRank = (parts, i) => {
        if (i + 3 < parts.length) {
            // Similar to `piece[i..i + 2]` above. The +3 is because we haven't yet deleted
            // parts[i + 1], see comment in the main loop.
            const start = parts[i][0];
            const end = parts[i + 3][0];
            const pairKey = bytesToKey(piece.slice(start, end));
            return ranks.get(pairKey) ?? Number.MAX_SAFE_INTEGER;
        }
        else {
            return Number.MAX_SAFE_INTEGER;
        }
    };
    // If you have n parts and m merges, this does O(mn) work.
    // We could do something with a heap and do O(m log n) work.
    // n is often very small so considerations like cache-locality outweigh the algorithmic
    // complexity downsides of the `parts` vector.
    while (minRank[0] !== Number.MAX_SAFE_INTEGER) {
        const i = minRank[1];
        // Update parts[i] and parts[i - 1] before removing parts[i + 1], since
        // `parts.splice(i + 1, 1)` will thrash the cache.
        if (i > 0) {
            parts[i - 1][1] = getRank(parts, i - 1);
        }
        parts[i][1] = getRank(parts, i);
        parts.splice(i + 1, 1);
        minRank = [Number.MAX_SAFE_INTEGER, Number.MAX_SAFE_INTEGER];
        for (let j = 0; j < parts.length - 1; j++) {
            const rank = parts[j][1];
            if (rank < minRank[0]) {
                minRank = [rank, j];
            }
        }
    }
    return parts;
}
/**
 * Encode a byte sequence using BPE
 *
 * @param piece - The byte sequence to encode
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of token ranks
 */
export function bytePairEncode(piece, ranks) {
    if (piece.length === 1) {
        const key = bytesToKey(piece);
        const rank = ranks.get(key);
        if (rank === undefined) {
            throw new Error(`Unknown byte: ${piece[0]}`);
        }
        return [rank];
    }
    const parts = _bytePairMerge(ranks, piece);
    // Convert parts to token ranks
    const tokens = [];
    for (let i = 0; i < parts.length - 1; i++) {
        const start = parts[i][0];
        const end = parts[i + 1][0];
        const tokenBytes = piece.slice(start, end);
        const key = bytesToKey(tokenBytes);
        const rank = ranks.get(key);
        if (rank === undefined) {
            throw new Error(`Unknown token: ${Array.from(tokenBytes).join(",")}`);
        }
        tokens.push(rank);
    }
    return tokens;
}
/**
 * Split a byte sequence into BPE tokens (returns byte slices, not ranks)
 *
 * @param piece - The byte sequence to split
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of byte slices
 */
export function bytePairSplit(piece, ranks) {
    if (piece.length <= 1) {
        return [piece];
    }
    const parts = _bytePairMerge(ranks, piece);
    const splits = [];
    for (let i = 0; i < parts.length - 1; i++) {
        const start = parts[i][0];
        const end = parts[i + 1][0];
        splits.push(piece.slice(start, end));
    }
    return splits;
}
//# sourceMappingURL=byte-pair-encoding.js.map
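
The merge loop is easiest to follow on a toy vocabulary. The ranks below are made up for illustration (real ranks come from the downloaded .tiktoken vocabulary files), but the calls use only the functions exported by this file:

import { bytePairEncode, bytePairSplit, bytesToKey, type Vocabulary } from "./byte-pair-encoding.js";

const enc = new TextEncoder();
// Hypothetical vocabulary: every single byte we use, plus one merge, "ab".
const vocab: Vocabulary = new Map([
    [bytesToKey(enc.encode("a")), 0],
    [bytesToKey(enc.encode("b")), 1],
    [bytesToKey(enc.encode("c")), 2],
    [bytesToKey(enc.encode("ab")), 3],
]);

// For "abc": the pair "ab" has rank 3, the pair "bc" is unknown (MAX_SAFE_INTEGER),
// so "ab" is merged and "c" is left alone; no further merge ("abc") is in the vocabulary.
bytePairEncode(enc.encode("abc"), vocab); // [3, 2]
bytePairSplit(enc.encode("abc"), vocab);  // [bytes of "ab", bytes of "c"]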

package/dist/core/byte-pair-encoding.js.map
@@ -0,0 +1 @@
{"version":3,"file":"byte-pair-encoding.js","sourceRoot":"","sources":["../../src/core/byte-pair-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAiBH;;;GAGG;AACH,MAAM,UAAU,UAAU,CAAC,KAAiB;IAC1C,4CAA4C;IAC5C,8CAA8C;IAC9C,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,cAAc,CACrB,KAAiB,EACjB,KAAiB;IAEjB,qCAAqC;IACrC,sDAAsD;IACtD,MAAM,KAAK,GAA0B,EAAE,CAAC;IAExC,+FAA+F;IAC/F,8FAA8F;IAC9F,uEAAuE;IACvE,IAAI,OAAO,GAAmB;QAC5B,MAAM,CAAC,gBAAgB;QACvB,MAAM,CAAC,gBAAgB;KACxB,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAClD,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,gBAAgB,CAAC;QAC3D,IAAI,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;YACtB,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IACxB,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,CAAC,KAA4B,EAAE,CAAS,EAAQ,EAAE;QAChE,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACzB,+EAA+E;YAC/E,8CAA8C;YAC9C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YACpD,OAAO,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,gBAAgB,CAAC;QACvD,CAAC;aAAM,CAAC;YACN,OAAO,MAAM,CAAC,gBAAgB,CAAC;QACjC,CAAC;IACH,CAAC,CAAC;IAEF,0DAA0D;IAC1D,4DAA4D;IAC5D,uFAAuF;IACvF,8CAA8C;IAC9C,OAAO,OAAO,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,gBAAgB,EAAE,CAAC;QAC9C,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,uEAAuE;QACvE,kDAAkD;QAClD,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1C,CAAC;QACD,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAChC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAEvB,OAAO,GAAG,CAAC,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;QAC7D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACzB,IAAI,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;gBACtB,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,cAAc,CAAC,KAAiB,EAAE,KAAiB;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,iBAAiB,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAC/C,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAE3C,+BAA+B;IAC/B,MAAM,MAAM,GAAW,EAAE,CAAC;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CA
AC,CAAC;QAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,kBAAkB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACxE,CAAC;QACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAC3B,KAAiB,EACjB,KAAiB;IAEjB,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAiB,EAAE,CAAC;IAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}

package/dist/core/encoding-definitions.d.ts
@@ -0,0 +1,95 @@
/**
 * Encoding Definitions
 *
 * EXACT definitions from tiktoken-rs for all supported encodings.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tiktoken_ext/openai_public.rs
 */
import type { Rank } from "./byte-pair-encoding.js";
/**
 * Special token constants
 */
export declare const SPECIAL_TOKENS: {
    readonly ENDOFTEXT: "<|endoftext|>";
    readonly FIM_PREFIX: "<|fim_prefix|>";
    readonly FIM_MIDDLE: "<|fim_middle|>";
    readonly FIM_SUFFIX: "<|fim_suffix|>";
    readonly ENDOFPROMPT: "<|endofprompt|>";
};
/**
 * Encoding definition interface
 */
export interface EncodingDefinition {
    /** Encoding name */
    name: string;
    /**
     * Regex pattern for splitting text into pieces
     * Note: JavaScript regex patterns differ slightly from Rust
     */
    pattern: string;
    /** Special tokens with their ranks */
    specialTokens: Record<string, Rank>;
    /** URL to download the vocabulary file */
    vocabularyUrl: string;
    /** Expected vocabulary size (for validation) */
    expectedVocabSize: number;
}
/**
 * r50k_base encoding (GPT-2)
 *
 * Used by: GPT-3 davinci, curie, babbage, ada models
 * Vocabulary size: 50,257 tokens
 */
export declare const R50K_BASE: EncodingDefinition;
/**
 * p50k_base encoding
 *
 * Used by: Code models, text-davinci-002, text-davinci-003
 * Vocabulary size: 50,281 tokens
 */
export declare const P50K_BASE: EncodingDefinition;
/**
 * p50k_edit encoding
 *
 * Used by: Edit models like text-davinci-edit-001, code-davinci-edit-001
 * Same vocabulary as p50k_base but with FIM special tokens
 */
export declare const P50K_EDIT: EncodingDefinition;
/**
 * cl100k_base encoding
 *
 * Used by: ChatGPT models, GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 * Vocabulary size: 100,256 tokens
 */
export declare const CL100K_BASE: EncodingDefinition;
/**
 * o200k_base encoding
 *
 * Used by: GPT-4o, GPT-4.1, GPT-5, o-series models
 * Vocabulary size: 200,000 tokens
 *
 * This encoding has a much larger vocabulary and is optimized for:
 * - Better handling of non-English languages
 * - More efficient tokenization of code
 * - Better multimodal support
 */
export declare const O200K_BASE: EncodingDefinition;
/**
 * o200k_harmony encoding
 *
 * Used by: gpt-oss models (open source)
 * Same vocabulary as o200k_base but with additional special tokens
 */
export declare const O200K_HARMONY: EncodingDefinition;
/**
 * All encoding definitions
 */
export declare const ENCODING_DEFINITIONS: Record<string, EncodingDefinition>;
/**
 * Get encoding definition by name
 */
export declare function getEncodingDefinition(name: string): EncodingDefinition | undefined;
/**
 * List all encoding names
 */
export declare function listEncodingNames(): string[];
//# sourceMappingURL=encoding-definitions.d.ts.map
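
Because pattern is stored as a plain string full of \p{...} classes, compiling it in JavaScript needs the Unicode flag. This hunk does not show how the package compiles it internally, so the flags below are an assumption:

import { CL100K_BASE } from "./encoding-definitions.js";

// `u` is required for \p{L}/\p{N}; `g` lets matchAll walk the whole text.
const pieceRegex = new RegExp(CL100K_BASE.pattern, "gu");

for (const m of "Hello, world!".matchAll(pieceRegex)) {
    console.log(JSON.stringify(m[0])); // "Hello", ",", " world", "!"
}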

package/dist/core/encoding-definitions.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"encoding-definitions.d.ts","sourceRoot":"","sources":["../../src/core/encoding-definitions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,yBAAyB,CAAC;AAEpD;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;CAMjB,CAAC;AAEX;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,oBAAoB;IACpB,IAAI,EAAE,MAAM,CAAC;IAEb;;;OAGG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB,sCAAsC;IACtC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAEpC,0CAA0C;IAC1C,aAAa,EAAE,MAAM,CAAC;IAEtB,gDAAgD;IAChD,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAYvB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAWvB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAevB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,WAAW,EAAE,kBAoBzB,CAAC;AAEF;;;;;;;;;;GAUG;AACH,eAAO,MAAM,UAAU,EAAE,kBA4BxB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,aAAa,EAAE,kBA2B3B,CAAC;AAgBF;;GAEG;AACH,eAAO,MAAM,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE,kBAAkB,CASnE,CAAC;AAEF;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,GACX,kBAAkB,GAAG,SAAS,CAEhC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,IAAI,MAAM,EAAE,CAE5C"}

package/dist/core/encoding-definitions.js
@@ -0,0 +1,202 @@
/**
 * Encoding Definitions
 *
 * EXACT definitions from tiktoken-rs for all supported encodings.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tiktoken_ext/openai_public.rs
 */
/**
 * Special token constants
 */
export const SPECIAL_TOKENS = {
    ENDOFTEXT: "<|endoftext|>",
    FIM_PREFIX: "<|fim_prefix|>",
    FIM_MIDDLE: "<|fim_middle|>",
    FIM_SUFFIX: "<|fim_suffix|>",
    ENDOFPROMPT: "<|endofprompt|>",
};
/**
 * r50k_base encoding (GPT-2)
 *
 * Used by: GPT-3 davinci, curie, babbage, ada models
 * Vocabulary size: 50,257 tokens
 */
export const R50K_BASE = {
    name: "r50k_base",
    // Pattern from tiktoken-rs:
    // 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
    expectedVocabSize: 50256,
};
/**
 * p50k_base encoding
 *
 * Used by: Code models, text-davinci-002, text-davinci-003
 * Vocabulary size: 50,281 tokens
 */
export const P50K_BASE = {
    name: "p50k_base",
    // Same pattern as r50k_base
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
    expectedVocabSize: 50280,
};
/**
 * p50k_edit encoding
 *
 * Used by: Edit models like text-davinci-edit-001, code-davinci-edit-001
 * Same vocabulary as p50k_base but with FIM special tokens
 */
export const P50K_EDIT = {
    name: "p50k_edit",
    // Same pattern as p50k_base
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
        [SPECIAL_TOKENS.FIM_PREFIX]: 50281,
        [SPECIAL_TOKENS.FIM_MIDDLE]: 50282,
        [SPECIAL_TOKENS.FIM_SUFFIX]: 50283,
    },
    // Uses the same vocabulary file as p50k_base
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
    expectedVocabSize: 50280,
};
/**
 * cl100k_base encoding
 *
 * Used by: ChatGPT models, GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 * Vocabulary size: 100,256 tokens
 */
export const CL100K_BASE = {
    name: "cl100k_base",
    // Pattern from tiktoken-rs:
    // (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
    //
    // JavaScript equivalent - (?i:...) is not supported, so we use case-insensitive alternatives
    // Note: JavaScript doesn't support inline case-insensitive groups (?i:)
    // We need to manually expand the contractions pattern
    pattern: "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 100257,
        [SPECIAL_TOKENS.FIM_PREFIX]: 100258,
        [SPECIAL_TOKENS.FIM_MIDDLE]: 100259,
        [SPECIAL_TOKENS.FIM_SUFFIX]: 100260,
        [SPECIAL_TOKENS.ENDOFPROMPT]: 100276,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
    expectedVocabSize: 100256,
};
/**
 * o200k_base encoding
 *
 * Used by: GPT-4o, GPT-4.1, GPT-5, o-series models
 * Vocabulary size: 200,000 tokens
 *
 * This encoding has a much larger vocabulary and is optimized for:
 * - Better handling of non-English languages
 * - More efficient tokenization of code
 * - Better multimodal support
 */
export const O200K_BASE = {
    name: "o200k_base",
    // Pattern from tiktoken-rs O200K_BASE_PAT_STR:
    // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?
    // |[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?
    // |\p{N}{1,3}
    // | ?[^\s\p{L}\p{N}]+[\r\n/]* <-- Note: includes forward slash
    // |\s*[\r\n]+
    // |\s+(?!\S)
    // |\s+
    //
    // JavaScript equivalent - expand the case-insensitive contractions
    pattern: [
        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
        "\\p{N}{1,3}",
        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*", // Note: includes forward slash
        "\\s*[\\r\\n]+",
        "\\s+(?!\\S)",
        "\\s+",
    ].join("|"),
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 199999,
        [SPECIAL_TOKENS.ENDOFPROMPT]: 200018,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
    expectedVocabSize: 199998,
};
/**
 * o200k_harmony encoding
 *
 * Used by: gpt-oss models (open source)
 * Same vocabulary as o200k_base but with additional special tokens
 */
export const O200K_HARMONY = {
    name: "o200k_harmony",
    // Same pattern as o200k_base
    pattern: O200K_BASE.pattern,
    specialTokens: {
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
        // Reserved tokens from 200013 to 201087 are generated dynamically
        ...generateReservedTokens(200013, 201087),
    },
    // Uses the same vocabulary file as o200k_base
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
    expectedVocabSize: 199998,
};
/**
 * Generate reserved special tokens
 */
function generateReservedTokens(start, end) {
    const tokens = {};
    for (let i = start; i <= end; i++) {
        tokens[`<|reserved_${i}|>`] = i;
    }
    return tokens;
}
/**
 * All encoding definitions
 */
export const ENCODING_DEFINITIONS = {
    r50k_base: R50K_BASE,
    p50k_base: P50K_BASE,
    p50k_edit: P50K_EDIT,
    cl100k_base: CL100K_BASE,
    o200k_base: O200K_BASE,
    o200k_harmony: O200K_HARMONY,
    // Aliases
    gpt2: R50K_BASE,
};
/**
 * Get encoding definition by name
 */
export function getEncodingDefinition(name) {
    return ENCODING_DEFINITIONS[name];
}
/**
 * List all encoding names
 */
export function listEncodingNames() {
    return Object.keys(ENCODING_DEFINITIONS);
}
//# sourceMappingURL=encoding-definitions.js.map
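
A short sketch of the lookup helpers defined in this file; the values in the comments follow directly from the definitions above:

import { getEncodingDefinition, listEncodingNames, O200K_HARMONY } from "./encoding-definitions.js";

listEncodingNames();
// ["r50k_base", "p50k_base", "p50k_edit", "cl100k_base", "o200k_base", "o200k_harmony", "gpt2"]

// "gpt2" is an alias entry: it resolves to the same object as r50k_base.
getEncodingDefinition("gpt2") === getEncodingDefinition("r50k_base"); // true

// o200k_harmony: 15 named special tokens plus the generated
// <|reserved_200013|>..<|reserved_201087|> range (1,075 entries) = 1,090 total.
Object.keys(O200K_HARMONY.specialTokens).length; // 1090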

package/dist/core/encoding-definitions.js.map
@@ -0,0 +1 @@
{"version":3,"file":"encoding-definitions.js","sourceRoot":"","sources":["../../src/core/encoding-definitions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG;IAC5B,SAAS,EAAE,eAAe;IAC1B,UAAU,EAAE,gBAAgB;IAC5B,UAAU,EAAE,gBAAgB;IAC5B,UAAU,EAAE,gBAAgB;IAC5B,WAAW,EAAE,iBAAiB;CACtB,CAAC;AAyBX;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,6EAA6E;IAC7E,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;KAClC;IACD,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;KAClC;IACD,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;QACjC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;KACnC;IACD,6CAA6C;IAC7C,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,WAAW,GAAuB;IAC7C,IAAI,EAAE,aAAa;IACnB,4BAA4B;IAC5B,sHAAsH;IACtH,EAAE;IACF,6FAA6F;IAC7F,wEAAwE;IACxE,sDAAsD;IACtD,OAAO,EACL,mKAAmK;IACrK,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,MAAM;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,WAAW,CAAC,EAAE,MAAM;KACrC;IACD,aAAa,EACX,2EAA2E;IAC7E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;;;;;;;;;GAUG;AACH,MAAM,CAAC,MAAM,UAAU,GAAuB;IAC5C,IAAI,EAAE,YAAY;IAClB,+CAA+C;IAC/C,4GAA4G;IAC5G,6GAA6G;IAC7G,cAAc;IACd,iEAAiE;IACjE,cAAc;IACd,aAAa;IACb,OAAO;IACP,EAAE;IACF,mEAAmE;IACnE,OAAO,EAAE;QACP,qJAAqJ;QACrJ,qJAAqJ;QACrJ,aAAa;QACb,iCAAiC,EAAE,+BAA+B;QAClE,eAAe;QACf,aAAa;QACb,MAAM;KACP,CAAC,IAAI,CAAC,GAAG,CAAC;IACX,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,MAAM;QAClC,CAAC,cAAc,CAAC,WAAW,CAAC,EAAE,MAAM;KACrC;IACD,aAAa,EACX,0EAA0E;IAC5E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,aAAa,GAAuB;IAC/C,IAAI,EAAE,eAAe;IACrB,6BAA6B;IAC7B,OAAO,EAAE,UAAU,CAAC,OAAO;IAC3B,aAAa,EAAE;QACb,iBAAiB,EAAE,MAAM;QACzB,eAAe,EAAE,MAAM;QACvB,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,YAAY,EAAE,MAAM;QACpB,eAAe,EAAE,MAAM;QACvB,qBAAqB,EAAE,MAAM;QAC7B,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,MAAM;QACnB,SAAS,EAAE,MAAM;QACjB,aAAa,EAAE,MAAM;QACrB,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,UAAU,EAAE,MAAM;QAClB,kEAAkE;QAClE,GAAG,sBAAsB,CAAC,MAAM,EAAE,MAAM,CAAC;KAC1C;IACD,8CAA8C;IAC9C,aAAa,EACX,0EAA0E;IAC5E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;GAEG;AACH,SAAS,sBAAsB,CAC7B,KAAa,EACb,GAAW;IAEX,MAAM,MAAM,GAAyB,EAAE,CAAC;IACxC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,IAAI,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAClC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAuC;IACtE,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,SAAS;IACpB,WAAW,EAAE,WAAW;IACxB,UAAU,EAAE,UAAU;IACtB,aAAa,EAAE,aAAa;IAC5B,UAAU;IACV,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY;IAEZ,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB;IAC/B,OAAO,MAAM,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;AAC3C,CAAC"}

package/dist/core/index.d.ts
@@ -0,0 +1,12 @@
/**
 * Core Tiktoken Module
 *
 * This module provides the exact BPE implementation from tiktoken-rs,
 * ported to TypeScript for use in both Node.js and browser environments.
 */
export { type Rank, type Vocabulary, type ReverseVocabulary, bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, } from "./byte-pair-encoding.js";
export { parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, createSpecialTokenMaps, clearVocabularyCache, getVocabularyFromCache, isVocabularyCached, VOCABULARY_URLS, base64Encode, keyToBytes as vocabKeyToBytes, } from "./vocab-loader.js";
export { CoreBPE, DecodeKeyError } from "./tiktoken.js";
export { type EncodingDefinition, SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, } from "./encoding-definitions.js";
export { type TokenizerName, getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./model-to-encoding.js";
//# sourceMappingURL=index.d.ts.map

package/dist/core/index.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EACL,KAAK,IAAI,EACT,KAAK,UAAU,EACf,KAAK,iBAAiB,EACtB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,GACd,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,oBAAoB,EACpB,sBAAsB,EACtB,kBAAkB,EAClB,eAAe,EACf,YAAY,EACZ,UAAU,IAAI,eAAe,GAC9B,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAGxD,OAAO,EACL,KAAK,kBAAkB,EACvB,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,GAClB,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,aAAa,EAClB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,wBAAwB,CAAC"}

package/dist/core/index.js
@@ -0,0 +1,17 @@
/**
 * Core Tiktoken Module
 *
 * This module provides the exact BPE implementation from tiktoken-rs,
 * ported to TypeScript for use in both Node.js and browser environments.
 */
// Byte-Pair Encoding algorithm
export { bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, } from "./byte-pair-encoding.js";
// Vocabulary loading
export { parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, createSpecialTokenMaps, clearVocabularyCache, getVocabularyFromCache, isVocabularyCached, VOCABULARY_URLS, base64Encode, keyToBytes as vocabKeyToBytes, } from "./vocab-loader.js";
// Core BPE tokenizer
export { CoreBPE, DecodeKeyError } from "./tiktoken.js";
// Encoding definitions
export { SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, } from "./encoding-definitions.js";
// Model to encoding mappings
export { getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./model-to-encoding.js";
//# sourceMappingURL=index.js.map
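
One wrinkle in the barrel above: byte-pair-encoding.js and vocab-loader.js both export a keyToBytes, so the vocab-loader variant is re-exported as vocabKeyToBytes to avoid a duplicate export name. From the consumer side (a sketch; the specifier is shown relative to dist/core):

// Both helpers are reachable from the single barrel import.
import { keyToBytes, vocabKeyToBytes } from "./index.js";
// keyToBytes      -- the BPE helper from byte-pair-encoding.js
// vocabKeyToBytes -- the vocab-loader.js variant, renamed on re-export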

package/dist/core/index.js.map
@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,+BAA+B;AAC/B,OAAO,EAIL,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,GACd,MAAM,yBAAyB,CAAC;AAEjC,qBAAqB;AACrB,OAAO,EACL,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,oBAAoB,EACpB,sBAAsB,EACtB,kBAAkB,EAClB,eAAe,EACf,YAAY,EACZ,UAAU,IAAI,eAAe,GAC9B,MAAM,mBAAmB,CAAC;AAE3B,qBAAqB;AACrB,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAExD,uBAAuB;AACvB,OAAO,EAEL,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,GAClB,MAAM,2BAA2B,CAAC;AAEnC,6BAA6B;AAC7B,OAAO,EAEL,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,wBAAwB,CAAC"}

package/dist/core/model-to-encoding.d.ts
@@ -0,0 +1,36 @@
/**
 * Model to Encoding Mappings
 *
 * EXACT mappings from tiktoken-rs for model name to encoding.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
 */
/**
 * Tokenizer/Encoding enum values (matching tiktoken-rs)
 */
export type TokenizerName = "o200k_harmony" | "o200k_base" | "cl100k_base" | "p50k_base" | "r50k_base" | "p50k_edit" | "gpt2";
/**
 * Get the tokenizer/encoding for a model name
 *
 * This function matches the logic in tiktoken-rs get_tokenizer()
 *
 * @param modelName - The model name
 * @returns The tokenizer name, or undefined if not found
 */
export declare function getTokenizerForModel(modelName: string): TokenizerName | undefined;
/**
 * Context size mapping from tiktoken-rs model.rs
 *
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
 */
export declare function getContextSize(model: string): number;
/**
 * Extended context limits including specific model versions
 * from tiktoken-rs model.rs get_context_size match statement
 */
export declare const EXACT_CONTEXT_SIZES: Record<string, number>;
/**
 * Get context size with exact match support
 */
export declare function getExactContextSize(model: string): number;
//# sourceMappingURL=model-to-encoding.d.ts.map
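
The declarations above are enough to sketch typical usage. The values in the comments are what the tiktoken-rs mappings this file cites would produce; the implementation lives in model-to-encoding.js, whose hunk falls outside this excerpt:

import { getTokenizerForModel, getExactContextSize, type TokenizerName } from "./model-to-encoding.js";

const tok: TokenizerName | undefined = getTokenizerForModel("gpt-4o"); // expected: "o200k_base"
if (tok === undefined) {
    throw new Error("unmapped model name"); // the lookup is fallible by design
}
const ctx: number = getExactContextSize("gpt-4o"); // exact-match context window, else fallback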

package/dist/core/model-to-encoding.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"model-to-encoding.d.ts","sourceRoot":"","sources":["../../src/core/model-to-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH;;GAEG;AACH,MAAM,MAAM,aAAa,GACrB,eAAe,GACf,YAAY,GACZ,aAAa,GACb,WAAW,GACX,WAAW,GACX,WAAW,GACX,MAAM,CAAC;AA8FX;;;;;;;GAOG;AACH,wBAAgB,oBAAoB,CAClC,SAAS,EAAE,MAAM,GAChB,aAAa,GAAG,SAAS,CAe3B;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CA0FpD;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAoFtD,CAAC;AAEF;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CASzD"}