@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,49 @@
1
/**
 * Byte-Pair Encoding Core Algorithm
 *
 * This is an EXACT port of the BPE algorithm from tiktoken-rs.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
 *
 * The algorithm works as follows:
 * 1. Split input text into pieces using a regex pattern
 * 2. For each piece, convert to bytes and apply BPE merges
 * 3. Look up each merged byte sequence in the vocabulary to get token IDs
 */
/**
 * Token rank type (same as tiktoken-rs)
 */
export type Rank = number;
/**
 * Vocabulary type: maps byte sequences to ranks.
 * Keys are the string form produced by bytesToKey (one char per byte).
 */
export type Vocabulary = Map<string, Rank>;
/**
 * Reverse vocabulary: maps ranks back to their raw byte sequences
 */
export type ReverseVocabulary = Map<Rank, Uint8Array>;
/**
 * Convert a byte array to a string key for Map lookup.
 * This is necessary because JavaScript Maps use reference equality for arrays.
 *
 * @param bytes - Byte sequence to encode as a key
 * @returns String whose UTF-16 code units mirror the input bytes
 */
export declare function bytesToKey(bytes: Uint8Array): string;
/**
 * Convert a string key (as produced by bytesToKey) back to bytes
 *
 * @param key - Key string with one char per byte
 * @returns The decoded byte sequence
 */
export declare function keyToBytes(key: string): Uint8Array;
/**
 * Encode a byte sequence using BPE
 *
 * @param piece - The byte sequence to encode
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of token ranks
 * @throws Error when a byte or merged byte sequence is missing from the vocabulary
 */
export declare function bytePairEncode(piece: Uint8Array, ranks: Vocabulary): Rank[];
/**
 * Split a byte sequence into BPE tokens (returns byte slices, not ranks)
 *
 * @param piece - The byte sequence to split
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of byte slices, one per token
 */
export declare function bytePairSplit(piece: Uint8Array, ranks: Vocabulary): Uint8Array[];
//# sourceMappingURL=byte-pair-encoding.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"byte-pair-encoding.d.ts","sourceRoot":"","sources":["../../src/core/byte-pair-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH;;GAEG;AACH,MAAM,MAAM,IAAI,GAAG,MAAM,CAAC;AAE1B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;AAE3C;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;AAEtD;;;GAGG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAQpD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,CAMlD;AA8ED;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,UAAU,GAAG,IAAI,EAAE,CA2B3E;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAC3B,KAAK,EAAE,UAAU,EACjB,KAAK,EAAE,UAAU,GAChB,UAAU,EAAE,CAed"}
@@ -0,0 +1,154 @@
1
+ /**
2
+ * Byte-Pair Encoding Core Algorithm
3
+ *
4
+ * This is an EXACT port of the BPE algorithm from tiktoken-rs.
5
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/vendor_tiktoken.rs
6
+ *
7
+ * The algorithm works as follows:
8
+ * 1. Split input text into pieces using a regex pattern
9
+ * 2. For each piece, convert to bytes and apply BPE merges
10
+ * 3. Look up each merged byte sequence in the vocabulary to get token IDs
11
+ */
12
/**
 * Convert a byte array to a string key for Map lookup.
 * Needed because JavaScript Maps compare Uint8Array keys by reference,
 * so byte sequences are keyed by a string with one char per byte.
 *
 * @param bytes - Byte sequence to encode as a key
 * @returns String whose code units (each 0-255) mirror the input bytes
 */
export function bytesToKey(bytes) {
    // Map each byte to its single-char form, then join once.
    return Array.from(bytes, (byte) => String.fromCharCode(byte)).join("");
}
25
/**
 * Convert a string key (as produced by bytesToKey) back to bytes.
 *
 * @param key - Key string with one char per byte
 * @returns The decoded byte sequence
 */
export function keyToBytes(key) {
    // Index-based mapping over the string's code units; values above 255
    // are truncated by the Uint8Array, exactly as a manual assignment would be.
    return Uint8Array.from({ length: key.length }, (_, idx) => key.charCodeAt(idx));
}
35
/**
 * Internal BPE merge algorithm
 *
 * This is the core of the BPE algorithm. It finds the lowest-ranked pair
 * of adjacent bytes and merges them, repeating until no more merges are possible.
 *
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @param piece - The byte sequence to process
 * @returns Array of (start_index, rank) tuples representing the merge points;
 *          consecutive start indices delimit the final token boundaries, and
 *          the last entry is a sentinel at piece.length
 */
function _bytePairMerge(ranks, piece) {
    // This is a vector of (start, rank).
    // The rank is of the pair starting at position start.
    const parts = [];
    // Note that we hash bytes when indexing into `ranks`, not token pairs. As long as we train BPE
    // the way we currently do, this is equivalent. An easy way to break this would be to decouple
    // merge priority from token index or to prevent specific token merges.
    let minRank = [
        Number.MAX_SAFE_INTEGER,
        Number.MAX_SAFE_INTEGER,
    ];
    // Seed `parts` with every byte boundary and the rank of the two-byte
    // pair starting there (MAX_SAFE_INTEGER means "no merge possible"),
    // tracking the lowest-ranked pair as we go.
    for (let i = 0; i < piece.length - 1; i++) {
        const pairKey = bytesToKey(piece.slice(i, i + 2));
        const rank = ranks.get(pairKey) ?? Number.MAX_SAFE_INTEGER;
        if (rank < minRank[0]) {
            minRank = [rank, i];
        }
        parts.push([i, rank]);
    }
    // Sentinels for the last byte and the end of the piece, so getRank can
    // always look one boundary past a candidate pair.
    parts.push([piece.length - 1, Number.MAX_SAFE_INTEGER]);
    parts.push([piece.length, Number.MAX_SAFE_INTEGER]);
    const getRank = (parts, i) => {
        if (i + 3 < parts.length) {
            // Similar to `piece[i..i + 2]` above. The +3 is because we haven't yet deleted
            // parts[i + 1], see comment in the main loop.
            const start = parts[i][0];
            const end = parts[i + 3][0];
            const pairKey = bytesToKey(piece.slice(start, end));
            return ranks.get(pairKey) ?? Number.MAX_SAFE_INTEGER;
        }
        else {
            return Number.MAX_SAFE_INTEGER;
        }
    };
    // If you have n parts and m merges, this does O(mn) work.
    // We could do something with a heap and do O(m log n) work.
    // n is often very small so considerations like cache-locality outweigh the algorithmic
    // complexity downsides of the `parts` vector.
    while (minRank[0] !== Number.MAX_SAFE_INTEGER) {
        const i = minRank[1];
        // Update parts[i] and parts[i - 1] before removing parts[i + 1], since
        // `parts.splice(i + 1, 1)` will thrash the cache.
        if (i > 0) {
            parts[i - 1][1] = getRank(parts, i - 1);
        }
        parts[i][1] = getRank(parts, i);
        parts.splice(i + 1, 1);
        // Linear rescan for the next-lowest ranked pair (see complexity note above).
        minRank = [Number.MAX_SAFE_INTEGER, Number.MAX_SAFE_INTEGER];
        for (let j = 0; j < parts.length - 1; j++) {
            const rank = parts[j][1];
            if (rank < minRank[0]) {
                minRank = [rank, j];
            }
        }
    }
    return parts;
}
102
/**
 * Encode a byte sequence using BPE.
 *
 * @param piece - The byte sequence to encode
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of token ranks
 * @throws Error when a byte or merged byte sequence is absent from the vocabulary
 */
export function bytePairEncode(piece, ranks) {
    // Fast path: a single byte cannot be merged, so look it up directly.
    if (piece.length === 1) {
        const rank = ranks.get(bytesToKey(piece));
        if (rank === undefined) {
            throw new Error(`Unknown byte: ${piece[0]}`);
        }
        return [rank];
    }
    // Consecutive boundary entries delimit one merged token each.
    const boundaries = _bytePairMerge(ranks, piece);
    const tokens = [];
    for (let idx = 0; idx + 1 < boundaries.length; idx++) {
        const segment = piece.slice(boundaries[idx][0], boundaries[idx + 1][0]);
        const rank = ranks.get(bytesToKey(segment));
        if (rank === undefined) {
            throw new Error(`Unknown token: ${Array.from(segment).join(",")}`);
        }
        tokens.push(rank);
    }
    return tokens;
}
134
/**
 * Split a byte sequence into BPE tokens (returns byte slices, not ranks).
 *
 * @param piece - The byte sequence to split
 * @param ranks - The vocabulary mapping byte sequences to ranks
 * @returns Array of byte slices, one per token
 */
export function bytePairSplit(piece, ranks) {
    // Zero or one byte is already a single (possibly empty) token.
    if (piece.length <= 1) {
        return [piece];
    }
    const boundaries = _bytePairMerge(ranks, piece);
    const slices = [];
    for (let idx = 0; idx + 1 < boundaries.length; idx++) {
        slices.push(piece.slice(boundaries[idx][0], boundaries[idx + 1][0]));
    }
    return slices;
}
154
+ //# sourceMappingURL=byte-pair-encoding.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"byte-pair-encoding.js","sourceRoot":"","sources":["../../src/core/byte-pair-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAiBH;;;GAGG;AACH,MAAM,UAAU,UAAU,CAAC,KAAiB;IAC1C,4CAA4C;IAC5C,8CAA8C;IAC9C,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,cAAc,CACrB,KAAiB,EACjB,KAAiB;IAEjB,qCAAqC;IACrC,sDAAsD;IACtD,MAAM,KAAK,GAA0B,EAAE,CAAC;IAExC,+FAA+F;IAC/F,8FAA8F;IAC9F,uEAAuE;IACvE,IAAI,OAAO,GAAmB;QAC5B,MAAM,CAAC,gBAAgB;QACvB,MAAM,CAAC,gBAAgB;KACxB,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAClD,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,gBAAgB,CAAC;QAC3D,IAAI,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;YACtB,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IACxB,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,CAAC,KAA4B,EAAE,CAAS,EAAQ,EAAE;QAChE,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACzB,+EAA+E;YAC/E,8CAA8C;YAC9C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YACpD,OAAO,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,gBA
AgB,CAAC;QACvD,CAAC;aAAM,CAAC;YACN,OAAO,MAAM,CAAC,gBAAgB,CAAC;QACjC,CAAC;IACH,CAAC,CAAC;IAEF,0DAA0D;IAC1D,4DAA4D;IAC5D,uFAAuF;IACvF,8CAA8C;IAC9C,OAAO,OAAO,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,gBAAgB,EAAE,CAAC;QAC9C,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,uEAAuE;QACvE,kDAAkD;QAClD,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1C,CAAC;QACD,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAChC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAEvB,OAAO,GAAG,CAAC,MAAM,CAAC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;QAC7D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACzB,IAAI,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;gBACtB,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,cAAc,CAAC,KAAiB,EAAE,KAAiB;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,iBAAiB,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAC/C,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAE3C,+BAA+B;IAC/B,MAAM,MAAM,GAAW,EAAE,CAAC;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,kBAAkB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACxE,CAAC;QACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpB,
CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAC3B,KAAiB,EACjB,KAAiB;IAEjB,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAiB,EAAE,CAAC;IAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,95 @@
1
/**
 * Encoding Definitions
 *
 * EXACT definitions from tiktoken-rs for all supported encodings.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tiktoken_ext/openai_public.rs
 */
import type { Rank } from "./byte-pair-encoding.js";
/**
 * Special token constants shared across encodings
 */
export declare const SPECIAL_TOKENS: {
    readonly ENDOFTEXT: "<|endoftext|>";
    readonly FIM_PREFIX: "<|fim_prefix|>";
    readonly FIM_MIDDLE: "<|fim_middle|>";
    readonly FIM_SUFFIX: "<|fim_suffix|>";
    readonly ENDOFPROMPT: "<|endofprompt|>";
};
/**
 * Encoding definition interface
 */
export interface EncodingDefinition {
    /** Encoding name */
    name: string;
    /**
     * Regex pattern for splitting text into pieces.
     * Note: JavaScript regex patterns differ slightly from Rust
     * (inline `(?i:)` groups are expanded manually).
     */
    pattern: string;
    /** Special tokens with their ranks */
    specialTokens: Record<string, Rank>;
    /** URL to download the vocabulary file */
    vocabularyUrl: string;
    /**
     * Expected vocabulary size (for validation).
     * NOTE(review): the concrete definitions set this to the mergeable-rank
     * count, excluding special tokens — confirm against the loader's check.
     */
    expectedVocabSize: number;
}
/**
 * r50k_base encoding (GPT-2)
 *
 * Used by: GPT-3 davinci, curie, babbage, ada models
 * Vocabulary size: 50,257 tokens
 */
export declare const R50K_BASE: EncodingDefinition;
/**
 * p50k_base encoding
 *
 * Used by: Code models, text-davinci-002, text-davinci-003
 * Vocabulary size: 50,281 tokens
 */
export declare const P50K_BASE: EncodingDefinition;
/**
 * p50k_edit encoding
 *
 * Used by: Edit models like text-davinci-edit-001, code-davinci-edit-001
 * Same vocabulary as p50k_base but with FIM special tokens
 */
export declare const P50K_EDIT: EncodingDefinition;
/**
 * cl100k_base encoding
 *
 * Used by: ChatGPT models, GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 * Vocabulary size: 100,256 tokens
 */
export declare const CL100K_BASE: EncodingDefinition;
/**
 * o200k_base encoding
 *
 * Used by: GPT-4o, GPT-4.1, GPT-5, o-series models
 * Vocabulary size: 200,000 tokens
 *
 * This encoding has a much larger vocabulary and is optimized for:
 * - Better handling of non-English languages
 * - More efficient tokenization of code
 * - Better multimodal support
 */
export declare const O200K_BASE: EncodingDefinition;
/**
 * o200k_harmony encoding
 *
 * Used by: gpt-oss models (open source)
 * Same vocabulary as o200k_base but with additional special tokens
 */
export declare const O200K_HARMONY: EncodingDefinition;
/**
 * All encoding definitions, keyed by encoding name (plus aliases)
 */
export declare const ENCODING_DEFINITIONS: Record<string, EncodingDefinition>;
/**
 * Get encoding definition by name
 */
export declare function getEncodingDefinition(name: string): EncodingDefinition | undefined;
/**
 * List all encoding names
 */
export declare function listEncodingNames(): string[];
//# sourceMappingURL=encoding-definitions.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"encoding-definitions.d.ts","sourceRoot":"","sources":["../../src/core/encoding-definitions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,yBAAyB,CAAC;AAEpD;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;CAMjB,CAAC;AAEX;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,oBAAoB;IACpB,IAAI,EAAE,MAAM,CAAC;IAEb;;;OAGG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB,sCAAsC;IACtC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAEpC,0CAA0C;IAC1C,aAAa,EAAE,MAAM,CAAC;IAEtB,gDAAgD;IAChD,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAYvB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAWvB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,SAAS,EAAE,kBAevB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,WAAW,EAAE,kBAoBzB,CAAC;AAEF;;;;;;;;;;GAUG;AACH,eAAO,MAAM,UAAU,EAAE,kBA4BxB,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,aAAa,EAAE,kBA2B3B,CAAC;AAgBF;;GAEG;AACH,eAAO,MAAM,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE,kBAAkB,CASnE,CAAC;AAEF;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,GACX,kBAAkB,GAAG,SAAS,CAEhC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,IAAI,MAAM,EAAE,CAE5C"}
@@ -0,0 +1,202 @@
1
/**
 * Encoding Definitions
 *
 * EXACT definitions from tiktoken-rs for all supported encodings.
 * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tiktoken_ext/openai_public.rs
 */
/**
 * Special token constants shared across encodings
 */
export const SPECIAL_TOKENS = {
    ENDOFTEXT: "<|endoftext|>",
    FIM_PREFIX: "<|fim_prefix|>",
    FIM_MIDDLE: "<|fim_middle|>",
    FIM_SUFFIX: "<|fim_suffix|>",
    ENDOFPROMPT: "<|endofprompt|>",
};
/**
 * r50k_base encoding (GPT-2)
 *
 * Used by: GPT-3 davinci, curie, babbage, ada models
 * Vocabulary size: 50,257 tokens
 */
export const R50K_BASE = {
    name: "r50k_base",
    // Pattern from tiktoken-rs:
    // 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
    // NOTE(review): counts only the mergeable ranks in the .tiktoken file;
    // special tokens come on top (50256 ranks + <|endoftext|> = 50,257 total).
    expectedVocabSize: 50256,
};
/**
 * p50k_base encoding
 *
 * Used by: Code models, text-davinci-002, text-davinci-003
 * Vocabulary size: 50,281 tokens
 */
export const P50K_BASE = {
    name: "p50k_base",
    // Same pattern as r50k_base
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
    expectedVocabSize: 50280,
};
/**
 * p50k_edit encoding
 *
 * Used by: Edit models like text-davinci-edit-001, code-davinci-edit-001
 * Same vocabulary as p50k_base but with FIM special tokens
 */
export const P50K_EDIT = {
    name: "p50k_edit",
    // Same pattern as p50k_base
    pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 50256,
        [SPECIAL_TOKENS.FIM_PREFIX]: 50281,
        [SPECIAL_TOKENS.FIM_MIDDLE]: 50282,
        [SPECIAL_TOKENS.FIM_SUFFIX]: 50283,
    },
    // Uses the same vocabulary file as p50k_base
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
    expectedVocabSize: 50280,
};
/**
 * cl100k_base encoding
 *
 * Used by: ChatGPT models, GPT-4, GPT-3.5-turbo, text-embedding-ada-002
 * Vocabulary size: 100,256 tokens
 */
export const CL100K_BASE = {
    name: "cl100k_base",
    // Pattern from tiktoken-rs:
    // (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
    //
    // JavaScript doesn't support inline case-insensitive groups (?i:),
    // so the contraction alternatives are expanded into [sS]-style classes.
    pattern: "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 100257,
        [SPECIAL_TOKENS.FIM_PREFIX]: 100258,
        [SPECIAL_TOKENS.FIM_MIDDLE]: 100259,
        [SPECIAL_TOKENS.FIM_SUFFIX]: 100260,
        [SPECIAL_TOKENS.ENDOFPROMPT]: 100276,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
    expectedVocabSize: 100256,
};
/**
 * o200k_base encoding
 *
 * Used by: GPT-4o, GPT-4.1, GPT-5, o-series models
 * Vocabulary size: 200,000 tokens
 *
 * This encoding has a much larger vocabulary and is optimized for:
 * - Better handling of non-English languages
 * - More efficient tokenization of code
 * - Better multimodal support
 */
export const O200K_BASE = {
    name: "o200k_base",
    // Pattern from tiktoken-rs O200K_BASE_PAT_STR:
    // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?
    // |[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?
    // |\p{N}{1,3}
    // | ?[^\s\p{L}\p{N}]+[\r\n/]* <-- Note: includes forward slash
    // |\s*[\r\n]+
    // |\s+(?!\S)
    // |\s+
    //
    // JavaScript equivalent - expand the case-insensitive contractions
    pattern: [
        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
        "\\p{N}{1,3}",
        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*", // Note: includes forward slash
        "\\s*[\\r\\n]+",
        "\\s+(?!\\S)",
        "\\s+",
    ].join("|"),
    specialTokens: {
        [SPECIAL_TOKENS.ENDOFTEXT]: 199999,
        [SPECIAL_TOKENS.ENDOFPROMPT]: 200018,
    },
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
    expectedVocabSize: 199998,
};
/**
 * o200k_harmony encoding
 *
 * Used by: gpt-oss models (open source)
 * Same vocabulary as o200k_base but with additional special tokens
 * (harmony chat-format markers plus reserved slots).
 */
export const O200K_HARMONY = {
    name: "o200k_harmony",
    // Same pattern as o200k_base
    pattern: O200K_BASE.pattern,
    specialTokens: {
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
        // Reserved tokens from 200013 to 201087 are generated dynamically
        // (generateReservedTokens is hoisted, so this call is safe here).
        ...generateReservedTokens(200013, 201087),
    },
    // Uses the same vocabulary file as o200k_base
    vocabularyUrl: "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
    expectedVocabSize: 199998,
};
167
/**
 * Generate the run of `<|reserved_N|>` special tokens for every id in
 * [start, end] (inclusive); each token maps to its own numeric id.
 */
function generateReservedTokens(start, end) {
    const entries = [];
    for (let id = start; id <= end; id++) {
        entries.push([`<|reserved_${id}|>`, id]);
    }
    return Object.fromEntries(entries);
}
177
/**
 * All encoding definitions, keyed by encoding name.
 * Includes aliases (e.g. "gpt2" resolves to r50k_base).
 */
export const ENCODING_DEFINITIONS = {
    r50k_base: R50K_BASE,
    p50k_base: P50K_BASE,
    p50k_edit: P50K_EDIT,
    cl100k_base: CL100K_BASE,
    o200k_base: O200K_BASE,
    o200k_harmony: O200K_HARMONY,
    // Aliases
    gpt2: R50K_BASE,
};
190
/**
 * Get encoding definition by name (e.g. "cl100k_base" or the alias "gpt2").
 *
 * @param name - Encoding name or alias
 * @returns The matching definition, or undefined when the name is unknown
 */
export function getEncodingDefinition(name) {
    // Guard with hasOwnProperty: ENCODING_DEFINITIONS is a plain object, so a
    // bare bracket lookup would leak inherited Object.prototype members
    // (e.g. name === "toString" would return a function, not undefined).
    return Object.prototype.hasOwnProperty.call(ENCODING_DEFINITIONS, name)
        ? ENCODING_DEFINITIONS[name]
        : undefined;
}
196
/**
 * List all encoding names known to this module, including aliases.
 *
 * @returns Array of encoding-name keys from ENCODING_DEFINITIONS
 */
export function listEncodingNames() {
    const names = [];
    for (const encodingName of Object.keys(ENCODING_DEFINITIONS)) {
        names.push(encodingName);
    }
    return names;
}
202
+ //# sourceMappingURL=encoding-definitions.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"encoding-definitions.js","sourceRoot":"","sources":["../../src/core/encoding-definitions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG;IAC5B,SAAS,EAAE,eAAe;IAC1B,UAAU,EAAE,gBAAgB;IAC5B,UAAU,EAAE,gBAAgB;IAC5B,UAAU,EAAE,gBAAgB;IAC5B,WAAW,EAAE,iBAAiB;CACtB,CAAC;AAyBX;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,6EAA6E;IAC7E,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;KAClC;IACD,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;KAClC;IACD,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,IAAI,EAAE,WAAW;IACjB,4BAA4B;IAC5B,OAAO,EACL,oFAAoF;IACtF,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,KAAK;QACjC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,KAAK;KACnC;IACD,6CAA6C;IAC7C,aAAa,EACX,yEAAyE;IAC3E,iBAAiB,EAAE,KAAK;CACzB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,WAAW,GAAuB;IAC7C,IAAI,EAAE,aAAa;IACnB,4BAA4B;IAC5B,sHAAsH;IACtH,EAAE;IACF,6FAA6F;IAC7F,wEAAwE;IACxE,sDAAsD;IACtD,OAAO,EACL,mKAAmK;IACrK,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,MAAM;QAClC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,UAAU,CAAC,EAAE,MAAM;QACnC,CAAC,cAAc,CAAC,WAAW,CAAC,EAAE,MAAM;KACrC;IACD,aAAa,EACX,2EAA2E;IAC7E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;;;;;;;;;GAUG;AACH,MAAM,CAAC,MAAM,UAAU,GAAuB;IAC5C,IAAI,EAAE,YAAY;IAClB,+CAA+C;IAC/C,4GAA4G;IAC5G,6GAA6G;IAC7G,cAAc;IACd,iEAAiE;IACjE,cAAc;IACd,aAAa;IACb,OAAO;IACP,EAAE;IACF,mEAAmE;IACnE,OAAO,EAAE;QACP,qJAAqJ;QACrJ,qJAAqJ;QACrJ,aAAa;QACb,iCAAiC,EAAE,+BAA+B;QAClE,eAAe;QACf,aAAa;QACb,MAAM;KACP,CAAC,IAAI,CAAC,GAAG,CAAC;IACX,aAAa,EAAE;QACb,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,MAAM;QAClC,CAAC,cAAc,CAAC,WAAW,CAAC,EAAE,MAAM;KACrC;
IACD,aAAa,EACX,0EAA0E;IAC5E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,aAAa,GAAuB;IAC/C,IAAI,EAAE,eAAe;IACrB,6BAA6B;IAC7B,OAAO,EAAE,UAAU,CAAC,OAAO;IAC3B,aAAa,EAAE;QACb,iBAAiB,EAAE,MAAM;QACzB,eAAe,EAAE,MAAM;QACvB,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,YAAY,EAAE,MAAM;QACpB,eAAe,EAAE,MAAM;QACvB,qBAAqB,EAAE,MAAM;QAC7B,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,MAAM;QACnB,SAAS,EAAE,MAAM;QACjB,aAAa,EAAE,MAAM;QACrB,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,qBAAqB,EAAE,MAAM;QAC7B,UAAU,EAAE,MAAM;QAClB,kEAAkE;QAClE,GAAG,sBAAsB,CAAC,MAAM,EAAE,MAAM,CAAC;KAC1C;IACD,8CAA8C;IAC9C,aAAa,EACX,0EAA0E;IAC5E,iBAAiB,EAAE,MAAM;CAC1B,CAAC;AAEF;;GAEG;AACH,SAAS,sBAAsB,CAC7B,KAAa,EACb,GAAW;IAEX,MAAM,MAAM,GAAyB,EAAE,CAAC;IACxC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,IAAI,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAClC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAuC;IACtE,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,SAAS;IACpB,SAAS,EAAE,SAAS;IACpB,WAAW,EAAE,WAAW;IACxB,UAAU,EAAE,UAAU;IACtB,aAAa,EAAE,aAAa;IAC5B,UAAU;IACV,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY;IAEZ,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC;AACpC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB;IAC/B,OAAO,MAAM,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;AAC3C,CAAC"}
@@ -0,0 +1,12 @@
1
/**
 * Core Tiktoken Module
 *
 * This module provides the exact BPE implementation from tiktoken-rs,
 * ported to TypeScript for use in both Node.js and browser environments.
 * Re-exports only; no runtime logic of its own.
 */
// Byte-pair encoding algorithm and its public types
export { type Rank, type Vocabulary, type ReverseVocabulary, bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, } from "./byte-pair-encoding.js";
// Vocabulary loading, parsing, and caching helpers
export { parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, createSpecialTokenMaps, clearVocabularyCache, getVocabularyFromCache, isVocabularyCached, VOCABULARY_URLS, base64Encode, keyToBytes as vocabKeyToBytes, } from "./vocab-loader.js";
// Core BPE tokenizer class and its decode error type
export { CoreBPE, DecodeKeyError } from "./tiktoken.js";
// Encoding definitions (split patterns, special tokens, vocabulary URLs)
export { type EncodingDefinition, SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, } from "./encoding-definitions.js";
// Model-name to tokenizer / context-size mappings
export { type TokenizerName, getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./model-to-encoding.js";
//# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EACL,KAAK,IAAI,EACT,KAAK,UAAU,EACf,KAAK,iBAAiB,EACtB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,GACd,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,oBAAoB,EACpB,sBAAsB,EACtB,kBAAkB,EAClB,eAAe,EACf,YAAY,EACZ,UAAU,IAAI,eAAe,GAC9B,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAGxD,OAAO,EACL,KAAK,kBAAkB,EACvB,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,GAClB,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,aAAa,EAClB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,wBAAwB,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Core Tiktoken Module
3
+ *
4
+ * This module provides the exact BPE implementation from tiktoken-rs,
5
+ * ported to TypeScript for use in both Node.js and browser environments.
6
+ */
7
+ // Byte-Pair Encoding algorithm
8
+ export { bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, } from "./byte-pair-encoding.js";
9
+ // Vocabulary loading
10
+ export { parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, createSpecialTokenMaps, clearVocabularyCache, getVocabularyFromCache, isVocabularyCached, VOCABULARY_URLS, base64Encode, keyToBytes as vocabKeyToBytes, } from "./vocab-loader.js";
11
+ // Core BPE tokenizer
12
+ export { CoreBPE, DecodeKeyError } from "./tiktoken.js";
13
+ // Encoding definitions
14
+ export { SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, } from "./encoding-definitions.js";
15
+ // Model to encoding mappings
16
+ export { getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./model-to-encoding.js";
17
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,+BAA+B;AAC/B,OAAO,EAIL,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,GACd,MAAM,yBAAyB,CAAC;AAEjC,qBAAqB;AACrB,OAAO,EACL,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,oBAAoB,EACpB,sBAAsB,EACtB,kBAAkB,EAClB,eAAe,EACf,YAAY,EACZ,UAAU,IAAI,eAAe,GAC9B,MAAM,mBAAmB,CAAC;AAE3B,qBAAqB;AACrB,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAExD,uBAAuB;AACvB,OAAO,EAEL,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,GAClB,MAAM,2BAA2B,CAAC;AAEnC,6BAA6B;AAC7B,OAAO,EAEL,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,wBAAwB,CAAC"}
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Model to Encoding Mappings
3
+ *
4
+ * EXACT mappings from tiktoken-rs for model name to encoding.
5
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs
6
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
7
+ */
8
+ /**
9
+ * Tokenizer/Encoding enum values (matching tiktoken-rs)
10
+ */
11
+ export type TokenizerName = "o200k_harmony" | "o200k_base" | "cl100k_base" | "p50k_base" | "r50k_base" | "p50k_edit" | "gpt2";
12
+ /**
13
+ * Get the tokenizer/encoding for a model name
14
+ *
15
+ * This function matches the logic in tiktoken-rs get_tokenizer()
16
+ *
17
+ * @param modelName - The model name
18
+ * @returns The tokenizer name, or undefined if not found
19
+ */
20
+ export declare function getTokenizerForModel(modelName: string): TokenizerName | undefined;
21
+ /**
22
+ * Context size mapping from tiktoken-rs model.rs
23
+ *
24
+ * Reference: https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
25
+ */
26
+ export declare function getContextSize(model: string): number;
27
+ /**
28
+ * Extended context limits including specific model versions
29
+ * from tiktoken-rs model.rs get_context_size match statement
30
+ */
31
+ export declare const EXACT_CONTEXT_SIZES: Record<string, number>;
32
+ /**
33
+ * Get context size with exact match support
34
+ */
35
+ export declare function getExactContextSize(model: string): number;
36
+ //# sourceMappingURL=model-to-encoding.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"model-to-encoding.d.ts","sourceRoot":"","sources":["../../src/core/model-to-encoding.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH;;GAEG;AACH,MAAM,MAAM,aAAa,GACrB,eAAe,GACf,YAAY,GACZ,aAAa,GACb,WAAW,GACX,WAAW,GACX,WAAW,GACX,MAAM,CAAC;AA8FX;;;;;;;GAOG;AACH,wBAAgB,oBAAoB,CAClC,SAAS,EAAE,MAAM,GAChB,aAAa,GAAG,SAAS,CAe3B;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CA0FpD;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAoFtD,CAAC;AAEF;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CASzD"}