@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tokenizer Utilities
|
|
3
|
+
* Helper functions for tokenization
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Convert a string to UTF-8 bytes
|
|
7
|
+
*
|
|
8
|
+
* @param text - Input text
|
|
9
|
+
* @returns Uint8Array of UTF-8 bytes
|
|
10
|
+
*/
|
|
11
|
+
export declare function stringToBytes(text: string): Uint8Array;
|
|
12
|
+
/**
|
|
13
|
+
* Convert UTF-8 bytes to string
|
|
14
|
+
*
|
|
15
|
+
* @param bytes - UTF-8 bytes
|
|
16
|
+
* @returns Decoded string
|
|
17
|
+
*/
|
|
18
|
+
export declare function bytesToString(bytes: Uint8Array | number[]): string;
|
|
19
|
+
/**
|
|
20
|
+
* Convert a byte to its hexadecimal representation
|
|
21
|
+
*
|
|
22
|
+
* @param byte - Single byte value (0-255)
|
|
23
|
+
* @returns Two-character hex string
|
|
24
|
+
*/
|
|
25
|
+
export declare function byteToHex(byte: number): string;
|
|
26
|
+
/**
|
|
27
|
+
* Convert hexadecimal string to byte
|
|
28
|
+
*
|
|
29
|
+
* @param hex - Two-character hex string
|
|
30
|
+
* @returns Byte value (0-255)
|
|
31
|
+
*/
|
|
32
|
+
export declare function hexToByte(hex: string): number;
|
|
33
|
+
/**
|
|
34
|
+
* Check if a character is a whitespace character
|
|
35
|
+
*
|
|
36
|
+
* @param char - Single character
|
|
37
|
+
* @returns True if whitespace
|
|
38
|
+
*/
|
|
39
|
+
export declare function isWhitespace(char: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Check if a character is a letter
|
|
42
|
+
*
|
|
43
|
+
* @param char - Single character
|
|
44
|
+
* @returns True if letter
|
|
45
|
+
*/
|
|
46
|
+
export declare function isLetter(char: string): boolean;
|
|
47
|
+
/**
|
|
48
|
+
* Check if a character is a digit
|
|
49
|
+
*
|
|
50
|
+
* @param char - Single character
|
|
51
|
+
* @returns True if digit
|
|
52
|
+
*/
|
|
53
|
+
export declare function isDigit(char: string): boolean;
|
|
54
|
+
/**
|
|
55
|
+
* Check if a character is punctuation
|
|
56
|
+
*
|
|
57
|
+
* @param char - Single character
|
|
58
|
+
* @returns True if punctuation
|
|
59
|
+
*/
|
|
60
|
+
export declare function isPunctuation(char: string): boolean;
|
|
61
|
+
/**
|
|
62
|
+
* Check if a string contains only ASCII characters
|
|
63
|
+
*
|
|
64
|
+
* @param text - Input text
|
|
65
|
+
* @returns True if ASCII only
|
|
66
|
+
*/
|
|
67
|
+
export declare function isAscii(text: string): boolean;
|
|
68
|
+
/**
|
|
69
|
+
* Count the number of Unicode code points in a string
|
|
70
|
+
* Handles surrogate pairs correctly
|
|
71
|
+
*
|
|
72
|
+
* @param text - Input text
|
|
73
|
+
* @returns Number of code points
|
|
74
|
+
*/
|
|
75
|
+
export declare function countCodePoints(text: string): number;
|
|
76
|
+
/**
|
|
77
|
+
* Split text into words using Unicode-aware boundaries
|
|
78
|
+
*
|
|
79
|
+
* @param text - Input text
|
|
80
|
+
* @returns Array of words and whitespace
|
|
81
|
+
*/
|
|
82
|
+
export declare function splitIntoWords(text: string): string[];
|
|
83
|
+
/**
|
|
84
|
+
* Escape special regex characters in a string
|
|
85
|
+
*
|
|
86
|
+
* @param text - Input text
|
|
87
|
+
* @returns Escaped string safe for regex
|
|
88
|
+
*/
|
|
89
|
+
export declare function escapeRegex(text: string): string;
|
|
90
|
+
/**
|
|
91
|
+
* Clamp a number between min and max values
|
|
92
|
+
*
|
|
93
|
+
* @param value - Value to clamp
|
|
94
|
+
* @param min - Minimum value
|
|
95
|
+
* @param max - Maximum value
|
|
96
|
+
* @returns Clamped value
|
|
97
|
+
*/
|
|
98
|
+
export declare function clamp(value: number, min: number, max: number): number;
|
|
99
|
+
/**
|
|
100
|
+
* Calculate the percentage of a value
|
|
101
|
+
*
|
|
102
|
+
* @param value - Base value
|
|
103
|
+
* @param pct - Percentage expressed as a fraction (0-1)
|
|
104
|
+
* @returns Calculated percentage value
|
|
105
|
+
*/
|
|
106
|
+
export declare function percentage(value: number, pct: number): number;
|
|
107
|
+
/**
|
|
108
|
+
* Create a hash from a string (for vocabulary lookup)
|
|
109
|
+
* Simple FNV-1a hash for performance
|
|
110
|
+
*
|
|
111
|
+
* @param text - Input text
|
|
112
|
+
* @returns 32-bit hash value
|
|
113
|
+
*/
|
|
114
|
+
export declare function hashString(text: string): number;
|
|
115
|
+
/**
|
|
116
|
+
* Count occurrences of a substring in text
|
|
117
|
+
*
|
|
118
|
+
* @param text - Text to search in
|
|
119
|
+
* @param substring - Substring to count
|
|
120
|
+
* @returns Number of occurrences
|
|
121
|
+
*/
|
|
122
|
+
export declare function countOccurrences(text: string, substring: string): number;
|
|
123
|
+
/**
|
|
124
|
+
* Normalize whitespace in text (collapse multiple spaces)
|
|
125
|
+
*
|
|
126
|
+
* @param text - Input text
|
|
127
|
+
* @returns Text with normalized whitespace
|
|
128
|
+
*/
|
|
129
|
+
export declare function normalizeWhitespace(text: string): string;
|
|
130
|
+
/**
|
|
131
|
+
* Check if text contains CJK (Chinese/Japanese/Korean) characters
|
|
132
|
+
*
|
|
133
|
+
* @param text - Input text
|
|
134
|
+
* @returns True if contains CJK characters
|
|
135
|
+
*/
|
|
136
|
+
export declare function containsCJK(text: string): boolean;
|
|
137
|
+
/**
|
|
138
|
+
* Check if text contains emoji
|
|
139
|
+
*
|
|
140
|
+
* @param text - Input text
|
|
141
|
+
* @returns True if contains emoji
|
|
142
|
+
*/
|
|
143
|
+
export declare function containsEmoji(text: string): boolean;
|
|
144
|
+
/**
|
|
145
|
+
* Estimate token multiplier based on text characteristics
|
|
146
|
+
* Used for non-English text where token count can vary significantly
|
|
147
|
+
*
|
|
148
|
+
* @param text - Input text
|
|
149
|
+
* @returns Multiplier to apply to base token estimate
|
|
150
|
+
*/
|
|
151
|
+
export declare function getTextComplexityMultiplier(text: string): number;
|
|
152
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,UAAU,CAGtD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,EAAE,GAAG,MAAM,CAKlE;AAED;;;;;GAKG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE9C;AAED;;;;;GAKG;AACH,wBAAgB,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAE7C;AAED;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAElD;AAED;;;;;GAKG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAE9C;AAED;;;;;GAKG;AACH,wBAAgB,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAE7C;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAEnD;AAED;;;;;GAKG;AACH,wBAAgB,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAG7C;AAED;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGpD;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAyBrD;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD;AAED;;;;;;;GAOG;AACH,wBAAgB,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAErE;AAED;;;;;;GAMG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAE7D;AAED;;;;;;GAMG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAU/C;AAED;;;;;;GAMG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,CAYxE;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAExD;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAKjD;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAEnD;AAED;;;;;;GAMG;AACH,wBAAgB,2BAA2B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAmBhE"}
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tokenizer Utilities
|
|
3
|
+
* Helper functions for tokenization
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Encode a string as UTF-8.
 *
 * @param text - Text to encode
 * @returns Uint8Array holding the UTF-8 encoding of `text`
 */
export function stringToBytes(text) {
    return new TextEncoder().encode(text);
}
|
|
15
|
+
/**
 * Decode UTF-8 bytes into a string.
 *
 * The decoder is created with `fatal: false`, so malformed sequences are
 * replaced with U+FFFD rather than raising an error.
 *
 * @param bytes - UTF-8 bytes, either a Uint8Array or a plain array of byte values
 * @returns Decoded string
 */
export function bytesToString(bytes) {
    // Plain arrays are copied into a typed array; typed arrays are decoded as-is.
    const buffer = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
    const utf8 = new TextDecoder("utf-8", { fatal: false });
    return utf8.decode(buffer);
}
|
|
25
|
+
/**
 * Render a byte as lowercase hexadecimal, left-padded with "0" to two characters.
 *
 * @param byte - Single byte value (0-255)
 * @returns Two-character hex string
 */
export function byteToHex(byte) {
    const digits = byte.toString(16);
    return digits.padStart(2, "0");
}
|
|
34
|
+
/**
 * Parse a hexadecimal string into its numeric value.
 *
 * Parsing is delegated to `parseInt` with radix 16; no range or format
 * validation is performed here.
 *
 * @param hex - Two-character hex string
 * @returns Byte value (0-255)
 */
export function hexToByte(hex) {
    const HEX_RADIX = 16;
    return parseInt(hex, HEX_RADIX);
}
|
|
43
|
+
/**
 * Test whether a character is whitespace (regex class `\s`).
 *
 * @param char - Single character
 * @returns True if `char` contains a whitespace character
 */
export function isWhitespace(char) {
    const whitespacePattern = /\s/;
    return whitespacePattern.test(char);
}
|
|
52
|
+
/**
 * Test whether a character is a letter in any script (Unicode category `L`).
 *
 * @param char - Single character
 * @returns True if `char` contains a letter
 */
export function isLetter(char) {
    const letterPattern = /\p{L}/u;
    return letterPattern.test(char);
}
|
|
61
|
+
/**
 * Test whether a character is numeric (Unicode category `N`).
 *
 * The `N` category is broader than the decimal digits 0-9: it also covers
 * digits of other scripts and other numeric characters such as fractions.
 *
 * @param char - Single character
 * @returns True if `char` contains a numeric character
 */
export function isDigit(char) {
    const numberPattern = /\p{N}/u;
    return numberPattern.test(char);
}
|
|
70
|
+
/**
 * Test whether a character is punctuation (Unicode category `P`).
 *
 * @param char - Single character
 * @returns True if `char` contains a punctuation character
 */
export function isPunctuation(char) {
    const punctuationPattern = /\p{P}/u;
    return punctuationPattern.test(char);
}
|
|
79
|
+
/**
 * Test whether every character of a string is in the ASCII range U+0000-U+007F.
 *
 * The empty string counts as ASCII-only.
 *
 * @param text - Input text
 * @returns True if `text` is ASCII only
 */
export function isAscii(text) {
    // eslint-disable-next-line no-control-regex
    const asciiOnlyPattern = /^[\x00-\x7F]*$/;
    return asciiOnlyPattern.test(text);
}
|
|
89
|
+
/**
 * Count the Unicode code points in a string.
 *
 * String iteration yields whole code points, so a surrogate pair is counted
 * once rather than as two UTF-16 code units.
 *
 * @param text - Input text
 * @returns Number of code points
 */
export function countCodePoints(text) {
    let total = 0;
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for (const _codePoint of text) {
        total += 1;
    }
    return total;
}
|
|
100
|
+
/**
 * Split text into alternating runs of word and non-word characters.
 *
 * A character belongs to a word when it is a letter, a digit, or an
 * apostrophe ("'"). Everything else (whitespace, punctuation, symbols) is
 * grouped into the non-word runs. Joining the returned segments gives back
 * the original text.
 *
 * @param text - Input text
 * @returns Array of word and non-word segments, in order
 */
export function splitIntoWords(text) {
    const pieces = [];
    let buffer = "";
    let bufferIsWord = false;
    for (const ch of text) {
        const isWordChar = isLetter(ch) || isDigit(ch) || ch === "'";
        if (isWordChar === bufferIsWord) {
            // Same kind of run as before: keep accumulating.
            buffer += ch;
            continue;
        }
        // The run kind flipped: flush the finished run and start a new one.
        if (buffer !== "") {
            pieces.push(buffer);
        }
        buffer = ch;
        bufferIsWord = isWordChar;
    }
    if (buffer !== "") {
        pieces.push(buffer);
    }
    return pieces;
}
|
|
129
|
+
/**
 * Backslash-escape regex metacharacters so the text can be embedded in a
 * regular expression as a literal.
 *
 * @param text - Input text
 * @returns Escaped string safe for regex
 */
export function escapeRegex(text) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    // "$&" re-inserts the matched character after the added backslash.
    return text.replace(metaCharacters, "\\$&");
}
|
|
138
|
+
/**
 * Restrict a number to the inclusive range [min, max].
 *
 * @param value - Value to clamp
 * @param min - Minimum value
 * @param max - Maximum value
 * @returns Clamped value
 */
export function clamp(value, min, max) {
    const raisedToMin = Math.max(value, min);
    return Math.min(raisedToMin, max);
}
|
|
149
|
+
/**
 * Scale a value by a fraction and round to the nearest integer.
 *
 * @param value - Base value
 * @param pct - Percentage expressed as a fraction (0-1)
 * @returns `value * pct`, rounded with `Math.round`
 */
export function percentage(value, pct) {
    const scaled = value * pct;
    return Math.round(scaled);
}
|
|
159
|
+
/**
 * Hash a string with 32-bit FNV-1a (for vocabulary lookup).
 *
 * The hash is computed over the bytes returned by `stringToBytes(text)`.
 *
 * @param text - Input text
 * @returns 32-bit hash value as an unsigned integer
 */
export function hashString(text) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    const data = stringToBytes(text);
    let acc = FNV_OFFSET_BASIS;
    for (let i = 0; i < data.length; i++) {
        // FNV-1a: XOR the byte in first, then multiply (Math.imul keeps 32-bit wraparound).
        acc = Math.imul(acc ^ data[i], FNV_PRIME);
    }
    // Reinterpret the signed 32-bit result as unsigned.
    return acc >>> 0;
}
|
|
175
|
+
/**
 * Count non-overlapping occurrences of a substring, scanning left to right.
 *
 * An empty `substring` yields 0.
 *
 * @param text - Text to search in
 * @param substring - Substring to count
 * @returns Number of occurrences
 */
export function countOccurrences(text, substring) {
    if (!substring) {
        return 0;
    }
    const step = substring.length;
    let total = 0;
    // After each hit, resume the search just past the matched span.
    for (let at = text.indexOf(substring, 0); at !== -1; at = text.indexOf(substring, at + step)) {
        total += 1;
    }
    return total;
}
|
|
193
|
+
/**
 * Normalize whitespace: collapse every run of whitespace into a single space
 * and strip leading and trailing whitespace.
 *
 * @param text - Input text
 * @returns Text with normalized whitespace
 */
export function normalizeWhitespace(text) {
    const collapsed = text.replace(/\s+/g, " ");
    return collapsed.trim();
}
|
|
202
|
+
/**
 * Check whether text contains at least one CJK (Chinese/Japanese/Korean) character.
 *
 * The pattern covers the CJK Unified Ideographs ranges (BMP and supplementary
 * planes) plus U+3040-U+30FF (kana) and U+AC00-U+D7AF (Hangul syllables).
 *
 * @param text - Input text
 * @returns True if contains CJK characters
 */
export function containsCJK(text) {
    const cjkPattern = /[\u4e00-\u9fff\u3400-\u4dbf\u{20000}-\u{2a6df}\u{2a700}-\u{2b73f}\u{2b740}-\u{2b81f}\u{2b820}-\u{2ceaf}\u{2ceb0}-\u{2ebef}\u{30000}-\u{3134f}\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/u;
    return cjkPattern.test(text);
}
|
|
212
|
+
/**
 * Check if text contains emoji
 *
 * `\p{Emoji}` is deliberately not used: that property also includes the ASCII
 * digits, `#` and `*` (they are keycap bases), so plain text such as "123"
 * would be reported as containing emoji. Instead a match requires one of:
 *  - a character with `Emoji_Presentation` or `Extended_Pictographic`
 *    (pictographs, flags' regional indicators, skin-tone modifiers), or
 *  - a keycap sequence: `0-9`, `#` or `*`, an optional U+FE0F variation
 *    selector, then U+20E3 (combining enclosing keycap).
 *
 * @param text - Input text
 * @returns True if contains emoji
 */
export function containsEmoji(text) {
    const emojiPattern = /[\p{Emoji_Presentation}\p{Extended_Pictographic}]|[0-9#*]\uFE0F?\u20E3/u;
    return emojiPattern.test(text);
}
|
|
221
|
+
/**
 * Estimate a token-count multiplier from the kinds of characters in the text.
 *
 * Starts at 1.0 and multiplies in one factor per detected characteristic:
 * 1.5 if the text contains CJK characters, 1.2 if it contains emoji, and 1.1
 * if it is not pure ASCII. The factors stack when several apply.
 *
 * @param text - Input text
 * @returns Multiplier to apply to base token estimate
 */
export function getTextComplexityMultiplier(text) {
    // Each entry pairs "does this characteristic apply?" with its factor.
    // The order matches the order in which the factors are multiplied in.
    const adjustments = [
        [containsCJK(text), 1.5],
        [containsEmoji(text), 1.2],
        [!isAscii(text), 1.1],
    ];
    let factor = 1.0;
    for (const [applies, weight] of adjustments) {
        if (applies) {
            factor *= weight;
        }
    }
    return factor;
}
|
|
244
|
+
//# sourceMappingURL=utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAC;IAClC,OAAO,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,KAA4B;IACxD,MAAM,OAAO,GAAG,IAAI,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;IAC3D,OAAO,OAAO,CAAC,MAAM,CACnB,KAAK,YAAY,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,CAC5D,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,OAAO,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,SAAS,CAAC,GAAW;IACnC,OAAO,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzB,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,QAAQ,CAAC,IAAY;IACnC,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,OAAO,CAAC,IAAY;IAClC,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,OAAO,CAAC,IAAY;IAClC,4CAA4C;IAC5C,OAAO,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACrC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,sDAAsD;IACtD,OAAO,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC;AAC1B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,uDAAuD;IACvD,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,IAAI,KAAK,GAAG,CAAC;QAEnE,IAAI,UAAU,KAAK,MAAM,EAAE,CAAC;YAC1B,IAAI,OAAO,EAAE,CAAC;gBACZ,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACzB,CAAC;YACD,OAAO,GAAG,IAAI,CAAC;YACf,MAAM,GAAG,UAAU,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,IAAI,CAAC;QAClB,CAAC;IACH,CAAC;IAED,IAAI,OAAO,EAAE,CAAC;QACZ,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACzB,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;
AAED;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AACrD,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,KAAK,CAAC,KAAa,EAAE,GAAW,EAAE,GAAW;IAC3D,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;AAC7C,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,UAAU,CAAC,KAAa,EAAE,GAAW;IACnD,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC;AACjC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,IAAI,GAAG,UAAU,CAAC,CAAC,mBAAmB;IAC1C,MAAM,KAAK,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,IAAI,CAAC;QACb,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC,YAAY;IAClD,CAAC;IAED,OAAO,IAAI,KAAK,CAAC,CAAC,CAAC,sBAAsB;AAC3C,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,SAAiB;IAC9D,IAAI,CAAC,SAAS;QAAE,OAAO,CAAC,CAAC;IAEzB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;QAC7D,KAAK,EAAE,CAAC;QACR,QAAQ,IAAI,SAAS,CAAC,MAAM,CAAC;IAC/B,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AAC1C,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,4CAA4C;IAC5C,OAAO,wLAAwL,CAAC,IAAI,CAClM,IAAI,CACL,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,2BAA2B,CAAC,IAAY;IACtD,IAAI,UAAU,GAAG,GAAG,CAAC;IAErB,yDAAyD;IACzD,IAAI,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;QACtB,UAAU,IAAI,GAAG,CAAC;IACpB,CAAC;IAED,gCAAgC;IAChC,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,UAAU,IAAI,GAAG,CAAC;IACpB,CAAC;IAED,4CAA4C;IAC5C,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;QACnB,UAAU,IAAI,GAAG,CAAC;IACpB,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@hyvmind/tiktoken-ts",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"sideEffects": false,
|
|
5
|
+
"description": "A pure TypeScript implementation of OpenAI's tiktoken tokenizer, compatible with tiktoken-rs",
|
|
6
|
+
"author": "HyvMind",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"type": "module",
|
|
9
|
+
"main": "./dist/index.js",
|
|
10
|
+
"module": "./dist/index.js",
|
|
11
|
+
"types": "./dist/index.d.ts",
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"import": "./dist/index.js",
|
|
15
|
+
"types": "./dist/index.d.ts"
|
|
16
|
+
},
|
|
17
|
+
"./encodings": {
|
|
18
|
+
"import": "./dist/encodings/index.js",
|
|
19
|
+
"types": "./dist/encodings/index.d.ts"
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"files": [
|
|
23
|
+
"dist",
|
|
24
|
+
"README.md",
|
|
25
|
+
"LICENSE"
|
|
26
|
+
],
|
|
27
|
+
"keywords": [
|
|
28
|
+
"tiktoken",
|
|
29
|
+
"tokenizer",
|
|
30
|
+
"bpe",
|
|
31
|
+
"openai",
|
|
32
|
+
"gpt",
|
|
33
|
+
"gpt-4",
|
|
34
|
+
"gpt-4o",
|
|
35
|
+
"claude",
|
|
36
|
+
"llm",
|
|
37
|
+
"tokens",
|
|
38
|
+
"encoding"
|
|
39
|
+
],
|
|
40
|
+
"repository": {
|
|
41
|
+
"type": "git",
|
|
42
|
+
"url": "git+https://github.com/hyvmind-io/tiktoken-ts.git"
|
|
43
|
+
},
|
|
44
|
+
"bugs": {
|
|
45
|
+
"url": "https://github.com/hyvmind-io/tiktoken-ts/issues"
|
|
46
|
+
},
|
|
47
|
+
"homepage": "https://github.com/hyvmind-io/tiktoken-ts#readme",
|
|
48
|
+
"engines": {
|
|
49
|
+
"node": ">=18.0.0"
|
|
50
|
+
},
|
|
51
|
+
"devDependencies": {
|
|
52
|
+
"@eslint/js": "^9.39.2",
|
|
53
|
+
"@types/node": "^22.0.0",
|
|
54
|
+
"@typescript-eslint/eslint-plugin": "^8.0.0",
|
|
55
|
+
"@typescript-eslint/parser": "^8.0.0",
|
|
56
|
+
"@vitest/coverage-v8": "^2.0.0",
|
|
57
|
+
"eslint": "^9.0.0",
|
|
58
|
+
"eslint-config-prettier": "^9.0.0",
|
|
59
|
+
"prettier": "^3.0.0",
|
|
60
|
+
"typescript": "^5.6.0",
|
|
61
|
+
"typescript-eslint": "^8.54.0",
|
|
62
|
+
"vitest": "^2.0.0"
|
|
63
|
+
},
|
|
64
|
+
"scripts": {
|
|
65
|
+
"clean": "rm -rf dist",
|
|
66
|
+
"prebuild": "pnpm run clean",
|
|
67
|
+
"build": "tsc",
|
|
68
|
+
"dev": "tsc --watch",
|
|
69
|
+
"typecheck": "tsc --noEmit",
|
|
70
|
+
"test": "vitest run",
|
|
71
|
+
"test:watch": "vitest",
|
|
72
|
+
"test:coverage": "vitest run --coverage",
|
|
73
|
+
"lint": "eslint src --ext .ts",
|
|
74
|
+
"lint:fix": "eslint src --ext .ts --fix",
|
|
75
|
+
"format": "prettier --write \"src/**/*.ts\"",
|
|
76
|
+
"format:check": "prettier --check \"src/**/*.ts\""
|
|
77
|
+
}
|
|
78
|
+
}
|