@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* o200k_base Encoding
|
|
3
|
+
* Used by GPT-4o, GPT-4.1, GPT-5 models
|
|
4
|
+
*
|
|
5
|
+
* This encoding has:
|
|
6
|
+
* - 200,000 tokens (larger vocabulary)
|
|
7
|
+
* - Better handling of non-English languages
|
|
8
|
+
* - Improved efficiency for common patterns
|
|
9
|
+
* - Better support for multimodal inputs
|
|
10
|
+
*/
|
|
11
|
+
import { BPETokenizer } from "../bpe.js";
|
|
12
|
+
/**
 * Encoding name constant.
 */
export const ENCODING_NAME = "o200k_base";
/**
 * Create an o200k_base tokenizer instance.
 *
 * @returns Tokenizer object exposing encode/decode/countTokens backed by a
 *          BPETokenizer configured for o200k_base
 */
export function createO200kTokenizer() {
    const tokenizer = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) { return tokenizer.encode(text); },
        decode(tokens) { return tokenizer.decode(tokens); },
        countTokens(text) { return tokenizer.countTokens(text); },
    };
}
|
|
30
|
+
// Patterns that encode to exactly one token in o200k_base, in lookup order.
const SINGLE_TOKEN_PATTERNS = [
    // English contractions (identical to cl100k)
    "'s", "'t", "'re", "'ve", "'m", "'ll", "'d",
    // Frequent English words (o200k's larger vocabulary covers more of them)
    "the", "and", "is", "are", "was", "were", "will", "would", "could",
    "should", "have", "has", "had", "been", "being", "this", "that",
    "these", "those", "with", "from", "into",
    "because", // single token in o200k
    "however", "therefore", "although", "meanwhile",
    // Programming keywords
    "function", "const", "let", "var", "return", "if", "else", "while",
    "class", "interface", "type", "import", "export", "async", "await",
    "true", "false", "null", "undefined",
    // TypeScript/React identifiers (more are single tokens here)
    "React", "useState", "useEffect", "useCallback", "useMemo", "useRef",
    "useContext", "Component", "Fragment", "children", "props",
    // Common API vocabulary
    "request", "response", "error", "success", "message", "data",
    "status", "headers",
    // Operators and symbols
    "=>", "===", "!==", "&&", "||", "++", "--", "+=", "-=", "...",
    "?.", // optional chaining
    "??", // nullish coalescing
];
/**
 * Pre-computed token counts for common patterns.
 * o200k_base has a larger vocabulary, so more words are single tokens;
 * every pattern listed above therefore maps to a count of 1.
 */
export const COMMON_TOKEN_PATTERNS = Object.fromEntries(SINGLE_TOKEN_PATTERNS.map((pattern) => [pattern, 1]));
|
|
126
|
+
/**
 * Average characters per token for o200k_base.
 * Slightly higher than cl100k thanks to the larger vocabulary.
 */
export const AVERAGE_CHARS_PER_TOKEN = 4.0;
/**
 * Multipliers applied to raw token counts by content type.
 * o200k is generally more efficient across all content types.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    prose: 0.95,        // more efficient for English prose
    code: 0.8,          // even better for source code
    json: 0.7,          // very efficient for JSON
    markdown: 0.9,      // better for markdown
    html: 0.75,         // better for HTML
    math: 1.1,          // math notation still needs extra tokens
    multilingual: 0.85, // much better for non-English text
};
/**
 * Per-language efficiency factors for o200k_base, keyed by ISO 639-1 code.
 * The larger vocabulary handles more languages efficiently.
 */
export const LANGUAGE_EFFICIENCY = {
    en: 1.0,  // English (baseline)
    es: 1.05, // Spanish
    fr: 1.05, // French
    de: 1.08, // German (compound words)
    pt: 1.05, // Portuguese
    it: 1.05, // Italian
    nl: 1.08, // Dutch
    ru: 1.2,  // Russian (Cyrillic)
    ja: 1.5,  // Japanese (needs more tokens)
    zh: 1.4,  // Chinese (CJK characters)
    ko: 1.4,  // Korean (Hangul)
    ar: 1.3,  // Arabic
    hi: 1.3,  // Hindi
    vi: 1.15, // Vietnamese
    th: 1.3,  // Thai
};
|
|
165
|
+
/**
 * Estimate the token count of `text`, adjusted for its content type.
 *
 * @param text - Input text
 * @param contentType - Kind of content ("prose", "code", "json", ...);
 *                      unknown types fall back to the prose multiplier
 * @returns Estimated token count (always at least 1)
 */
export function estimateTokensForContent(text, contentType = "prose") {
    const { countTokens } = createO200kTokenizer();
    const rawCount = countTokens(text);
    const factor = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 0.95;
    const adjusted = Math.round(rawCount * factor);
    return adjusted < 1 ? 1 : adjusted;
}
|
|
178
|
+
/**
 * Estimate the token count of `text`, adjusted for its language.
 *
 * @param text - Input text
 * @param language - ISO 639-1 language code (case-insensitive); unknown
 *                   codes use a neutral factor of 1.0
 * @returns Estimated token count (always at least 1)
 */
export function estimateTokensForLanguage(text, language) {
    const { countTokens } = createO200kTokenizer();
    const rawCount = countTokens(text);
    const factor = LANGUAGE_EFFICIENCY[language.toLowerCase()] ?? 1.0;
    const adjusted = Math.round(rawCount * factor);
    return adjusted < 1 ? 1 : adjusted;
}
|
|
191
|
+
//# sourceMappingURL=o200k-base.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"o200k-base.js","sourceRoot":"","sources":["../../src/encodings/o200k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,YAAY,CAAC;AAExD;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IAClC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAA2B;IAC3D,gCAAgC;IAChC,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IAEP,iDAAiD;IACjD,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,EAAE,EAAE,CAAC;IACL,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,MAAM,EAAE,CAAC;IACT,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,OAAO,EAAE,CAAC,EAAE,wBAAwB;IACpC,OAAO,EAAE,CAAC;IACV,SAAS,EAAE,CAAC;IACZ,QAAQ,EAAE,CAAC;IACX,SAAS,EAAE,CAAC;IAEZ,uBAAuB;IACvB,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,MAAM,EAAE,CAAC;IACT,EAAE,EAAE,CAAC;IACL,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,SAAS,EAAE,CAAC;IACZ,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,MAAM,EAAE,CAAC;IACT,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,CAAC;IAEZ,qDAAqD;IACrD,KAAK,EAAE,CAAC;IACR,QAAQ,EAAE,CAAC;IACX,SAAS,EAAE,CAAC;IACZ,WAAW,EAAE,CAAC;IACd,OAAO,EAAE,CAAC;IACV,MAAM,EAAE,CAAC;IACT,UAAU,EAAE,CAAC;IACb,SAAS,EAAE,CAAC;IACZ,QAAQ,EAAE,CAAC;IACX,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IAER,mBAAmB;IACnB,OAAO
,EAAE,CAAC;IACV,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,OAAO,EAAE,CAAC;IACV,OAAO,EAAE,CAAC;IACV,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,OAAO,EAAE,CAAC;IAEV,iBAAiB;IACjB,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC,EAAE,oBAAoB;IAC7B,IAAI,EAAE,CAAC,EAAE,qBAAqB;CAC/B,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;GAGG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,IAAI,EAAE,6BAA6B;IAC1C,IAAI,EAAE,GAAG,EAAE,uBAAuB;IAClC,IAAI,EAAE,GAAG,EAAE,0BAA0B;IACrC,QAAQ,EAAE,GAAG,EAAE,sBAAsB;IACrC,IAAI,EAAE,IAAI,EAAE,kBAAkB;IAC9B,IAAI,EAAE,GAAG,EAAE,mCAAmC;IAC9C,YAAY,EAAE,IAAI,EAAE,8BAA8B;CACnD,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAA2B;IACzD,EAAE,EAAE,GAAG,EAAE,qBAAqB;IAC9B,EAAE,EAAE,IAAI,EAAE,UAAU;IACpB,EAAE,EAAE,IAAI,EAAE,SAAS;IACnB,EAAE,EAAE,IAAI,EAAE,0BAA0B;IACpC,EAAE,EAAE,IAAI,EAAE,aAAa;IACvB,EAAE,EAAE,IAAI,EAAE,UAAU;IACpB,EAAE,EAAE,IAAI,EAAE,QAAQ;IAClB,EAAE,EAAE,GAAG,EAAE,qBAAqB;IAC9B,EAAE,EAAE,GAAG,EAAE,+BAA+B;IACxC,EAAE,EAAE,GAAG,EAAE,2BAA2B;IACpC,EAAE,EAAE,GAAG,EAAE,kBAAkB;IAC3B,EAAE,EAAE,GAAG,EAAE,SAAS;IAClB,EAAE,EAAE,GAAG,EAAE,QAAQ;IACjB,EAAE,EAAE,IAAI,EAAE,aAAa;IACvB,EAAE,EAAE,GAAG,EAAE,OAAO;CACjB,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,oBAAoB,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC;IAEjE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,yBAAyB,CACvC,IAAY,EACZ,QAAgB;IAEhB,MAAM,SAAS,GAAG,oBAAoB,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,mBAAmB,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,IAAI,GAAG,CAAC;IAEtE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* p50k_base Encoding (Legacy)
|
|
3
|
+
* Used by older Codex models (code-davinci-002, code-cushman-001)
|
|
4
|
+
*
|
|
5
|
+
* This encoding has:
|
|
6
|
+
* - 50,257 tokens (smallest vocabulary)
|
|
7
|
+
* - Optimized for code completion
|
|
8
|
+
* - Less efficient for natural language
|
|
9
|
+
*
|
|
10
|
+
* @deprecated This encoding is used by deprecated models.
|
|
11
|
+
* Use o200k_base or cl100k_base for current models.
|
|
12
|
+
*/
|
|
13
|
+
import type { Tokenizer, EncodingName } from "../types.js";
|
|
14
|
+
/**
|
|
15
|
+
* Encoding name constant
|
|
16
|
+
*/
|
|
17
|
+
export declare const ENCODING_NAME: EncodingName;
|
|
18
|
+
/**
|
|
19
|
+
* Create a p50k_base tokenizer instance
|
|
20
|
+
*
|
|
21
|
+
* @returns Tokenizer instance
|
|
22
|
+
* @deprecated Use createO200kTokenizer or createCL100kTokenizer instead
|
|
23
|
+
*/
|
|
24
|
+
export declare function createP50kTokenizer(): Tokenizer;
|
|
25
|
+
/**
|
|
26
|
+
* Average characters per token for p50k_base
|
|
27
|
+
* Lower than newer encodings due to smaller vocabulary
|
|
28
|
+
*/
|
|
29
|
+
export declare const AVERAGE_CHARS_PER_TOKEN = 3.5;
|
|
30
|
+
/**
|
|
31
|
+
* Token estimation adjustments for different content types
|
|
32
|
+
* p50k was optimized primarily for code
|
|
33
|
+
*/
|
|
34
|
+
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
|
|
35
|
+
/**
|
|
36
|
+
* Get estimated token count with content-type awareness
|
|
37
|
+
*
|
|
38
|
+
* @param text - Input text
|
|
39
|
+
* @param contentType - Type of content
|
|
40
|
+
* @returns Estimated token count
|
|
41
|
+
* @deprecated Use o200k_base or cl100k_base for current models
|
|
42
|
+
*/
|
|
43
|
+
export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
|
|
44
|
+
//# sourceMappingURL=p50k-base.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"p50k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/p50k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA0B,CAAC;AAEvD;;;;;GAKG;AACH,wBAAgB,mBAAmB,IAAI,SAAS,CAS/C;AAED;;;GAGG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* p50k_base Encoding (Legacy)
|
|
3
|
+
* Used by older Codex models (code-davinci-002, code-cushman-001)
|
|
4
|
+
*
|
|
5
|
+
* This encoding has:
|
|
6
|
+
* - 50,257 tokens (smallest vocabulary)
|
|
7
|
+
* - Optimized for code completion
|
|
8
|
+
* - Less efficient for natural language
|
|
9
|
+
*
|
|
10
|
+
* @deprecated This encoding is used by deprecated models.
|
|
11
|
+
* Use o200k_base or cl100k_base for current models.
|
|
12
|
+
*/
|
|
13
|
+
import { BPETokenizer } from "../bpe.js";
|
|
14
|
+
/**
 * Encoding name constant.
 */
export const ENCODING_NAME = "p50k_base";
/**
 * Create a p50k_base tokenizer instance.
 *
 * @returns Tokenizer object exposing encode/decode/countTokens backed by a
 *          BPETokenizer configured for p50k_base
 * @deprecated Use createO200kTokenizer or createCL100kTokenizer instead
 */
export function createP50kTokenizer() {
    const tokenizer = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) { return tokenizer.encode(text); },
        decode(tokens) { return tokenizer.decode(tokens); },
        countTokens(text) { return tokenizer.countTokens(text); },
    };
}
|
|
33
|
+
/**
 * Average characters per token for p50k_base.
 * Lower than newer encodings because of the smaller vocabulary.
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.5;
/**
 * Multipliers applied to raw token counts by content type.
 * p50k was optimized primarily for code.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    prose: 1.15,    // less efficient on natural language
    code: 0.9,      // better on code, its primary use case
    json: 0.85,     // reasonable on JSON
    markdown: 1.05, // less efficient on markdown
    html: 0.9,      // reasonable on HTML
    math: 1.25,     // least efficient on math
};
|
|
50
|
+
/**
 * Estimate the token count of `text`, adjusted for its content type.
 *
 * @param text - Input text
 * @param contentType - Kind of content ("prose", "code", "json", ...);
 *                      unknown types fall back to the prose multiplier
 * @returns Estimated token count (always at least 1)
 * @deprecated Use o200k_base or cl100k_base for current models
 */
export function estimateTokensForContent(text, contentType = "prose") {
    const { countTokens } = createP50kTokenizer();
    const rawCount = countTokens(text);
    const factor = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 1.15;
    const adjusted = Math.round(rawCount * factor);
    return adjusted < 1 ? 1 : adjusted;
}
|
|
64
|
+
//# sourceMappingURL=p50k-base.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"p50k-base.js","sourceRoot":"","sources":["../../src/encodings/p50k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,WAAW,CAAC;AAEvD;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB;IACjC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;GAGG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,IAAI,EAAE,sCAAsC;IACnD,IAAI,EAAE,GAAG,EAAE,yCAAyC;IACpD,IAAI,EAAE,IAAI,EAAE,sBAAsB;IAClC,QAAQ,EAAE,IAAI,EAAE,8BAA8B;IAC9C,IAAI,EAAE,GAAG,EAAE,sBAAsB;IACjC,IAAI,EAAE,IAAI,EAAE,0BAA0B;CACvC,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,mBAAmB,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC;IAEjE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tiktoken-ts - A pure TypeScript implementation of OpenAI's tiktoken
|
|
3
|
+
*
|
|
4
|
+
* A self-contained TypeScript tokenizer for AI applications.
|
|
5
|
+
* Implements EXACT BPE (Byte-Pair Encoding) algorithm from tiktoken-rs.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*
|
|
9
|
+
* @example Exact BPE Tokenization (Async)
|
|
10
|
+
* ```typescript
|
|
11
|
+
* import { getEncodingAsync, countTokensAsync } from "tiktoken-ts";
|
|
12
|
+
*
|
|
13
|
+
* // Load encoding and tokenize
|
|
14
|
+
* const tiktoken = await getEncodingAsync("cl100k_base");
|
|
15
|
+
* const tokens = tiktoken.encode("Hello, world!");
|
|
16
|
+
* console.log(tokens); // [15496, 11, 1917, 0]
|
|
17
|
+
*
|
|
18
|
+
* // Decode back to text
|
|
19
|
+
* const text = tiktoken.decode(tokens);
|
|
20
|
+
* console.log(text); // "Hello, world!"
|
|
21
|
+
*
|
|
22
|
+
* // Count tokens
|
|
23
|
+
* const count = await countTokensAsync("Hello, world!", "cl100k_base");
|
|
24
|
+
* console.log(count); // 4
|
|
25
|
+
* ```
|
|
26
|
+
*
|
|
27
|
+
* @example Token Estimation (Sync, uses heuristics)
|
|
28
|
+
* ```typescript
|
|
29
|
+
* import {
|
|
30
|
+
* countTokens,
|
|
31
|
+
* estimateMaxTokens,
|
|
32
|
+
* getTokenEstimation,
|
|
33
|
+
* getEncodingForModelName,
|
|
34
|
+
* getModelContextLimit
|
|
35
|
+
* } from "tiktoken-ts";
|
|
36
|
+
*
|
|
37
|
+
* // Count tokens in text (estimation)
|
|
38
|
+
* const tokens = countTokens("Hello, world!", { model: "gpt-4o" });
|
|
39
|
+
*
|
|
40
|
+
* // Estimate safe max_tokens value
|
|
41
|
+
* const maxTokens = estimateMaxTokens(promptText, "gpt-5-nano", {
|
|
42
|
+
* desiredOutputTokens: 1000,
|
|
43
|
+
* safetyMargin: 0.1
|
|
44
|
+
* });
|
|
45
|
+
*
|
|
46
|
+
* // Get detailed estimation with warnings
|
|
47
|
+
* const estimation = getTokenEstimation(promptText, "gpt-4o");
|
|
48
|
+
* if (!estimation.fitsInContext) {
|
|
49
|
+
* console.warn(estimation.warning);
|
|
50
|
+
* }
|
|
51
|
+
* ```
|
|
52
|
+
*/
|
|
53
|
+
export type { EncodingName, ModelFamily, SpecialTokens, EncodingConfig, Tokenizer, ModelConfig, TokenEstimation, ChatMessage, TokenCountOptions, MaxTokensOptions, } from "./types.js";
|
|
54
|
+
export { type Rank, type Vocabulary, type ReverseVocabulary, bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, clearVocabularyCache, isVocabularyCached, VOCABULARY_URLS, CoreBPE, DecodeKeyError, type EncodingDefinition, SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, type TokenizerName, getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./core/index.js";
|
|
55
|
+
export { Tiktoken, getEncoding as getTiktoken, getEncodingForModel as getTiktokenForModel, getEncodingAsync, getEncodingForModelAsync, clearTiktokenCache, encodeAsync, decodeAsync, countTokensAsync, countTokensForModelAsync, } from "./tiktoken.js";
|
|
56
|
+
export { getEncoding, getEncodingForModelName, countTokens, countChatTokens, countPromptTokens, estimateMaxTokens, getTokenEstimation, getChatTokenEstimation, fitsInContext, truncateToTokenLimit, splitIntoChunks, clearTokenizerCache, } from "./tokenizer.js";
|
|
57
|
+
export { MODEL_CONFIGS, MODEL_ALIASES, getModelConfig, getEncodingForModel, getModelContextLimit, getModelMaxOutputTokens, getModelFamily, usesO200kEncoding, usesClaudeEstimation, isClaudeModel, listModels, listModelsByFamily, } from "./models.js";
|
|
58
|
+
export { BPETokenizer, DEFAULT_SPECIAL_TOKENS, CL100K_BASE_CONFIG, O200K_BASE_CONFIG, P50K_BASE_CONFIG, CLAUDE_ESTIMATION_CONFIG, CLAUDE_SAFETY_MULTIPLIER, getEncodingConfig, } from "./bpe.js";
|
|
59
|
+
export { createCL100kTokenizer, CL100K_ENCODING_NAME, CL100K_AVG_CHARS, estimateCL100kTokens, createO200kTokenizer, O200K_ENCODING_NAME, O200K_AVG_CHARS, O200K_LANGUAGE_EFFICIENCY, estimateO200kTokens, estimateO200kTokensForLanguage, createP50kTokenizer, P50K_ENCODING_NAME, P50K_AVG_CHARS, estimateP50kTokens, createClaudeEstimationTokenizer, CLAUDE_ENCODING_NAME, CLAUDE_AVG_CHARS, CLAUDE_VOCAB_SIZE, CLAUDE_SAFETY_MULT, estimateClaudeTokens, estimateClaudeTokensConservative, claudeIncludesSafetyMultiplier, } from "./encodings/index.js";
|
|
60
|
+
export { stringToBytes, bytesToString, countCodePoints, containsCJK, containsEmoji, isAscii, getTextComplexityMultiplier, normalizeWhitespace, } from "./utils.js";
|
|
61
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAMH,YAAY,EACV,YAAY,EACZ,WAAW,EACX,aAAa,EACb,cAAc,EACd,SAAS,EACT,WAAW,EACX,eAAe,EACf,WAAW,EACX,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,YAAY,CAAC;AAMpB,OAAO,EAEL,KAAK,IAAI,EACT,KAAK,UAAU,EACf,KAAK,iBAAiB,EAGtB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,EAGb,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,oBAAoB,EACpB,kBAAkB,EAClB,eAAe,EAGf,OAAO,EACP,cAAc,EAGd,KAAK,kBAAkB,EACvB,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,EAGjB,KAAK,aAAa,EAClB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,iBAAiB,CAAC;AAMzB,OAAO,EAEL,QAAQ,EAGR,WAAW,IAAI,WAAW,EAC1B,mBAAmB,IAAI,mBAAmB,EAC1C,gBAAgB,EAChB,wBAAwB,EACxB,kBAAkB,EAGlB,WAAW,EACX,WAAW,EACX,gBAAgB,EAChB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AAMvB,OAAO,EAEL,WAAW,EACX,uBAAuB,EAGvB,WAAW,EACX,eAAe,EACf,iBAAiB,EAGjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EAGtB,aAAa,EACb,oBAAoB,EACpB,eAAe,EACf,mBAAmB,GACpB,MAAM,gBAAgB,CAAC;AAMxB,OAAO,EACL,aAAa,EACb,aAAa,EACb,cAAc,EACd,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,cAAc,EACd,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,UAAU,EACV,kBAAkB,GACnB,MAAM,aAAa,CAAC;AAMrB,OAAO,EACL,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,wBAAwB,EACxB,wBAAwB,EACxB,iBAAiB,GAClB,MAAM,UAAU,CAAC;AAMlB,OAAO,EAEL,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,oBAAoB,EAGpB,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,EACzB,mBAAmB,EACnB,8BAA8B,EAG9B,mBAAmB,EACnB,kBAAkB,EAClB,cAAc,EACd,kBAAkB,EAGlB,+BAA+B,EAC/B,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,oBAAoB,EACpB,gCAAgC,EAChC,8BAA8B,GAC/B,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,aAAa,EACb,aAAa,EACb,eAAe,EACf,WAAW,EACX,aAAa,EACb,OAAO,EACP,2BAA2B,EAC3B,mBAAmB,GACpB,MAAM,YAAY,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tiktoken-ts - A pure TypeScript implementation of OpenAI's tiktoken
|
|
3
|
+
*
|
|
4
|
+
* A self-contained TypeScript tokenizer for AI applications.
|
|
5
|
+
* Implements EXACT BPE (Byte-Pair Encoding) algorithm from tiktoken-rs.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*
|
|
9
|
+
* @example Exact BPE Tokenization (Async)
|
|
10
|
+
* ```typescript
|
|
11
|
+
* import { getEncodingAsync, countTokensAsync } from "tiktoken-ts";
|
|
12
|
+
*
|
|
13
|
+
* // Load encoding and tokenize
|
|
14
|
+
* const tiktoken = await getEncodingAsync("cl100k_base");
|
|
15
|
+
* const tokens = tiktoken.encode("Hello, world!");
|
|
16
|
+
* console.log(tokens); // [15496, 11, 1917, 0]
|
|
17
|
+
*
|
|
18
|
+
* // Decode back to text
|
|
19
|
+
* const text = tiktoken.decode(tokens);
|
|
20
|
+
* console.log(text); // "Hello, world!"
|
|
21
|
+
*
|
|
22
|
+
* // Count tokens
|
|
23
|
+
* const count = await countTokensAsync("Hello, world!", "cl100k_base");
|
|
24
|
+
* console.log(count); // 4
|
|
25
|
+
* ```
|
|
26
|
+
*
|
|
27
|
+
* @example Token Estimation (Sync, uses heuristics)
|
|
28
|
+
* ```typescript
|
|
29
|
+
* import {
|
|
30
|
+
* countTokens,
|
|
31
|
+
* estimateMaxTokens,
|
|
32
|
+
* getTokenEstimation,
|
|
33
|
+
* getEncodingForModelName,
|
|
34
|
+
* getModelContextLimit
|
|
35
|
+
* } from "tiktoken-ts";
|
|
36
|
+
*
|
|
37
|
+
* // Count tokens in text (estimation)
|
|
38
|
+
* const tokens = countTokens("Hello, world!", { model: "gpt-4o" });
|
|
39
|
+
*
|
|
40
|
+
* // Estimate safe max_tokens value
|
|
41
|
+
* const maxTokens = estimateMaxTokens(promptText, "gpt-5-nano", {
|
|
42
|
+
* desiredOutputTokens: 1000,
|
|
43
|
+
* safetyMargin: 0.1
|
|
44
|
+
* });
|
|
45
|
+
*
|
|
46
|
+
* // Get detailed estimation with warnings
|
|
47
|
+
* const estimation = getTokenEstimation(promptText, "gpt-4o");
|
|
48
|
+
* if (!estimation.fitsInContext) {
|
|
49
|
+
* console.warn(estimation.warning);
|
|
50
|
+
* }
|
|
51
|
+
* ```
|
|
52
|
+
*/
|
|
53
|
+
// --- Core BPE implementation (exact tiktoken-rs port) -----------------------
export {
  // BPE algorithm
  bytesToKey,
  keyToBytes,
  bytePairEncode,
  bytePairSplit,
  // Vocabulary loading
  parseVocabulary,
  loadVocabularyFromUrl,
  loadVocabularyFromString,
  clearVocabularyCache,
  isVocabularyCached,
  VOCABULARY_URLS,
  // Core BPE tokenizer
  CoreBPE,
  DecodeKeyError,
  SPECIAL_TOKENS,
  R50K_BASE,
  P50K_BASE,
  P50K_EDIT,
  CL100K_BASE,
  O200K_BASE,
  O200K_HARMONY,
  ENCODING_DEFINITIONS,
  getEncodingDefinition,
  listEncodingNames,
  getTokenizerForModel,
  getContextSize,
  getExactContextSize,
  EXACT_CONTEXT_SIZES,
} from "./core/index.js";
// --- High-level Tiktoken API (async, requires vocabulary loading) -----------
export {
  // Tiktoken class
  Tiktoken,
  // Factory functions
  getEncoding as getTiktoken,
  getEncodingForModel as getTiktokenForModel,
  getEncodingAsync,
  getEncodingForModelAsync,
  clearTiktokenCache,
  // Async convenience functions
  encodeAsync,
  decodeAsync,
  countTokensAsync,
  countTokensForModelAsync,
} from "./tiktoken.js";
// --- Legacy/estimation API (sync, uses heuristics) --------------------------
export {
  // Factory functions
  getEncoding,
  getEncodingForModelName,
  // Token counting (estimation)
  countTokens,
  countChatTokens,
  countPromptTokens,
  // Max tokens estimation
  estimateMaxTokens,
  getTokenEstimation,
  getChatTokenEstimation,
  // Utility functions
  fitsInContext,
  truncateToTokenLimit,
  splitIntoChunks,
  clearTokenizerCache,
} from "./tokenizer.js";
// --- Model configuration ----------------------------------------------------
export {
  MODEL_CONFIGS,
  MODEL_ALIASES,
  getModelConfig,
  getEncodingForModel,
  getModelContextLimit,
  getModelMaxOutputTokens,
  getModelFamily,
  usesO200kEncoding,
  usesClaudeEstimation,
  isClaudeModel,
  listModels,
  listModelsByFamily,
} from "./models.js";
// --- Legacy BPE implementation (estimation-based) ---------------------------
export {
  BPETokenizer,
  DEFAULT_SPECIAL_TOKENS,
  CL100K_BASE_CONFIG,
  O200K_BASE_CONFIG,
  P50K_BASE_CONFIG,
  CLAUDE_ESTIMATION_CONFIG,
  CLAUDE_SAFETY_MULTIPLIER,
  getEncodingConfig,
} from "./bpe.js";
// --- Encoding-specific exports ----------------------------------------------
export {
  // cl100k_base (GPT-4, GPT-3.5-turbo)
  createCL100kTokenizer, CL100K_ENCODING_NAME, CL100K_AVG_CHARS, estimateCL100kTokens,
  // o200k_base (GPT-4o, GPT-4.1, GPT-5)
  createO200kTokenizer, O200K_ENCODING_NAME, O200K_AVG_CHARS, O200K_LANGUAGE_EFFICIENCY, estimateO200kTokens, estimateO200kTokensForLanguage,
  // p50k_base (legacy Codex)
  createP50kTokenizer, P50K_ENCODING_NAME, P50K_AVG_CHARS, estimateP50kTokens,
  // claude_estimation (Anthropic Claude — estimation only, with safety multiplier)
  createClaudeEstimationTokenizer, CLAUDE_ENCODING_NAME, CLAUDE_AVG_CHARS, CLAUDE_VOCAB_SIZE, CLAUDE_SAFETY_MULT, estimateClaudeTokens, estimateClaudeTokensConservative, claudeIncludesSafetyMultiplier,
} from "./encodings/index.js";
// --- Text utilities -----------------------------------------------------------
export {
  stringToBytes,
  bytesToString,
  countCodePoints,
  containsCJK,
  containsEmoji,
  isAscii,
  getTextComplexityMultiplier,
  normalizeWhitespace,
} from "./utils.js";
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAmBH,gFAAgF;AAChF,mDAAmD;AACnD,gFAAgF;AAEhF,OAAO;AAML,gBAAgB;AAChB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa;AAEb,qBAAqB;AACrB,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,oBAAoB,EACpB,kBAAkB,EAClB,eAAe;AAEf,qBAAqB;AACrB,OAAO,EACP,cAAc,EAId,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,EAIjB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,iBAAiB,CAAC;AAEzB,gFAAgF;AAChF,+DAA+D;AAC/D,gFAAgF;AAEhF,OAAO;AACL,iBAAiB;AACjB,QAAQ;AAER,oBAAoB;AACpB,WAAW,IAAI,WAAW,EAC1B,mBAAmB,IAAI,mBAAmB,EAC1C,gBAAgB,EAChB,wBAAwB,EACxB,kBAAkB;AAElB,8BAA8B;AAC9B,WAAW,EACX,WAAW,EACX,gBAAgB,EAChB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AAEvB,gFAAgF;AAChF,gDAAgD;AAChD,gFAAgF;AAEhF,OAAO;AACL,oBAAoB;AACpB,WAAW,EACX,uBAAuB;AAEvB,8BAA8B;AAC9B,WAAW,EACX,eAAe,EACf,iBAAiB;AAEjB,wBAAwB;AACxB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB;AAEtB,oBAAoB;AACpB,aAAa,EACb,oBAAoB,EACpB,eAAe,EACf,mBAAmB,GACpB,MAAM,gBAAgB,CAAC;AAExB,gFAAgF;AAChF,sBAAsB;AACtB,gFAAgF;AAEhF,OAAO,EACL,aAAa,EACb,aAAa,EACb,cAAc,EACd,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,cAAc,EACd,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,UAAU,EACV,kBAAkB,GACnB,MAAM,aAAa,CAAC;AAErB,gFAAgF;AAChF,+CAA+C;AAC/C,gFAAgF;AAEhF,OAAO,EACL,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,wBAAwB,EACxB,wBAAwB,EACxB,iBAAiB,GAClB,MAAM,UAAU,CAAC;AAElB,gFAAgF;AAChF,4BAA4B;AAC5B,gFAAgF;AAEhF,OAAO;AACL,qCAAqC;AACrC,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,oBAAoB;AAEpB,sCAAsC;AACtC,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,EACzB,mBAAmB,EACnB,8BAA8B;AAE9B,2BAA2B;AAC3B,mBAAmB,EACnB,kBAAkB,EAClB,cAAc,EACd,kBAAkB;AAElB,gFAAgF;AAChF,+BAA+B,EAC/B,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,oBAAoB,EACpB,gCAAgC,EAChC,8BAA8B,GAC/B,MAAM,sBAAsB,CAAC;AAE9B,gFAAgF;AAChF,oBAAoB;AACpB,gFAAgF;AAEhF,OAAO,EACL,aAAa,EACb,aAAa,EACb,eAAe,EACf,WAAW,EACX,aAAa,EACb,OAAO,EACP,2BAA2B,EAC3B,mBAAmB,GACpB
,MAAM,YAAY,CAAC"}
|
package/dist/models.d.ts
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Configuration
|
|
3
|
+
* Maps model names to encodings and context limits
|
|
4
|
+
*
|
|
5
|
+
* Context limits are synced with tiktoken-rs:
|
|
6
|
+
* https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
|
|
7
|
+
*/
|
|
8
|
+
import type { EncodingName, ModelConfig, ModelFamily } from "./types.js";
|
|
9
|
+
/**
|
|
10
|
+
* Model context limits and configurations
|
|
11
|
+
* Context limits from tiktoken-rs, max output tokens from OpenAI docs
|
|
12
|
+
*/
|
|
13
|
+
export declare const MODEL_CONFIGS: Record<string, ModelConfig>;
|
|
14
|
+
/**
|
|
15
|
+
* Model name aliases for flexibility
|
|
16
|
+
*/
|
|
17
|
+
export declare const MODEL_ALIASES: Record<string, string>;
|
|
18
|
+
/**
|
|
19
|
+
* Get model configuration by name
|
|
20
|
+
* Handles aliases and partial matches
|
|
21
|
+
*
|
|
22
|
+
* @param modelName - Model name or alias
|
|
23
|
+
* @returns Model configuration or undefined
|
|
24
|
+
*/
|
|
25
|
+
export declare function getModelConfig(modelName: string): ModelConfig | undefined;
|
|
26
|
+
/**
|
|
27
|
+
* Get encoding name for a model
|
|
28
|
+
*
|
|
29
|
+
* @param modelName - Model name
|
|
30
|
+
* @returns Encoding name, defaults to o200k_base for unknown models
|
|
31
|
+
*/
|
|
32
|
+
export declare function getEncodingForModel(modelName: string): EncodingName;
|
|
33
|
+
/**
|
|
34
|
+
* Get context limit for a model
|
|
35
|
+
*
|
|
36
|
+
* @param modelName - Model name
|
|
37
|
+
* @returns Context limit in tokens
|
|
38
|
+
*/
|
|
39
|
+
export declare function getModelContextLimit(modelName: string): number;
|
|
40
|
+
/**
|
|
41
|
+
* Get max output tokens for a model
|
|
42
|
+
*
|
|
43
|
+
* @param modelName - Model name
|
|
44
|
+
* @returns Max output tokens
|
|
45
|
+
*/
|
|
46
|
+
export declare function getModelMaxOutputTokens(modelName: string): number;
|
|
47
|
+
/**
|
|
48
|
+
* Get model family
|
|
49
|
+
*
|
|
50
|
+
* @param modelName - Model name
|
|
51
|
+
* @returns Model family classification
|
|
52
|
+
*/
|
|
53
|
+
export declare function getModelFamily(modelName: string): ModelFamily;
|
|
54
|
+
/**
|
|
55
|
+
* Check if a model uses the newer o200k_base encoding
|
|
56
|
+
*
|
|
57
|
+
* @param modelName - Model name
|
|
58
|
+
* @returns True if model uses o200k_base
|
|
59
|
+
*/
|
|
60
|
+
export declare function usesO200kEncoding(modelName: string): boolean;
|
|
61
|
+
/**
|
|
62
|
+
* Check if a model uses the Claude estimation encoding
|
|
63
|
+
*
|
|
64
|
+
* Claude models use a proprietary tokenizer. This encoding provides
|
|
65
|
+
* "safe" estimates that intentionally over-count tokens to prevent
|
|
66
|
+
* API truncation issues.
|
|
67
|
+
*
|
|
68
|
+
* @param modelName - Model name
|
|
69
|
+
* @returns True if model uses claude_estimation
|
|
70
|
+
*/
|
|
71
|
+
export declare function usesClaudeEstimation(modelName: string): boolean;
|
|
72
|
+
/**
|
|
73
|
+
* Check if a model is a Claude model
|
|
74
|
+
*
|
|
75
|
+
* @param modelName - Model name
|
|
76
|
+
* @returns True if model is from Anthropic Claude family
|
|
77
|
+
*/
|
|
78
|
+
export declare function isClaudeModel(modelName: string): boolean;
|
|
79
|
+
/**
|
|
80
|
+
* List all known models
|
|
81
|
+
*
|
|
82
|
+
* @returns Array of model names
|
|
83
|
+
*/
|
|
84
|
+
export declare function listModels(): string[];
|
|
85
|
+
/**
|
|
86
|
+
* List models by family
|
|
87
|
+
*
|
|
88
|
+
* @param family - Model family to filter
|
|
89
|
+
* @returns Array of model configurations
|
|
90
|
+
*/
|
|
91
|
+
export declare function listModelsByFamily(family: ModelFamily): ModelConfig[];
|
|
92
|
+
//# sourceMappingURL=models.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAsBzE;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CA2brD,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgEhD,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS,CAsBzE;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,GAAG,YAAY,CAGnE;AAED;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAG9D;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAGjE;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,WAAW,CAG7D;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAE5D;AAED;;;;;;;;;GASG;AACH,wBAAgB,oBAAoB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAE/D;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAGxD;AAED;;;;GAIG;AACH,wBAAgB,UAAU,IAAI,MAAM,EAAE,CAErC;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,WAAW,GAAG,WAAW,EAAE,CAIrE"}
|