@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,191 @@
1
+ /**
2
+ * o200k_base Encoding
3
+ * Used by GPT-4o, GPT-4.1, GPT-5 models
4
+ *
5
+ * This encoding has:
6
+ * - 200,000 tokens (larger vocabulary)
7
+ * - Better handling of non-English languages
8
+ * - Improved efficiency for common patterns
9
+ * - Better support for multimodal inputs
10
+ */
11
+ import { BPETokenizer } from "../bpe.js";
12
/**
 * Canonical name of this encoding.
 */
export const ENCODING_NAME = "o200k_base";
/**
 * Build a tokenizer bound to the o200k_base encoding, backed by the
 * shared BPE implementation.
 *
 * @returns Tokenizer exposing encode/decode/countTokens for o200k_base
 */
export function createO200kTokenizer() {
    const engine = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) {
            return engine.encode(text);
        },
        decode(tokens) {
            return engine.decode(tokens);
        },
        countTokens(text) {
            return engine.countTokens(text);
        },
    };
}
30
/**
 * Patterns that o200k_base encodes as exactly one token.
 * The larger vocabulary makes more words single tokens than cl100k.
 */
const SINGLE_TOKEN_PATTERNS = [
    // Contractions (same as cl100k)
    "'s", "'t", "'re", "'ve", "'m", "'ll", "'d",
    // Common words (more are single tokens in o200k)
    "the", "and", "is", "are", "was", "were", "will", "would", "could",
    "should", "have", "has", "had", "been", "being", "this", "that",
    "these", "those", "with", "from", "into",
    "because", // Single token in o200k
    "however", "therefore", "although", "meanwhile",
    // Programming keywords
    "function", "const", "let", "var", "return", "if", "else", "while",
    "class", "interface", "type", "import", "export", "async", "await",
    "true", "false", "null", "undefined",
    // TypeScript/React keywords (more are single tokens)
    "React", "useState", "useEffect", "useCallback", "useMemo", "useRef",
    "useContext", "Component", "Fragment", "children", "props",
    // Common API terms
    "request", "response", "error", "success", "message", "data",
    "status", "headers",
    // Common symbols
    "=>", "===", "!==", "&&", "||", "++", "--", "+=", "-=", "...",
    "?.", // Optional chaining
    "??", // Nullish coalescing
];
/**
 * Pre-computed token counts for common patterns.
 * Every listed pattern costs exactly 1 token under o200k_base.
 */
export const COMMON_TOKEN_PATTERNS = Object.fromEntries(
    SINGLE_TOKEN_PATTERNS.map((pattern) => [pattern, 1]),
);
126
/**
 * Mean characters-per-token ratio used for o200k_base estimates.
 * Slightly higher than cl100k because the larger vocabulary packs
 * more characters into each token.
 */
export const AVERAGE_CHARS_PER_TOKEN = 4.0;
131
/**
 * Token estimation adjustments for different content types.
 * o200k is generally more efficient than cl100k across all types,
 * so most factors sit below 1.0.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    // English prose: slightly more efficient than the baseline estimate.
    prose: 0.95,
    // Source code compresses even better.
    code: 0.8,
    // JSON is the most token-efficient category here.
    json: 0.7,
    // Markdown benefits modestly.
    markdown: 0.9,
    // HTML markup compresses well.
    html: 0.75,
    // Mathematical text still needs more tokens than plain prose.
    math: 1.1,
    // Non-English text fares much better than under cl100k.
    multilingual: 0.85,
};
144
/**
 * Language-specific efficiency factors for o200k_base.
 * Multipliers are relative to English (1.0); the larger vocabulary keeps
 * most European languages close to the baseline.
 */
export const LANGUAGE_EFFICIENCY = {
    // English is the baseline all other factors are measured against.
    en: 1.0,
    // Romance languages tokenize almost as efficiently as English.
    es: 1.05,
    fr: 1.05,
    // German pays a small premium for compound words.
    de: 1.08,
    pt: 1.05,
    it: 1.05,
    // Dutch trends toward longer compounds, like German.
    nl: 1.08,
    // Cyrillic script needs noticeably more tokens.
    ru: 1.2,
    // CJK scripts carry the highest per-character token cost.
    ja: 1.5,
    zh: 1.4,
    ko: 1.4,
    // Arabic and Hindi scripts also carry extra cost.
    ar: 1.3,
    hi: 1.3,
    vi: 1.15,
    // Thai script is comparably expensive.
    th: 1.3,
};
165
/**
 * Get estimated token count with content-type awareness.
 *
 * @param text - Input text
 * @param contentType - Type of content (defaults to "prose")
 * @returns Estimated token count, never less than 1
 */
export function estimateTokensForContent(text, contentType = "prose") {
    const rawCount = createO200kTokenizer().countTokens(text);
    // Unknown content types fall back to the prose factor (0.95).
    const factor = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 0.95;
    const scaled = Math.round(rawCount * factor);
    return scaled < 1 ? 1 : scaled;
}
178
/**
 * Get estimated token count with language awareness.
 *
 * @param text - Input text
 * @param language - ISO 639-1 language code (matched case-insensitively)
 * @returns Estimated token count, never less than 1
 */
export function estimateTokensForLanguage(text, language) {
    const rawCount = createO200kTokenizer().countTokens(text);
    // Languages without an entry are assumed to behave like English (1.0).
    const factor = LANGUAGE_EFFICIENCY[language.toLowerCase()] ?? 1.0;
    const scaled = Math.round(rawCount * factor);
    return scaled < 1 ? 1 : scaled;
}
191
+ //# sourceMappingURL=o200k-base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"o200k-base.js","sourceRoot":"","sources":["../../src/encodings/o200k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,YAAY,CAAC;AAExD;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IAClC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAA2B;IAC3D,gCAAgC;IAChC,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IAEP,iDAAiD;IACjD,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,EAAE,EAAE,CAAC;IACL,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,MAAM,EAAE,CAAC;IACT,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,OAAO,EAAE,CAAC,EAAE,wBAAwB;IACpC,OAAO,EAAE,CAAC;IACV,SAAS,EAAE,CAAC;IACZ,QAAQ,EAAE,CAAC;IACX,SAAS,EAAE,CAAC;IAEZ,uBAAuB;IACvB,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,MAAM,EAAE,CAAC;IACT,EAAE,EAAE,CAAC;IACL,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,SAAS,EAAE,CAAC;IACZ,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,MAAM,EAAE,CAAC;IACT,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,CAAC;IAEZ,qDAAqD;IACrD,KAAK,EAAE,CAAC;IACR,QAAQ,EAAE,CAAC;IACX,SAAS,EAAE,CAAC;IACZ,WAAW,EAAE,CAAC;IACd,OAAO,EAAE,CAAC;IACV,MAAM,EAAE,CAAC;IACT,UAAU,EAAE,CAAC;IACb,SAAS,EAAE,CAAC;IACZ,QAAQ,EAAE,CAAC;IACX,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IAER,mBAAmB;IACnB,OA
AO,EAAE,CAAC;IACV,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,OAAO,EAAE,CAAC;IACV,OAAO,EAAE,CAAC;IACV,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,OAAO,EAAE,CAAC;IAEV,iBAAiB;IACjB,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC,EAAE,oBAAoB;IAC7B,IAAI,EAAE,CAAC,EAAE,qBAAqB;CAC/B,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;GAGG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,IAAI,EAAE,6BAA6B;IAC1C,IAAI,EAAE,GAAG,EAAE,uBAAuB;IAClC,IAAI,EAAE,GAAG,EAAE,0BAA0B;IACrC,QAAQ,EAAE,GAAG,EAAE,sBAAsB;IACrC,IAAI,EAAE,IAAI,EAAE,kBAAkB;IAC9B,IAAI,EAAE,GAAG,EAAE,mCAAmC;IAC9C,YAAY,EAAE,IAAI,EAAE,8BAA8B;CACnD,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAA2B;IACzD,EAAE,EAAE,GAAG,EAAE,qBAAqB;IAC9B,EAAE,EAAE,IAAI,EAAE,UAAU;IACpB,EAAE,EAAE,IAAI,EAAE,SAAS;IACnB,EAAE,EAAE,IAAI,EAAE,0BAA0B;IACpC,EAAE,EAAE,IAAI,EAAE,aAAa;IACvB,EAAE,EAAE,IAAI,EAAE,UAAU;IACpB,EAAE,EAAE,IAAI,EAAE,QAAQ;IAClB,EAAE,EAAE,GAAG,EAAE,qBAAqB;IAC9B,EAAE,EAAE,GAAG,EAAE,+BAA+B;IACxC,EAAE,EAAE,GAAG,EAAE,2BAA2B;IACpC,EAAE,EAAE,GAAG,EAAE,kBAAkB;IAC3B,EAAE,EAAE,GAAG,EAAE,SAAS;IAClB,EAAE,EAAE,GAAG,EAAE,QAAQ;IACjB,EAAE,EAAE,IAAI,EAAE,aAAa;IACvB,EAAE,EAAE,GAAG,EAAE,OAAO;CACjB,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,oBAAoB,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC;IAEjE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,yBAAyB,CACvC,IAAY,EACZ,QAAgB;IAEhB,MAAM,SAAS,GAAG,oBAAoB,EAAE,CAAC;IACzC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,mBAAmB,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,IAAI,GAAG,CAAC;IAEtE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * p50k_base Encoding (Legacy)
3
+ * Used by older Codex models (code-davinci-002, code-cushman-001)
4
+ *
5
+ * This encoding has:
6
+ * - 50,257 tokens (smallest vocabulary)
7
+ * - Optimized for code completion
8
+ * - Less efficient for natural language
9
+ *
10
+ * @deprecated This encoding is used by deprecated models.
11
+ * Use o200k_base or cl100k_base for current models.
12
+ */
13
+ import type { Tokenizer, EncodingName } from "../types.js";
14
/**
 * Encoding name constant ("p50k_base").
 */
export declare const ENCODING_NAME: EncodingName;
/**
 * Create a p50k_base tokenizer instance.
 *
 * @returns Tokenizer instance exposing encode/decode/countTokens
 * @deprecated Use createO200kTokenizer or createCL100kTokenizer instead
 */
export declare function createP50kTokenizer(): Tokenizer;
/**
 * Average characters per token for p50k_base.
 * Lower than newer encodings due to the smaller vocabulary.
 */
export declare const AVERAGE_CHARS_PER_TOKEN = 3.5;
/**
 * Token estimation adjustments for different content types.
 * p50k was optimized primarily for code, so its code/json/html factors
 * sit below 1.0 while prose/markdown/math sit above it.
 */
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
/**
 * Get estimated token count with content-type awareness.
 *
 * @param text - Input text
 * @param contentType - Type of content; unknown types fall back to the
 *   prose multiplier
 * @returns Estimated token count (minimum 1)
 * @deprecated Use o200k_base or cl100k_base for current models
 */
export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
44
+ //# sourceMappingURL=p50k-base.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"p50k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/p50k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA0B,CAAC;AAEvD;;;;;GAKG;AACH,wBAAgB,mBAAmB,IAAI,SAAS,CAS/C;AAED;;;GAGG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR"}
@@ -0,0 +1,64 @@
1
+ /**
2
+ * p50k_base Encoding (Legacy)
3
+ * Used by older Codex models (code-davinci-002, code-cushman-001)
4
+ *
5
+ * This encoding has:
6
+ * - 50,257 tokens (smallest vocabulary)
7
+ * - Optimized for code completion
8
+ * - Less efficient for natural language
9
+ *
10
+ * @deprecated This encoding is used by deprecated models.
11
+ * Use o200k_base or cl100k_base for current models.
12
+ */
13
+ import { BPETokenizer } from "../bpe.js";
14
/**
 * Canonical name of this encoding.
 */
export const ENCODING_NAME = "p50k_base";
/**
 * Build a tokenizer bound to the legacy p50k_base encoding, backed by
 * the shared BPE implementation.
 *
 * @returns Tokenizer exposing encode/decode/countTokens for p50k_base
 * @deprecated Use createO200kTokenizer or createCL100kTokenizer instead
 */
export function createP50kTokenizer() {
    const engine = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) {
            return engine.encode(text);
        },
        decode(tokens) {
            return engine.decode(tokens);
        },
        countTokens(text) {
            return engine.countTokens(text);
        },
    };
}
33
/**
 * Mean characters-per-token ratio used for p50k_base estimates.
 * Lower than newer encodings because of the smaller vocabulary.
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.5;
38
/**
 * Token estimation adjustments for different content types.
 * p50k was optimized primarily for code, which is why the code-like
 * categories sit below 1.0 and natural language sits above it.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    // Natural-language prose is where p50k is least efficient.
    prose: 1.15,
    // Code is the encoding's primary use case.
    code: 0.9,
    // JSON handles reasonably well.
    json: 0.85,
    // Markdown is slightly worse than baseline.
    markdown: 1.05,
    // HTML markup is acceptable.
    html: 0.9,
    // Mathematical text is the least efficient category.
    math: 1.25,
};
50
/**
 * Get estimated token count with content-type awareness.
 *
 * @param text - Input text
 * @param contentType - Type of content (defaults to "prose")
 * @returns Estimated token count, never less than 1
 * @deprecated Use o200k_base or cl100k_base for current models
 */
export function estimateTokensForContent(text, contentType = "prose") {
    const rawCount = createP50kTokenizer().countTokens(text);
    // Unknown content types fall back to the prose factor (1.15).
    const factor = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 1.15;
    const scaled = Math.round(rawCount * factor);
    return scaled < 1 ? 1 : scaled;
}
64
+ //# sourceMappingURL=p50k-base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"p50k-base.js","sourceRoot":"","sources":["../../src/encodings/p50k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,WAAW,CAAC;AAEvD;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB;IACjC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;GAGG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,IAAI,EAAE,sCAAsC;IACnD,IAAI,EAAE,GAAG,EAAE,yCAAyC;IACpD,IAAI,EAAE,IAAI,EAAE,sBAAsB;IAClC,QAAQ,EAAE,IAAI,EAAE,8BAA8B;IAC9C,IAAI,EAAE,GAAG,EAAE,sBAAsB;IACjC,IAAI,EAAE,IAAI,EAAE,0BAA0B;CACvC,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,mBAAmB,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC;IAEjE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
@@ -0,0 +1,61 @@
1
+ /**
2
+ * tiktoken-ts - A pure TypeScript implementation of OpenAI's tiktoken
3
+ *
4
+ * A self-contained TypeScript tokenizer for AI applications.
5
+ * Implements EXACT BPE (Byte-Pair Encoding) algorithm from tiktoken-rs.
6
+ *
7
+ * @packageDocumentation
8
+ *
9
+ * @example Exact BPE Tokenization (Async)
10
+ * ```typescript
11
+ * import { getEncodingAsync, countTokensAsync } from "tiktoken-ts";
12
+ *
13
+ * // Load encoding and tokenize
14
+ * const tiktoken = await getEncodingAsync("cl100k_base");
15
+ * const tokens = tiktoken.encode("Hello, world!");
16
+ * console.log(tokens); // [15496, 11, 1917, 0]
17
+ *
18
+ * // Decode back to text
19
+ * const text = tiktoken.decode(tokens);
20
+ * console.log(text); // "Hello, world!"
21
+ *
22
+ * // Count tokens
23
+ * const count = await countTokensAsync("Hello, world!", "cl100k_base");
24
+ * console.log(count); // 4
25
+ * ```
26
+ *
27
+ * @example Token Estimation (Sync, uses heuristics)
28
+ * ```typescript
29
+ * import {
30
+ * countTokens,
31
+ * estimateMaxTokens,
32
+ * getTokenEstimation,
33
+ * getEncodingForModelName,
34
+ * getModelContextLimit
35
+ * } from "tiktoken-ts";
36
+ *
37
+ * // Count tokens in text (estimation)
38
+ * const tokens = countTokens("Hello, world!", { model: "gpt-4o" });
39
+ *
40
+ * // Estimate safe max_tokens value
41
+ * const maxTokens = estimateMaxTokens(promptText, "gpt-5-nano", {
42
+ * desiredOutputTokens: 1000,
43
+ * safetyMargin: 0.1
44
+ * });
45
+ *
46
+ * // Get detailed estimation with warnings
47
+ * const estimation = getTokenEstimation(promptText, "gpt-4o");
48
+ * if (!estimation.fitsInContext) {
49
+ * console.warn(estimation.warning);
50
+ * }
51
+ * ```
52
+ */
53
+ export type { EncodingName, ModelFamily, SpecialTokens, EncodingConfig, Tokenizer, ModelConfig, TokenEstimation, ChatMessage, TokenCountOptions, MaxTokensOptions, } from "./types.js";
54
+ export { type Rank, type Vocabulary, type ReverseVocabulary, bytesToKey, keyToBytes, bytePairEncode, bytePairSplit, parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, clearVocabularyCache, isVocabularyCached, VOCABULARY_URLS, CoreBPE, DecodeKeyError, type EncodingDefinition, SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, type TokenizerName, getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./core/index.js";
55
+ export { Tiktoken, getEncoding as getTiktoken, getEncodingForModel as getTiktokenForModel, getEncodingAsync, getEncodingForModelAsync, clearTiktokenCache, encodeAsync, decodeAsync, countTokensAsync, countTokensForModelAsync, } from "./tiktoken.js";
56
+ export { getEncoding, getEncodingForModelName, countTokens, countChatTokens, countPromptTokens, estimateMaxTokens, getTokenEstimation, getChatTokenEstimation, fitsInContext, truncateToTokenLimit, splitIntoChunks, clearTokenizerCache, } from "./tokenizer.js";
57
+ export { MODEL_CONFIGS, MODEL_ALIASES, getModelConfig, getEncodingForModel, getModelContextLimit, getModelMaxOutputTokens, getModelFamily, usesO200kEncoding, usesClaudeEstimation, isClaudeModel, listModels, listModelsByFamily, } from "./models.js";
58
+ export { BPETokenizer, DEFAULT_SPECIAL_TOKENS, CL100K_BASE_CONFIG, O200K_BASE_CONFIG, P50K_BASE_CONFIG, CLAUDE_ESTIMATION_CONFIG, CLAUDE_SAFETY_MULTIPLIER, getEncodingConfig, } from "./bpe.js";
59
+ export { createCL100kTokenizer, CL100K_ENCODING_NAME, CL100K_AVG_CHARS, estimateCL100kTokens, createO200kTokenizer, O200K_ENCODING_NAME, O200K_AVG_CHARS, O200K_LANGUAGE_EFFICIENCY, estimateO200kTokens, estimateO200kTokensForLanguage, createP50kTokenizer, P50K_ENCODING_NAME, P50K_AVG_CHARS, estimateP50kTokens, createClaudeEstimationTokenizer, CLAUDE_ENCODING_NAME, CLAUDE_AVG_CHARS, CLAUDE_VOCAB_SIZE, CLAUDE_SAFETY_MULT, estimateClaudeTokens, estimateClaudeTokensConservative, claudeIncludesSafetyMultiplier, } from "./encodings/index.js";
60
+ export { stringToBytes, bytesToString, countCodePoints, containsCJK, containsEmoji, isAscii, getTextComplexityMultiplier, normalizeWhitespace, } from "./utils.js";
61
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAMH,YAAY,EACV,YAAY,EACZ,WAAW,EACX,aAAa,EACb,cAAc,EACd,SAAS,EACT,WAAW,EACX,eAAe,EACf,WAAW,EACX,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,YAAY,CAAC;AAMpB,OAAO,EAEL,KAAK,IAAI,EACT,KAAK,UAAU,EACf,KAAK,iBAAiB,EAGtB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa,EAGb,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,oBAAoB,EACpB,kBAAkB,EAClB,eAAe,EAGf,OAAO,EACP,cAAc,EAGd,KAAK,kBAAkB,EACvB,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,EAGjB,KAAK,aAAa,EAClB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,iBAAiB,CAAC;AAMzB,OAAO,EAEL,QAAQ,EAGR,WAAW,IAAI,WAAW,EAC1B,mBAAmB,IAAI,mBAAmB,EAC1C,gBAAgB,EAChB,wBAAwB,EACxB,kBAAkB,EAGlB,WAAW,EACX,WAAW,EACX,gBAAgB,EAChB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AAMvB,OAAO,EAEL,WAAW,EACX,uBAAuB,EAGvB,WAAW,EACX,eAAe,EACf,iBAAiB,EAGjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EAGtB,aAAa,EACb,oBAAoB,EACpB,eAAe,EACf,mBAAmB,GACpB,MAAM,gBAAgB,CAAC;AAMxB,OAAO,EACL,aAAa,EACb,aAAa,EACb,cAAc,EACd,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,cAAc,EACd,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,UAAU,EACV,kBAAkB,GACnB,MAAM,aAAa,CAAC;AAMrB,OAAO,EACL,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,wBAAwB,EACxB,wBAAwB,EACxB,iBAAiB,GAClB,MAAM,UAAU,CAAC;AAMlB,OAAO,EAEL,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,oBAAoB,EAGpB,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,EACzB,mBAAmB,EACnB,8BAA8B,EAG9B,mBAAmB,EACnB,kBAAkB,EAClB,cAAc,EACd,kBAAkB,EAGlB,+BAA+B,EAC/B,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,oBAAoB,EACpB,gCAAgC,EAChC,8BAA8B,GAC/B,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,aAAa,EACb,aAAa,EACb,eAAe,EACf,WAAW,EACX,aAAa,EACb,OAAO,EACP,2BAA2B,EAC3B,mBAAmB,GACpB,MAAM,YAAY,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,109 @@
1
+ /**
2
+ * tiktoken-ts - A pure TypeScript implementation of OpenAI's tiktoken
3
+ *
4
+ * A self-contained TypeScript tokenizer for AI applications.
5
+ * Implements EXACT BPE (Byte-Pair Encoding) algorithm from tiktoken-rs.
6
+ *
7
+ * @packageDocumentation
8
+ *
9
+ * @example Exact BPE Tokenization (Async)
10
+ * ```typescript
11
+ * import { getEncodingAsync, countTokensAsync } from "tiktoken-ts";
12
+ *
13
+ * // Load encoding and tokenize
14
+ * const tiktoken = await getEncodingAsync("cl100k_base");
15
+ * const tokens = tiktoken.encode("Hello, world!");
16
+ * console.log(tokens); // [15496, 11, 1917, 0]
17
+ *
18
+ * // Decode back to text
19
+ * const text = tiktoken.decode(tokens);
20
+ * console.log(text); // "Hello, world!"
21
+ *
22
+ * // Count tokens
23
+ * const count = await countTokensAsync("Hello, world!", "cl100k_base");
24
+ * console.log(count); // 4
25
+ * ```
26
+ *
27
+ * @example Token Estimation (Sync, uses heuristics)
28
+ * ```typescript
29
+ * import {
30
+ * countTokens,
31
+ * estimateMaxTokens,
32
+ * getTokenEstimation,
33
+ * getEncodingForModelName,
34
+ * getModelContextLimit
35
+ * } from "tiktoken-ts";
36
+ *
37
+ * // Count tokens in text (estimation)
38
+ * const tokens = countTokens("Hello, world!", { model: "gpt-4o" });
39
+ *
40
+ * // Estimate safe max_tokens value
41
+ * const maxTokens = estimateMaxTokens(promptText, "gpt-5-nano", {
42
+ * desiredOutputTokens: 1000,
43
+ * safetyMargin: 0.1
44
+ * });
45
+ *
46
+ * // Get detailed estimation with warnings
47
+ * const estimation = getTokenEstimation(promptText, "gpt-4o");
48
+ * if (!estimation.fitsInContext) {
49
+ * console.warn(estimation.warning);
50
+ * }
51
+ * ```
52
+ */
53
+ // =============================================================================
54
+ // Core BPE Implementation (Exact tiktoken-rs port)
55
+ // =============================================================================
56
+ export {
57
+ // BPE algorithm
58
+ bytesToKey, keyToBytes, bytePairEncode, bytePairSplit,
59
+ // Vocabulary loading
60
+ parseVocabulary, loadVocabularyFromUrl, loadVocabularyFromString, clearVocabularyCache, isVocabularyCached, VOCABULARY_URLS,
61
+ // Core BPE tokenizer
62
+ CoreBPE, DecodeKeyError, SPECIAL_TOKENS, R50K_BASE, P50K_BASE, P50K_EDIT, CL100K_BASE, O200K_BASE, O200K_HARMONY, ENCODING_DEFINITIONS, getEncodingDefinition, listEncodingNames, getTokenizerForModel, getContextSize, getExactContextSize, EXACT_CONTEXT_SIZES, } from "./core/index.js";
63
+ // =============================================================================
64
+ // High-Level Tiktoken API (Async, requires vocabulary loading)
65
+ // =============================================================================
66
+ export {
67
+ // Tiktoken class
68
+ Tiktoken,
69
+ // Factory functions
70
+ getEncoding as getTiktoken, getEncodingForModel as getTiktokenForModel, getEncodingAsync, getEncodingForModelAsync, clearTiktokenCache,
71
+ // Async convenience functions
72
+ encodeAsync, decodeAsync, countTokensAsync, countTokensForModelAsync, } from "./tiktoken.js";
73
+ // =============================================================================
74
+ // Legacy/Estimation API (Sync, uses heuristics)
75
+ // =============================================================================
76
+ export {
77
+ // Factory functions
78
+ getEncoding, getEncodingForModelName,
79
+ // Token counting (estimation)
80
+ countTokens, countChatTokens, countPromptTokens,
81
+ // Max tokens estimation
82
+ estimateMaxTokens, getTokenEstimation, getChatTokenEstimation,
83
+ // Utility functions
84
+ fitsInContext, truncateToTokenLimit, splitIntoChunks, clearTokenizerCache, } from "./tokenizer.js";
85
+ // =============================================================================
86
+ // Model Configuration
87
+ // =============================================================================
88
+ export { MODEL_CONFIGS, MODEL_ALIASES, getModelConfig, getEncodingForModel, getModelContextLimit, getModelMaxOutputTokens, getModelFamily, usesO200kEncoding, usesClaudeEstimation, isClaudeModel, listModels, listModelsByFamily, } from "./models.js";
89
+ // =============================================================================
90
+ // Legacy BPE Implementation (Estimation-based)
91
+ // =============================================================================
92
+ export { BPETokenizer, DEFAULT_SPECIAL_TOKENS, CL100K_BASE_CONFIG, O200K_BASE_CONFIG, P50K_BASE_CONFIG, CLAUDE_ESTIMATION_CONFIG, CLAUDE_SAFETY_MULTIPLIER, getEncodingConfig, } from "./bpe.js";
93
+ // =============================================================================
94
+ // Encoding-Specific Exports
95
+ // =============================================================================
96
+ export {
97
+ // cl100k_base (GPT-4, GPT-3.5-turbo)
98
+ createCL100kTokenizer, CL100K_ENCODING_NAME, CL100K_AVG_CHARS, estimateCL100kTokens,
99
+ // o200k_base (GPT-4o, GPT-4.1, GPT-5)
100
+ createO200kTokenizer, O200K_ENCODING_NAME, O200K_AVG_CHARS, O200K_LANGUAGE_EFFICIENCY, estimateO200kTokens, estimateO200kTokensForLanguage,
101
+ // p50k_base (Legacy Codex)
102
+ createP50kTokenizer, P50K_ENCODING_NAME, P50K_AVG_CHARS, estimateP50kTokens,
103
+ // claude_estimation (Anthropic Claude - estimation only with safety multiplier)
104
+ createClaudeEstimationTokenizer, CLAUDE_ENCODING_NAME, CLAUDE_AVG_CHARS, CLAUDE_VOCAB_SIZE, CLAUDE_SAFETY_MULT, estimateClaudeTokens, estimateClaudeTokensConservative, claudeIncludesSafetyMultiplier, } from "./encodings/index.js";
105
+ // =============================================================================
106
+ // Utility Functions
107
+ // =============================================================================
108
+ export { stringToBytes, bytesToString, countCodePoints, containsCJK, containsEmoji, isAscii, getTextComplexityMultiplier, normalizeWhitespace, } from "./utils.js";
109
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAmBH,gFAAgF;AAChF,mDAAmD;AACnD,gFAAgF;AAEhF,OAAO;AAML,gBAAgB;AAChB,UAAU,EACV,UAAU,EACV,cAAc,EACd,aAAa;AAEb,qBAAqB;AACrB,eAAe,EACf,qBAAqB,EACrB,wBAAwB,EACxB,oBAAoB,EACpB,kBAAkB,EAClB,eAAe;AAEf,qBAAqB;AACrB,OAAO,EACP,cAAc,EAId,cAAc,EACd,SAAS,EACT,SAAS,EACT,SAAS,EACT,WAAW,EACX,UAAU,EACV,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,iBAAiB,EAIjB,oBAAoB,EACpB,cAAc,EACd,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,iBAAiB,CAAC;AAEzB,gFAAgF;AAChF,+DAA+D;AAC/D,gFAAgF;AAEhF,OAAO;AACL,iBAAiB;AACjB,QAAQ;AAER,oBAAoB;AACpB,WAAW,IAAI,WAAW,EAC1B,mBAAmB,IAAI,mBAAmB,EAC1C,gBAAgB,EAChB,wBAAwB,EACxB,kBAAkB;AAElB,8BAA8B;AAC9B,WAAW,EACX,WAAW,EACX,gBAAgB,EAChB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AAEvB,gFAAgF;AAChF,gDAAgD;AAChD,gFAAgF;AAEhF,OAAO;AACL,oBAAoB;AACpB,WAAW,EACX,uBAAuB;AAEvB,8BAA8B;AAC9B,WAAW,EACX,eAAe,EACf,iBAAiB;AAEjB,wBAAwB;AACxB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB;AAEtB,oBAAoB;AACpB,aAAa,EACb,oBAAoB,EACpB,eAAe,EACf,mBAAmB,GACpB,MAAM,gBAAgB,CAAC;AAExB,gFAAgF;AAChF,sBAAsB;AACtB,gFAAgF;AAEhF,OAAO,EACL,aAAa,EACb,aAAa,EACb,cAAc,EACd,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,cAAc,EACd,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,UAAU,EACV,kBAAkB,GACnB,MAAM,aAAa,CAAC;AAErB,gFAAgF;AAChF,+CAA+C;AAC/C,gFAAgF;AAEhF,OAAO,EACL,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,wBAAwB,EACxB,wBAAwB,EACxB,iBAAiB,GAClB,MAAM,UAAU,CAAC;AAElB,gFAAgF;AAChF,4BAA4B;AAC5B,gFAAgF;AAEhF,OAAO;AACL,qCAAqC;AACrC,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,oBAAoB;AAEpB,sCAAsC;AACtC,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,EACzB,mBAAmB,EACnB,8BAA8B;AAE9B,2BAA2B;AAC3B,mBAAmB,EACnB,kBAAkB,EAClB,cAAc,EACd,kBAAkB;AAElB,gFAAgF;AAChF,+BAA+B,EAC/B,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,oBAAoB,EACpB,gCAAgC,EAChC,8BAA8B,GAC/B,MAAM,sBAAsB,CAAC;AAE9B,gFAAgF;AAChF,oBAAoB;AACpB,gFAAgF;AAEhF,OAAO,EACL,aAAa,EACb,aAAa,EACb,eAAe,EACf,WAAW,EACX,aAAa,EACb,OAAO,EACP,2BAA2B,EAC3B,mBAAmB,GAC
pB,MAAM,YAAY,CAAC"}
@@ -0,0 +1,92 @@
1
+ /**
2
+ * Model Configuration
3
+ * Maps model names to encodings and context limits
4
+ *
5
+ * Context limits are synced with tiktoken-rs:
6
+ * https://github.com/zurawiki/tiktoken-rs/blob/main/tiktoken-rs/src/model.rs
7
+ */
8
+ import type { EncodingName, ModelConfig, ModelFamily } from "./types.js";
9
+ /**
10
+ * Model context limits and configurations
11
+ * Context limits from tiktoken-rs, max output tokens from OpenAI docs
12
+ */
13
+ export declare const MODEL_CONFIGS: Record<string, ModelConfig>;
14
+ /**
15
+ * Model name aliases for flexibility
16
+ */
17
+ export declare const MODEL_ALIASES: Record<string, string>;
18
+ /**
19
+ * Get model configuration by name
20
+ * Handles aliases and partial matches
21
+ *
22
+ * @param modelName - Model name or alias
23
+ * @returns Model configuration or undefined
24
+ */
25
+ export declare function getModelConfig(modelName: string): ModelConfig | undefined;
26
+ /**
27
+ * Get encoding name for a model
28
+ *
29
+ * @param modelName - Model name
30
+ * @returns Encoding name, defaults to o200k_base for unknown models
31
+ */
32
+ export declare function getEncodingForModel(modelName: string): EncodingName;
33
+ /**
34
+ * Get context limit for a model
35
+ *
36
+ * @param modelName - Model name
37
+ * @returns Context limit in tokens
38
+ */
39
+ export declare function getModelContextLimit(modelName: string): number;
40
+ /**
41
+ * Get max output tokens for a model
42
+ *
43
+ * @param modelName - Model name
44
+ * @returns Max output tokens
45
+ */
46
+ export declare function getModelMaxOutputTokens(modelName: string): number;
47
+ /**
48
+ * Get model family
49
+ *
50
+ * @param modelName - Model name
51
+ * @returns Model family classification
52
+ */
53
+ export declare function getModelFamily(modelName: string): ModelFamily;
54
+ /**
55
+ * Check if a model uses the newer o200k_base encoding
56
+ *
57
+ * @param modelName - Model name
58
+ * @returns True if model uses o200k_base
59
+ */
60
+ export declare function usesO200kEncoding(modelName: string): boolean;
61
+ /**
62
+ * Check if a model uses the Claude estimation encoding
63
+ *
64
+ * Claude models use a proprietary tokenizer. This encoding provides
65
+ * "safe" estimates that intentionally over-count tokens to prevent
66
+ * API truncation issues.
67
+ *
68
+ * @param modelName - Model name
69
+ * @returns True if model uses claude_estimation
70
+ */
71
+ export declare function usesClaudeEstimation(modelName: string): boolean;
72
+ /**
73
+ * Check if a model is a Claude model
74
+ *
75
+ * @param modelName - Model name
76
+ * @returns True if model is from Anthropic Claude family
77
+ */
78
+ export declare function isClaudeModel(modelName: string): boolean;
79
+ /**
80
+ * List all known models
81
+ *
82
+ * @returns Array of model names
83
+ */
84
+ export declare function listModels(): string[];
85
+ /**
86
+ * List models by family
87
+ *
88
+ * @param family - Model family to filter
89
+ * @returns Array of model configurations
90
+ */
91
+ export declare function listModelsByFamily(family: ModelFamily): ModelConfig[];
92
+ //# sourceMappingURL=models.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAsBzE;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CA2brD,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgEhD,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS,CAsBzE;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,GAAG,YAAY,CAGnE;AAED;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAG9D;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAGjE;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,WAAW,CAG7D;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAE5D;AAED;;;;;;;;;GASG;AACH,wBAAgB,oBAAoB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAE/D;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAGxD;AAED;;;;GAIG;AACH,wBAAgB,UAAU,IAAI,MAAM,EAAE,CAErC;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,WAAW,GAAG,WAAW,EAAE,CAIrE"}