@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cl100k_base Encoding
|
|
3
|
+
* Used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002
|
|
4
|
+
*
|
|
5
|
+
* This encoding has:
|
|
6
|
+
* - 100,256 tokens
|
|
7
|
+
* - Better handling of code and programming languages
|
|
8
|
+
* - Improved whitespace handling
|
|
9
|
+
*/
|
|
10
|
+
import { BPETokenizer } from "../bpe.js";
|
|
11
|
+
/** Canonical name of this encoding ("cl100k_base"). */
export const ENCODING_NAME = "cl100k_base";
|
|
15
|
+
/**
 * Build a cl100k_base tokenizer.
 *
 * Wraps a BPETokenizer configured for this encoding and exposes the
 * standard tokenizer surface (encode / decode / countTokens).
 *
 * @returns Tokenizer instance bound to cl100k_base
 */
export function createCL100kTokenizer() {
  const tokenizer = new BPETokenizer(ENCODING_NAME);

  return {
    encodingName: ENCODING_NAME,
    encode(text) {
      return tokenizer.encode(text);
    },
    decode(tokens) {
      return tokenizer.decode(tokens);
    },
    countTokens(text) {
      return tokenizer.countTokens(text);
    },
  };
}
|
|
29
|
+
/**
 * Strings known to encode as exactly one cl100k_base token.
 * Used to improve estimation accuracy for common patterns.
 */
const SINGLE_TOKEN_STRINGS = [
  // Contractions
  "'s", "'t", "'re", "'ve", "'m", "'ll", "'d",
  // Common words (1 token each in cl100k)
  "the", "and", "is", "are", "was", "were", "will", "would", "could",
  "should", "have", "has", "had", "been", "being", "this", "that",
  "these", "those", "with", "from", "into", "for",
  // Programming keywords (1 token each)
  "function", "const", "let", "var", "return", "if", "else", "while",
  "class", "interface", "type", "import", "export", "async", "await",
  "true", "false", "null", "undefined",
  // Common symbols
  "=>", "===", "!==", "&&", "||", "++", "--", "+=", "-=", "...",
  // Punctuation
  ".", ",", ";", ":", "!", "?", "(", ")", "[", "]", "{", "}", "<", ">",
];

/**
 * Pre-computed token counts for common patterns: each known
 * single-token string mapped to a count of 1.
 */
export const COMMON_TOKEN_PATTERNS = Object.fromEntries(
  SINGLE_TOKEN_STRINGS.map((pattern) => [pattern, 1]),
);
|
|
113
|
+
/**
 * Average characters per token for cl100k_base, from empirical
 * analysis of English text.
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.8;
|
|
118
|
+
/**
 * Token-estimation adjustments per content type, applied to the base
 * count by estimateTokensForContent. Values below 1.0 lower the
 * estimate; values above 1.0 raise it.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
  prose: 1.0, // Normal English text (baseline)
  code: 0.85, // Code tends to be more token-efficient
  json: 0.75, // JSON is very structured
  markdown: 0.95, // Markdown tokenizes slightly more efficiently than plain prose
  html: 0.8, // HTML tags are often single tokens
  math: 1.2, // Mathematical notation uses more tokens
};
|
|
129
|
+
/**
 * Estimate the token count of `text`, adjusted for its content type.
 *
 * Counts tokens with a cl100k_base tokenizer, scales by the matching
 * CONTENT_TYPE_MULTIPLIERS entry (1.0 for unknown types), and clamps
 * the rounded result to at least 1.
 *
 * @param text - Input text
 * @param contentType - Type of content (defaults to "prose")
 * @returns Estimated token count
 */
export function estimateTokensForContent(text, contentType = "prose") {
  const multiplier = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 1.0;
  const rawCount = createCL100kTokenizer().countTokens(text);
  const adjusted = Math.round(rawCount * multiplier);
  return adjusted < 1 ? 1 : adjusted;
}
|
|
142
|
+
//# sourceMappingURL=cl100k-base.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cl100k-base.js","sourceRoot":"","sources":["../../src/encodings/cl100k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,aAAa,CAAC;AAEzD;;;;GAIG;AACH,MAAM,UAAU,qBAAqB;IACnC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAA2B;IAC3D,eAAe;IACf,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IAEP,wCAAwC;IACxC,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,EAAE,EAAE,CAAC;IACL,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,MAAM,EAAE,CAAC;IACT,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IAEN,sCAAsC;IACtC,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,MAAM,EAAE,CAAC;IACT,EAAE,EAAE,CAAC;IACL,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,SAAS,EAAE,CAAC;IACZ,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,MAAM,EAAE,CAAC;IACT,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,CAAC;IAEZ,iBAAiB;IACjB,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IAER,cAAc;IACd,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GA
AG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;CACP,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,GAAG,EAAE,sBAAsB;IAClC,IAAI,EAAE,IAAI,EAAE,wCAAwC;IACpD,IAAI,EAAE,IAAI,EAAE,0BAA0B;IACtC,QAAQ,EAAE,IAAI,EAAE,mCAAmC;IACnD,IAAI,EAAE,GAAG,EAAE,oCAAoC;IAC/C,IAAI,EAAE,GAAG,EAAE,yCAAyC;CACrD,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,qBAAqB,EAAE,CAAC;IAC1C,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC;IAEhE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Estimation Encoding
|
|
3
|
+
*
|
|
4
|
+
* IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
|
|
5
|
+
*
|
|
6
|
+
* Claude (Anthropic) uses a proprietary tokenizer that is NOT publicly available.
|
|
7
|
+
* This encoding provides "safe" estimates that intentionally over-count tokens
|
|
8
|
+
* to prevent API truncation issues when working with Claude models.
|
|
9
|
+
*
|
|
10
|
+
* Based on research findings (see claude-tokenizer-research.md):
|
|
11
|
+
* - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
|
|
12
|
+
* - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content:
|
|
13
|
+
* - English articles: +16%
|
|
14
|
+
* - Math equations: +21%
|
|
15
|
+
* - Python code: +30%
|
|
16
|
+
* - Average ~3.5 characters per token (vs GPT-4's ~4)
|
|
17
|
+
*
|
|
18
|
+
* This encoding applies a 1.25x safety multiplier to ensure estimates err on
|
|
19
|
+
* the side of over-counting, preventing API truncation.
|
|
20
|
+
*
|
|
21
|
+
* For EXACT Claude token counts, use Anthropic's official API:
|
|
22
|
+
* @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
|
|
23
|
+
*/
|
|
24
|
+
import type { Tokenizer, EncodingName } from "../types.js";
|
|
25
|
+
/**
|
|
26
|
+
* Encoding name constant
|
|
27
|
+
*/
|
|
28
|
+
export declare const ENCODING_NAME: EncodingName;
|
|
29
|
+
/**
|
|
30
|
+
* Create a Claude estimation tokenizer instance
|
|
31
|
+
*
|
|
32
|
+
* This tokenizer provides SAFE estimates for Claude models by:
|
|
33
|
+
* 1. Using cl100k_base patterns as a base (70% vocabulary overlap with Claude 1/2)
|
|
34
|
+
* 2. Applying a 1.25x safety multiplier to account for Claude's higher token usage
|
|
35
|
+
*
|
|
36
|
+
* The estimates intentionally err on over-counting to prevent API truncation.
|
|
37
|
+
*
|
|
38
|
+
* @returns Tokenizer instance configured for Claude estimation
|
|
39
|
+
*
|
|
40
|
+
* @example
|
|
41
|
+
* ```typescript
|
|
42
|
+
* import { createClaudeEstimationTokenizer } from "tiktoken-ts";
|
|
43
|
+
*
|
|
44
|
+
* const tokenizer = createClaudeEstimationTokenizer();
|
|
45
|
+
* const count = tokenizer.countTokens("Hello, Claude!");
|
|
46
|
+
* // Returns a SAFE estimate (intentionally higher than actual)
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
export declare function createClaudeEstimationTokenizer(): Tokenizer;
|
|
50
|
+
/**
|
|
51
|
+
* Average characters per token for Claude models.
|
|
52
|
+
*
|
|
53
|
+
* Research indicates Claude averages ~3.5 characters per token,
|
|
54
|
+
* which is lower (less efficient) than GPT-4's ~4 characters per token.
|
|
55
|
+
*/
|
|
56
|
+
export declare const AVERAGE_CHARS_PER_TOKEN = 3.5;
|
|
57
|
+
/**
|
|
58
|
+
* Estimated vocabulary size for Claude 3+.
|
|
59
|
+
*
|
|
60
|
+
* Based on reverse-engineering research by Sander Land, Claude 3's
|
|
61
|
+
* vocabulary is estimated at ~22,000 tokens - remarkably small compared
|
|
62
|
+
* to Mistral's 32K, GPT-4's 100K, or LLaMA 3's 128K.
|
|
63
|
+
*/
|
|
64
|
+
export declare const ESTIMATED_VOCAB_SIZE = 22000;
|
|
65
|
+
/**
|
|
66
|
+
* Safety multiplier applied to token estimates.
|
|
67
|
+
*
|
|
68
|
+
* This multiplier ensures estimates err on over-counting to prevent
|
|
69
|
+
* API truncation. Based on research showing Claude produces 16-30%
|
|
70
|
+
* more tokens than GPT-4:
|
|
71
|
+
* - English: +16%
|
|
72
|
+
* - Math: +21%
|
|
73
|
+
* - Code: +30%
|
|
74
|
+
*
|
|
75
|
+
* We use 1.25 (25%) as a safe middle ground.
|
|
76
|
+
*/
|
|
77
|
+
export declare const SAFETY_MULTIPLIER = 1.25;
|
|
78
|
+
/**
|
|
79
|
+
* Token estimation adjustments for different content types.
|
|
80
|
+
*
|
|
81
|
+
* Claude's tokenizer has different efficiency characteristics than OpenAI's:
|
|
82
|
+
* - Generally less efficient (more tokens per character)
|
|
83
|
+
* - Especially less efficient for code (+30% vs GPT-4)
|
|
84
|
+
* - CJK handling varies by input vs output context
|
|
85
|
+
*
|
|
86
|
+
* These multipliers are applied ON TOP of the base safety multiplier.
|
|
87
|
+
*/
|
|
88
|
+
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
|
|
89
|
+
/**
|
|
90
|
+
* Get estimated token count with content-type awareness for Claude.
|
|
91
|
+
*
|
|
92
|
+
* The returned estimate includes both:
|
|
93
|
+
* 1. The base safety multiplier (1.25x)
|
|
94
|
+
* 2. Content-type specific adjustments
|
|
95
|
+
*
|
|
96
|
+
* @param text - Input text
|
|
97
|
+
* @param contentType - Type of content (affects estimation accuracy)
|
|
98
|
+
* @returns Safe estimated token count (intentionally over-counts)
|
|
99
|
+
*
|
|
100
|
+
* @example
|
|
101
|
+
* ```typescript
|
|
102
|
+
* import { estimateClaudeTokens } from "tiktoken-ts";
|
|
103
|
+
*
|
|
104
|
+
* // For English prose
|
|
105
|
+
* const count = estimateClaudeTokens("Hello, world!", "prose");
|
|
106
|
+
*
|
|
107
|
+
* // For Python code (applies additional code multiplier)
|
|
108
|
+
* const codeCount = estimateClaudeTokens(pythonCode, "code");
|
|
109
|
+
* ```
|
|
110
|
+
*/
|
|
111
|
+
export declare function estimateClaudeTokens(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
|
|
112
|
+
/**
|
|
113
|
+
* Get a CONSERVATIVE (high) token estimate for Claude.
|
|
114
|
+
*
|
|
115
|
+
* This function returns the highest reasonable estimate, suitable for:
|
|
116
|
+
* - Calculating worst-case API costs
|
|
117
|
+
* - Ensuring prompts fit within context limits
|
|
118
|
+
* - Setting very safe max_tokens values
|
|
119
|
+
*
|
|
120
|
+
* It applies the maximum content multiplier (1.1 for code/math) on top
|
|
121
|
+
* of the base safety multiplier (1.25x), resulting in ~1.375x over-estimation.
|
|
122
|
+
*
|
|
123
|
+
* @param text - Input text
|
|
124
|
+
* @returns Conservative (high) token estimate
|
|
125
|
+
*/
|
|
126
|
+
export declare function estimateClaudeTokensConservative(text: string): number;
|
|
127
|
+
/**
|
|
128
|
+
* Check if Claude token estimation applies safety multiplier.
|
|
129
|
+
*
|
|
130
|
+
* This is a convenience function that always returns true, confirming
|
|
131
|
+
* that Claude estimates include a safety margin.
|
|
132
|
+
*
|
|
133
|
+
* @returns Always true (Claude estimation always includes safety multiplier)
|
|
134
|
+
*/
|
|
135
|
+
export declare function includesSafetyMultiplier(): boolean;
|
|
136
|
+
//# sourceMappingURL=claude-estimation.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-estimation.d.ts","sourceRoot":"","sources":["../../src/encodings/claude-estimation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAAkC,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,+BAA+B,IAAI,SAAS,CAS3D;AAED;;;;;GAKG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;;;;;GAMG;AACH,eAAO,MAAM,oBAAoB,QAAQ,CAAC;AAE1C;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,iBAAiB,OAA2B,CAAC;AAE1D;;;;;;;;;GASG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,gCAAgC,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAErE;AAED;;;;;;;GAOG;AACH,wBAAgB,wBAAwB,IAAI,OAAO,CAElD"}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Estimation Encoding
|
|
3
|
+
*
|
|
4
|
+
* IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
|
|
5
|
+
*
|
|
6
|
+
* Claude (Anthropic) uses a proprietary tokenizer that is NOT publicly available.
|
|
7
|
+
* This encoding provides "safe" estimates that intentionally over-count tokens
|
|
8
|
+
* to prevent API truncation issues when working with Claude models.
|
|
9
|
+
*
|
|
10
|
+
* Based on research findings (see claude-tokenizer-research.md):
|
|
11
|
+
* - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
|
|
12
|
+
* - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content:
|
|
13
|
+
* - English articles: +16%
|
|
14
|
+
* - Math equations: +21%
|
|
15
|
+
* - Python code: +30%
|
|
16
|
+
* - Average ~3.5 characters per token (vs GPT-4's ~4)
|
|
17
|
+
*
|
|
18
|
+
* This encoding applies a 1.25x safety multiplier to ensure estimates err on
|
|
19
|
+
* the side of over-counting, preventing API truncation.
|
|
20
|
+
*
|
|
21
|
+
* For EXACT Claude token counts, use Anthropic's official API:
|
|
22
|
+
* @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
|
|
23
|
+
*/
|
|
24
|
+
import { BPETokenizer, CLAUDE_SAFETY_MULTIPLIER } from "../bpe.js";
|
|
25
|
+
/** Canonical name of this encoding ("claude_estimation"). */
export const ENCODING_NAME = "claude_estimation";
|
|
29
|
+
/**
 * Build a Claude ESTIMATION tokenizer.
 *
 * Anthropic's real tokenizer is proprietary, so this wraps a
 * BPETokenizer configured with the "claude_estimation" encoding, whose
 * counts intentionally over-estimate (a 1.25x safety multiplier is
 * applied inside the BPE layer) to avoid API truncation. For exact
 * counts use Anthropic's official token-counting API.
 *
 * @returns Tokenizer instance configured for Claude estimation
 *
 * @example
 * ```typescript
 * const tokenizer = createClaudeEstimationTokenizer();
 * const count = tokenizer.countTokens("Hello, Claude!");
 * // Returns a SAFE estimate (intentionally higher than actual)
 * ```
 */
export function createClaudeEstimationTokenizer() {
  const tokenizer = new BPETokenizer(ENCODING_NAME);

  return {
    encodingName: ENCODING_NAME,
    encode(text) {
      return tokenizer.encode(text);
    },
    decode(tokens) {
      return tokenizer.decode(tokens);
    },
    countTokens(text) {
      return tokenizer.countTokens(text);
    },
  };
}
|
|
58
|
+
/**
 * Average characters per token for Claude models (~3.5, versus ~4 for
 * GPT-4 — lower means less token-efficient).
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.5;

/**
 * Estimated Claude 3+ vocabulary size (~22K tokens, per
 * reverse-engineering research by Sander Land) — remarkably small next
 * to Mistral's 32K, GPT-4's 100K, or LLaMA 3's 128K.
 */
export const ESTIMATED_VOCAB_SIZE = 22000;
|
|
73
|
+
/**
 * Safety multiplier baked into Claude token estimates (re-exported
 * from the BPE layer; 1.25x). Research shows Claude produces 16-30%
 * more tokens than GPT-4 (English +16%, math +21%, code +30%), so
 * estimates deliberately over-count to prevent API truncation.
 */
export const SAFETY_MULTIPLIER = CLAUDE_SAFETY_MULTIPLIER;
|
|
86
|
+
/**
 * Content-type adjustments for Claude token estimates, applied ON TOP
 * of the base safety multiplier. Claude's tokenizer is generally less
 * efficient than OpenAI's, especially for code.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
  prose: 1.0, // Base English text
  code: 1.1, // Claude is notably less token-efficient on code (+30% vs GPT-4)
  json: 1.0, // JSON follows prose patterns
  markdown: 1.0, // Markdown similar to prose
  html: 0.95, // HTML tags slightly more efficient
  math: 1.1, // Math equations cost more tokens on Claude (+21% vs GPT-4)
};
|
|
104
|
+
/**
 * Estimate Claude token usage for `text`, adjusted for content type.
 *
 * The result combines (1) the base 1.25x safety multiplier, already
 * applied inside the estimation tokenizer's countTokens, and (2) a
 * content-type multiplier from CONTENT_TYPE_MULTIPLIERS (1.0 for
 * unknown types). The ceiling is taken and clamped to at least 1, so
 * estimates intentionally over-count rather than under-count.
 *
 * @param text - Input text
 * @param contentType - Type of content (defaults to "prose")
 * @returns Safe estimated token count (intentionally over-counts)
 *
 * @example
 * ```typescript
 * const count = estimateClaudeTokens("Hello, world!", "prose");
 * const codeCount = estimateClaudeTokens(pythonCode, "code");
 * ```
 */
export function estimateClaudeTokens(text, contentType = "prose") {
  const factor = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 1.0;
  // countTokens already bakes in the safety multiplier.
  const safeBase = createClaudeEstimationTokenizer().countTokens(text);
  const estimate = Math.ceil(safeBase * factor);
  return estimate < 1 ? 1 : estimate;
}
|
|
132
|
+
/**
 * Return a CONSERVATIVE (high) Claude token estimate.
 *
 * Delegates to estimateClaudeTokens with the content type carrying the
 * highest multiplier ("code", 1.1), stacked on the base 1.25x safety
 * multiplier. Suitable for worst-case cost calculations, context-limit
 * checks, and very safe max_tokens values.
 *
 * @param text - Input text
 * @returns Conservative (high) token estimate
 */
export function estimateClaudeTokensConservative(text) {
  // "code" carries the highest content multiplier.
  return estimateClaudeTokens(text, "code");
}
|
|
149
|
+
/**
 * Report whether Claude token estimation applies a safety multiplier.
 *
 * Convenience predicate that is always true: every Claude estimate
 * produced by this module includes the safety margin.
 *
 * @returns Always true
 */
export function includesSafetyMultiplier() {
  return true;
}
|
|
160
|
+
//# sourceMappingURL=claude-estimation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-estimation.js","sourceRoot":"","sources":["../../src/encodings/claude-estimation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAE,YAAY,EAAE,wBAAwB,EAAE,MAAM,WAAW,CAAC;AAGnE;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,mBAAmB,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,+BAA+B;IAC7C,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG,KAAK,CAAC;AAE1C;;;;;;;;;;;GAWG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG,wBAAwB,CAAC;AAE1D;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,GAAG,EAAE,oBAAoB;IAChC,IAAI,EAAE,GAAG,EAAE,6DAA6D;IACxE,IAAI,EAAE,GAAG,EAAE,8BAA8B;IACzC,QAAQ,EAAE,GAAG,EAAE,4BAA4B;IAC3C,IAAI,EAAE,IAAI,EAAE,oCAAoC;IAChD,IAAI,EAAE,GAAG,EAAE,2DAA2D;CACvE,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,+BAA+B,EAAE,CAAC;IACpD,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,qCAAqC;IACpF,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC;IAEvE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,iBAAiB,CAAC,CAAC,CAAC;AAC/D,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,gCAAgC,CAAC,IAAY;IAC3D,OAAO,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC,0BAA0B;AACvE,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,wBAAwB;IACtC,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Encoding Exports
|
|
3
|
+
* Central export for all encoding implementations
|
|
4
|
+
*/
|
|
5
|
+
export { createCL100kTokenizer, ENCODING_NAME as CL100K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as CL100K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as CL100K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as CL100K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateCL100kTokens, } from "./cl100k-base.js";
|
|
6
|
+
export { createO200kTokenizer, ENCODING_NAME as O200K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as O200K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as O200K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as O200K_CONTENT_MULTIPLIERS, LANGUAGE_EFFICIENCY as O200K_LANGUAGE_EFFICIENCY, estimateTokensForContent as estimateO200kTokens, estimateTokensForLanguage as estimateO200kTokensForLanguage, } from "./o200k-base.js";
|
|
7
|
+
export { createP50kTokenizer, ENCODING_NAME as P50K_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as P50K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as P50K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateP50kTokens, } from "./p50k-base.js";
|
|
8
|
+
export { createClaudeEstimationTokenizer, ENCODING_NAME as CLAUDE_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as CLAUDE_AVG_CHARS, ESTIMATED_VOCAB_SIZE as CLAUDE_VOCAB_SIZE, SAFETY_MULTIPLIER as CLAUDE_SAFETY_MULT, CONTENT_TYPE_MULTIPLIERS as CLAUDE_CONTENT_MULTIPLIERS, estimateClaudeTokens, estimateClaudeTokensConservative, includesSafetyMultiplier as claudeIncludesSafetyMultiplier, } from "./claude-estimation.js";
|
|
9
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/encodings/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EACL,qBAAqB,EACrB,aAAa,IAAI,oBAAoB,EACrC,qBAAqB,IAAI,qBAAqB,EAC9C,uBAAuB,IAAI,gBAAgB,EAC3C,wBAAwB,IAAI,0BAA0B,EACtD,wBAAwB,IAAI,oBAAoB,GACjD,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,oBAAoB,EACpB,aAAa,IAAI,mBAAmB,EACpC,qBAAqB,IAAI,oBAAoB,EAC7C,uBAAuB,IAAI,eAAe,EAC1C,wBAAwB,IAAI,yBAAyB,EACrD,mBAAmB,IAAI,yBAAyB,EAChD,wBAAwB,IAAI,mBAAmB,EAC/C,yBAAyB,IAAI,8BAA8B,GAC5D,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,mBAAmB,EACnB,aAAa,IAAI,kBAAkB,EACnC,uBAAuB,IAAI,cAAc,EACzC,wBAAwB,IAAI,wBAAwB,EACpD,wBAAwB,IAAI,kBAAkB,GAC/C,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EACL,+BAA+B,EAC/B,aAAa,IAAI,oBAAoB,EACrC,uBAAuB,IAAI,gBAAgB,EAC3C,oBAAoB,IAAI,iBAAiB,EACzC,iBAAiB,IAAI,kBAAkB,EACvC,wBAAwB,IAAI,0BAA0B,EACtD,oBAAoB,EACpB,gCAAgC,EAChC,wBAAwB,IAAI,8BAA8B,GAC3D,MAAM,wBAAwB,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Encoding Exports
|
|
3
|
+
* Central export for all encoding implementations
|
|
4
|
+
*/
|
|
5
|
+
// cl100k_base (GPT-4, GPT-3.5-turbo)
|
|
6
|
+
export { createCL100kTokenizer, ENCODING_NAME as CL100K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as CL100K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as CL100K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as CL100K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateCL100kTokens, } from "./cl100k-base.js";
|
|
7
|
+
// o200k_base (GPT-4o, GPT-4.1, GPT-5)
|
|
8
|
+
export { createO200kTokenizer, ENCODING_NAME as O200K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as O200K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as O200K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as O200K_CONTENT_MULTIPLIERS, LANGUAGE_EFFICIENCY as O200K_LANGUAGE_EFFICIENCY, estimateTokensForContent as estimateO200kTokens, estimateTokensForLanguage as estimateO200kTokensForLanguage, } from "./o200k-base.js";
|
|
9
|
+
// p50k_base (Legacy Codex models)
|
|
10
|
+
export { createP50kTokenizer, ENCODING_NAME as P50K_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as P50K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as P50K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateP50kTokens, } from "./p50k-base.js";
|
|
11
|
+
// claude_estimation (Anthropic Claude models - estimation only)
|
|
12
|
+
export { createClaudeEstimationTokenizer, ENCODING_NAME as CLAUDE_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as CLAUDE_AVG_CHARS, ESTIMATED_VOCAB_SIZE as CLAUDE_VOCAB_SIZE, SAFETY_MULTIPLIER as CLAUDE_SAFETY_MULT, CONTENT_TYPE_MULTIPLIERS as CLAUDE_CONTENT_MULTIPLIERS, estimateClaudeTokens, estimateClaudeTokensConservative, includesSafetyMultiplier as claudeIncludesSafetyMultiplier, } from "./claude-estimation.js";
|
|
13
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/encodings/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,qCAAqC;AACrC,OAAO,EACL,qBAAqB,EACrB,aAAa,IAAI,oBAAoB,EACrC,qBAAqB,IAAI,qBAAqB,EAC9C,uBAAuB,IAAI,gBAAgB,EAC3C,wBAAwB,IAAI,0BAA0B,EACtD,wBAAwB,IAAI,oBAAoB,GACjD,MAAM,kBAAkB,CAAC;AAE1B,sCAAsC;AACtC,OAAO,EACL,oBAAoB,EACpB,aAAa,IAAI,mBAAmB,EACpC,qBAAqB,IAAI,oBAAoB,EAC7C,uBAAuB,IAAI,eAAe,EAC1C,wBAAwB,IAAI,yBAAyB,EACrD,mBAAmB,IAAI,yBAAyB,EAChD,wBAAwB,IAAI,mBAAmB,EAC/C,yBAAyB,IAAI,8BAA8B,GAC5D,MAAM,iBAAiB,CAAC;AAEzB,kCAAkC;AAClC,OAAO,EACL,mBAAmB,EACnB,aAAa,IAAI,kBAAkB,EACnC,uBAAuB,IAAI,cAAc,EACzC,wBAAwB,IAAI,wBAAwB,EACpD,wBAAwB,IAAI,kBAAkB,GAC/C,MAAM,gBAAgB,CAAC;AAExB,gEAAgE;AAChE,OAAO,EACL,+BAA+B,EAC/B,aAAa,IAAI,oBAAoB,EACrC,uBAAuB,IAAI,gBAAgB,EAC3C,oBAAoB,IAAI,iBAAiB,EACzC,iBAAiB,IAAI,kBAAkB,EACvC,wBAAwB,IAAI,0BAA0B,EACtD,oBAAoB,EACpB,gCAAgC,EAChC,wBAAwB,IAAI,8BAA8B,GAC3D,MAAM,wBAAwB,CAAC"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* o200k_base Encoding
|
|
3
|
+
* Used by GPT-4o, GPT-4.1, GPT-5 models
|
|
4
|
+
*
|
|
5
|
+
* This encoding has:
|
|
6
|
+
* - 200,000 tokens (larger vocabulary)
|
|
7
|
+
* - Better handling of non-English languages
|
|
8
|
+
* - Improved efficiency for common patterns
|
|
9
|
+
* - Better support for multimodal inputs
|
|
10
|
+
*/
|
|
11
|
+
import type { Tokenizer, EncodingName } from "../types.js";
|
|
12
|
+
/**
|
|
13
|
+
* Encoding name constant
|
|
14
|
+
*/
|
|
15
|
+
export declare const ENCODING_NAME: EncodingName;
|
|
16
|
+
/**
|
|
17
|
+
* Create an o200k_base tokenizer instance
|
|
18
|
+
*
|
|
19
|
+
* @returns Tokenizer instance
|
|
20
|
+
*/
|
|
21
|
+
export declare function createO200kTokenizer(): Tokenizer;
|
|
22
|
+
/**
|
|
23
|
+
* Pre-computed token counts for common patterns
|
|
24
|
+
* o200k_base has a larger vocabulary, so more words are single tokens
|
|
25
|
+
*/
|
|
26
|
+
export declare const COMMON_TOKEN_PATTERNS: Record<string, number>;
|
|
27
|
+
/**
|
|
28
|
+
* Average characters per token for o200k_base
|
|
29
|
+
* Slightly higher than cl100k due to larger vocabulary
|
|
30
|
+
*/
|
|
31
|
+
export declare const AVERAGE_CHARS_PER_TOKEN = 4;
|
|
32
|
+
/**
|
|
33
|
+
* Token estimation adjustments for different content types
|
|
34
|
+
* o200k is generally more efficient across all types
|
|
35
|
+
*/
|
|
36
|
+
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
|
|
37
|
+
/**
|
|
38
|
+
* Language-specific efficiency factors for o200k_base
|
|
39
|
+
* The larger vocabulary handles more languages efficiently
|
|
40
|
+
*/
|
|
41
|
+
export declare const LANGUAGE_EFFICIENCY: Record<string, number>;
|
|
42
|
+
/**
|
|
43
|
+
* Get estimated token count with content-type awareness
|
|
44
|
+
*
|
|
45
|
+
* @param text - Input text
|
|
46
|
+
* @param contentType - Type of content
|
|
47
|
+
* @returns Estimated token count
|
|
48
|
+
*/
|
|
49
|
+
export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
|
|
50
|
+
/**
|
|
51
|
+
* Get estimated token count with language awareness
|
|
52
|
+
*
|
|
53
|
+
* @param text - Input text
|
|
54
|
+
* @param language - ISO 639-1 language code
|
|
55
|
+
* @returns Estimated token count
|
|
56
|
+
*/
|
|
57
|
+
export declare function estimateTokensForLanguage(text: string, language: string): number;
|
|
58
|
+
//# sourceMappingURL=o200k-base.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"o200k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/o200k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA2B,CAAC;AAExD;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,SAAS,CAShD;AAED;;;GAGG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgGxD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,uBAAuB,IAAM,CAAC;AAE3C;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAQ3D,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgBtD,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR;AAED;;;;;;GAMG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,MAAM,GACf,MAAM,CAMR"}
|