@hyvmind/tiktoken-ts 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +557 -0
  3. package/dist/bpe.d.ts +171 -0
  4. package/dist/bpe.d.ts.map +1 -0
  5. package/dist/bpe.js +478 -0
  6. package/dist/bpe.js.map +1 -0
  7. package/dist/core/byte-pair-encoding.d.ts +49 -0
  8. package/dist/core/byte-pair-encoding.d.ts.map +1 -0
  9. package/dist/core/byte-pair-encoding.js +154 -0
  10. package/dist/core/byte-pair-encoding.js.map +1 -0
  11. package/dist/core/encoding-definitions.d.ts +95 -0
  12. package/dist/core/encoding-definitions.d.ts.map +1 -0
  13. package/dist/core/encoding-definitions.js +202 -0
  14. package/dist/core/encoding-definitions.js.map +1 -0
  15. package/dist/core/index.d.ts +12 -0
  16. package/dist/core/index.d.ts.map +1 -0
  17. package/dist/core/index.js +17 -0
  18. package/dist/core/index.js.map +1 -0
  19. package/dist/core/model-to-encoding.d.ts +36 -0
  20. package/dist/core/model-to-encoding.d.ts.map +1 -0
  21. package/dist/core/model-to-encoding.js +299 -0
  22. package/dist/core/model-to-encoding.js.map +1 -0
  23. package/dist/core/tiktoken.d.ts +126 -0
  24. package/dist/core/tiktoken.d.ts.map +1 -0
  25. package/dist/core/tiktoken.js +295 -0
  26. package/dist/core/tiktoken.js.map +1 -0
  27. package/dist/core/vocab-loader.d.ts +77 -0
  28. package/dist/core/vocab-loader.d.ts.map +1 -0
  29. package/dist/core/vocab-loader.js +176 -0
  30. package/dist/core/vocab-loader.js.map +1 -0
  31. package/dist/encodings/cl100k-base.d.ts +43 -0
  32. package/dist/encodings/cl100k-base.d.ts.map +1 -0
  33. package/dist/encodings/cl100k-base.js +142 -0
  34. package/dist/encodings/cl100k-base.js.map +1 -0
  35. package/dist/encodings/claude-estimation.d.ts +136 -0
  36. package/dist/encodings/claude-estimation.d.ts.map +1 -0
  37. package/dist/encodings/claude-estimation.js +160 -0
  38. package/dist/encodings/claude-estimation.js.map +1 -0
  39. package/dist/encodings/index.d.ts +9 -0
  40. package/dist/encodings/index.d.ts.map +1 -0
  41. package/dist/encodings/index.js +13 -0
  42. package/dist/encodings/index.js.map +1 -0
  43. package/dist/encodings/o200k-base.d.ts +58 -0
  44. package/dist/encodings/o200k-base.d.ts.map +1 -0
  45. package/dist/encodings/o200k-base.js +191 -0
  46. package/dist/encodings/o200k-base.js.map +1 -0
  47. package/dist/encodings/p50k-base.d.ts +44 -0
  48. package/dist/encodings/p50k-base.d.ts.map +1 -0
  49. package/dist/encodings/p50k-base.js +64 -0
  50. package/dist/encodings/p50k-base.js.map +1 -0
  51. package/dist/index.d.ts +61 -0
  52. package/dist/index.d.ts.map +1 -0
  53. package/dist/index.js +109 -0
  54. package/dist/index.js.map +1 -0
  55. package/dist/models.d.ts +92 -0
  56. package/dist/models.d.ts.map +1 -0
  57. package/dist/models.js +320 -0
  58. package/dist/models.js.map +1 -0
  59. package/dist/tiktoken.d.ts +198 -0
  60. package/dist/tiktoken.d.ts.map +1 -0
  61. package/dist/tiktoken.js +331 -0
  62. package/dist/tiktoken.js.map +1 -0
  63. package/dist/tokenizer.d.ts +181 -0
  64. package/dist/tokenizer.d.ts.map +1 -0
  65. package/dist/tokenizer.js +436 -0
  66. package/dist/tokenizer.js.map +1 -0
  67. package/dist/types.d.ts +127 -0
  68. package/dist/types.d.ts.map +1 -0
  69. package/dist/types.js +6 -0
  70. package/dist/types.js.map +1 -0
  71. package/dist/utils.d.ts +152 -0
  72. package/dist/utils.d.ts.map +1 -0
  73. package/dist/utils.js +244 -0
  74. package/dist/utils.js.map +1 -0
  75. package/package.json +78 -0
@@ -0,0 +1,142 @@
1
+ /**
2
+ * cl100k_base Encoding
3
+ * Used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002
4
+ *
5
+ * This encoding has:
6
+ * - 100,256 tokens
7
+ * - Better handling of code and programming languages
8
+ * - Improved whitespace handling
9
+ */
10
+ import { BPETokenizer } from "../bpe.js";
11
/**
 * Encoding name constant.
 * Identifies this module's encoding ("cl100k_base"); it is the value a
 * Tokenizer built here reports as its `encodingName`.
 */
export const ENCODING_NAME = "cl100k_base";
15
/**
 * Create a cl100k_base tokenizer instance.
 *
 * Wraps a BPETokenizer configured for this encoding and exposes the
 * standard Tokenizer surface (encode / decode / countTokens).
 *
 * @returns Tokenizer instance
 */
export function createCL100kTokenizer() {
    const engine = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) {
            return engine.encode(text);
        },
        decode(tokens) {
            return engine.decode(tokens);
        },
        countTokens(text) {
            return engine.countTokens(text);
        },
    };
}
29
/**
 * Pre-computed token counts for common patterns.
 * Used to improve estimation accuracy: each key is a string that encodes
 * to exactly one token under cl100k_base, per the comments below.
 */
export const COMMON_TOKEN_PATTERNS = {
    // Contractions
    "'s": 1,
    "'t": 1,
    "'re": 1,
    "'ve": 1,
    "'m": 1,
    "'ll": 1,
    "'d": 1,
    // Common words (1 token each in cl100k)
    the: 1,
    and: 1,
    is: 1,
    are: 1,
    was: 1,
    were: 1,
    will: 1,
    would: 1,
    could: 1,
    should: 1,
    have: 1,
    has: 1,
    had: 1,
    been: 1,
    being: 1,
    this: 1,
    that: 1,
    these: 1,
    those: 1,
    with: 1,
    from: 1,
    into: 1,
    for: 1,
    // Programming keywords (1 token each)
    function: 1,
    const: 1,
    let: 1,
    var: 1,
    return: 1,
    if: 1,
    else: 1,
    while: 1,
    class: 1,
    interface: 1,
    type: 1,
    import: 1,
    export: 1,
    async: 1,
    await: 1,
    true: 1,
    false: 1,
    null: 1,
    undefined: 1,
    // Common symbols
    "=>": 1,
    "===": 1,
    "!==": 1,
    "&&": 1,
    "||": 1,
    "++": 1,
    "--": 1,
    "+=": 1,
    "-=": 1,
    "...": 1,
    // Punctuation
    ".": 1,
    ",": 1,
    ";": 1,
    ":": 1,
    "!": 1,
    "?": 1,
    "(": 1,
    ")": 1,
    "[": 1,
    "]": 1,
    "{": 1,
    "}": 1,
    "<": 1,
    ">": 1,
};
/**
 * Average characters per token for cl100k_base.
 * Based on empirical analysis of English text.
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.8;
/**
 * Token estimation adjustments for different content types.
 * Values below 1.0 shrink the base count (content tokenizes more
 * efficiently than prose); values above 1.0 inflate it.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    prose: 1.0, // Normal English text
    code: 0.85, // Code tends to be more token-efficient
    json: 0.75, // JSON is very structured
    markdown: 0.95, // NOTE(review): <1.0 means *fewer* tokens than prose despite extra syntax — confirm empirically
    html: 0.8, // HTML tags are often single tokens
    math: 1.2, // Mathematical notation uses more tokens
};
129
/**
 * Get estimated token count with content-type awareness.
 *
 * Runs the real cl100k tokenizer for a base count, then scales it by a
 * per-content-type heuristic multiplier (unknown types fall back to 1.0).
 *
 * @param text - Input text
 * @param contentType - Type of content
 * @returns Estimated token count (always at least 1)
 */
export function estimateTokensForContent(text, contentType = "prose") {
    const baseCount = createCL100kTokenizer().countTokens(text);
    let multiplier = CONTENT_TYPE_MULTIPLIERS[contentType];
    if (multiplier === undefined || multiplier === null) {
        multiplier = 1.0;
    }
    const scaled = Math.round(baseCount * multiplier);
    return Math.max(1, scaled);
}
142
+ //# sourceMappingURL=cl100k-base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cl100k-base.js","sourceRoot":"","sources":["../../src/encodings/cl100k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAGzC;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,aAAa,CAAC;AAEzD;;;;GAIG;AACH,MAAM,UAAU,qBAAqB;IACnC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAA2B;IAC3D,eAAe;IACf,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IAEP,wCAAwC;IACxC,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,EAAE,EAAE,CAAC;IACL,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,MAAM,EAAE,CAAC;IACT,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IAEN,sCAAsC;IACtC,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,MAAM,EAAE,CAAC;IACT,EAAE,EAAE,CAAC;IACL,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,SAAS,EAAE,CAAC;IACZ,IAAI,EAAE,CAAC;IACP,MAAM,EAAE,CAAC;IACT,MAAM,EAAE,CAAC;IACT,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,SAAS,EAAE,CAAC;IAEZ,iBAAiB;IACjB,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IACR,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;IAER,cAAc;IACd,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,
GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;IACN,GAAG,EAAE,CAAC;CACP,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,GAAG,EAAE,sBAAsB;IAClC,IAAI,EAAE,IAAI,EAAE,wCAAwC;IACpD,IAAI,EAAE,IAAI,EAAE,0BAA0B;IACtC,QAAQ,EAAE,IAAI,EAAE,mCAAmC;IACnD,IAAI,EAAE,GAAG,EAAE,oCAAoC;IAC/C,IAAI,EAAE,GAAG,EAAE,yCAAyC;CACrD,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,UAAU,wBAAwB,CACtC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,qBAAqB,EAAE,CAAC;IAC1C,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC;IAEhE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC;AACzD,CAAC"}
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Claude Estimation Encoding
3
+ *
4
+ * IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
5
+ *
6
+ * Claude (Anthropic) uses a proprietary tokenizer that is NOT publicly available.
7
+ * This encoding provides "safe" estimates that intentionally over-count tokens
8
+ * to prevent API truncation issues when working with Claude models.
9
+ *
10
+ * Based on research findings (see claude-tokenizer-research.md):
11
+ * - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
12
+ * - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content:
13
+ * - English articles: +16%
14
+ * - Math equations: +21%
15
+ * - Python code: +30%
16
+ * - Average ~3.5 characters per token (vs GPT-4's ~4)
17
+ *
18
+ * This encoding applies a 1.25x safety multiplier to ensure estimates err on
19
+ * the side of over-counting, preventing API truncation.
20
+ *
21
+ * For EXACT Claude token counts, use Anthropic's official API:
22
+ * @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
23
+ */
24
+ import type { Tokenizer, EncodingName } from "../types.js";
25
/**
 * Encoding name constant ("claude_estimation").
 */
export declare const ENCODING_NAME: EncodingName;
/**
 * Create a Claude estimation tokenizer instance
 *
 * This tokenizer provides SAFE estimates for Claude models by:
 * 1. Using cl100k_base patterns as a base (70% vocabulary overlap with Claude 1/2)
 * 2. Applying a 1.25x safety multiplier to account for Claude's higher token usage
 *
 * The estimates intentionally err on over-counting to prevent API truncation.
 *
 * @returns Tokenizer instance configured for Claude estimation
 *
 * @example
 * ```typescript
 * import { createClaudeEstimationTokenizer } from "tiktoken-ts";
 *
 * const tokenizer = createClaudeEstimationTokenizer();
 * const count = tokenizer.countTokens("Hello, Claude!");
 * // Returns a SAFE estimate (intentionally higher than actual)
 * ```
 */
export declare function createClaudeEstimationTokenizer(): Tokenizer;
/**
 * Average characters per token for Claude models.
 *
 * Research indicates Claude averages ~3.5 characters per token,
 * which is lower (less efficient) than GPT-4's ~4 characters per token.
 */
export declare const AVERAGE_CHARS_PER_TOKEN = 3.5;
/**
 * Estimated vocabulary size for Claude 3+.
 *
 * Based on reverse-engineering research by Sander Land, Claude 3's
 * vocabulary is estimated at ~22,000 tokens - remarkably small compared
 * to Mistral's 32K, GPT-4's 100K, or LLaMA 3's 128K.
 */
export declare const ESTIMATED_VOCAB_SIZE = 22000;
/**
 * Safety multiplier applied to token estimates.
 *
 * This multiplier ensures estimates err on over-counting to prevent
 * API truncation. Based on research showing Claude produces 16-30%
 * more tokens than GPT-4:
 * - English: +16%
 * - Math: +21%
 * - Code: +30%
 *
 * We use 1.25 (25%) as a safe middle ground.
 */
export declare const SAFETY_MULTIPLIER = 1.25;
/**
 * Token estimation adjustments for different content types.
 *
 * Claude's tokenizer has different efficiency characteristics than OpenAI's:
 * - Generally less efficient (more tokens per character)
 * - Especially less efficient for code (+30% vs GPT-4)
 * - CJK handling varies by input vs output context
 *
 * These multipliers are applied ON TOP of the base safety multiplier.
 */
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
/**
 * Get estimated token count with content-type awareness for Claude.
 *
 * The returned estimate includes both:
 * 1. The base safety multiplier (1.25x)
 * 2. Content-type specific adjustments
 *
 * @param text - Input text
 * @param contentType - Type of content (affects estimation accuracy)
 * @returns Safe estimated token count (intentionally over-counts)
 *
 * @example
 * ```typescript
 * import { estimateClaudeTokens } from "tiktoken-ts";
 *
 * // For English prose
 * const count = estimateClaudeTokens("Hello, world!", "prose");
 *
 * // For Python code (applies additional code multiplier)
 * const codeCount = estimateClaudeTokens(pythonCode, "code");
 * ```
 */
export declare function estimateClaudeTokens(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
/**
 * Get a CONSERVATIVE (high) token estimate for Claude.
 *
 * This function returns the highest reasonable estimate, suitable for:
 * - Calculating worst-case API costs
 * - Ensuring prompts fit within context limits
 * - Setting very safe max_tokens values
 *
 * It applies the maximum content multiplier (1.1 for code/math) on top
 * of the base safety multiplier (1.25x), resulting in ~1.375x over-estimation.
 *
 * @param text - Input text
 * @returns Conservative (high) token estimate
 */
export declare function estimateClaudeTokensConservative(text: string): number;
/**
 * Check if Claude token estimation applies safety multiplier.
 *
 * This is a convenience function that always returns true, confirming
 * that Claude estimates include a safety margin.
 *
 * @returns Always true (Claude estimation always includes safety multiplier)
 */
export declare function includesSafetyMultiplier(): boolean;
136
+ //# sourceMappingURL=claude-estimation.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"claude-estimation.d.ts","sourceRoot":"","sources":["../../src/encodings/claude-estimation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAAkC,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,+BAA+B,IAAI,SAAS,CAS3D;AAED;;;;;GAKG;AACH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C;;;;;;GAMG;AACH,eAAO,MAAM,oBAAoB,QAAQ,CAAC;AAE1C;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,iBAAiB,OAA2B,CAAC;AAE1D;;;;;;;;;GASG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO3D,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,gCAAgC,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAErE;AAED;;;;;;;GAOG;AACH,wBAAgB,wBAAwB,IAAI,OAAO,CAElD"}
@@ -0,0 +1,160 @@
1
+ /**
2
+ * Claude Estimation Encoding
3
+ *
4
+ * IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
5
+ *
6
+ * Claude (Anthropic) uses a proprietary tokenizer that is NOT publicly available.
7
+ * This encoding provides "safe" estimates that intentionally over-count tokens
8
+ * to prevent API truncation issues when working with Claude models.
9
+ *
10
+ * Based on research findings (see claude-tokenizer-research.md):
11
+ * - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
12
+ * - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content:
13
+ * - English articles: +16%
14
+ * - Math equations: +21%
15
+ * - Python code: +30%
16
+ * - Average ~3.5 characters per token (vs GPT-4's ~4)
17
+ *
18
+ * This encoding applies a 1.25x safety multiplier to ensure estimates err on
19
+ * the side of over-counting, preventing API truncation.
20
+ *
21
+ * For EXACT Claude token counts, use Anthropic's official API:
22
+ * @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
23
+ */
24
+ import { BPETokenizer, CLAUDE_SAFETY_MULTIPLIER } from "../bpe.js";
25
/**
 * Encoding name constant.
 * Identifies this estimation-only encoding ("claude_estimation").
 */
export const ENCODING_NAME = "claude_estimation";
29
/**
 * Create a Claude estimation tokenizer instance
 *
 * This tokenizer provides SAFE estimates for Claude models by:
 * 1. Using cl100k_base patterns as a base (70% vocabulary overlap with Claude 1/2)
 * 2. Applying a 1.25x safety multiplier to account for Claude's higher token usage
 *
 * The estimates intentionally err on over-counting to prevent API truncation.
 *
 * @returns Tokenizer instance configured for Claude estimation
 *
 * @example
 * ```typescript
 * import { createClaudeEstimationTokenizer } from "tiktoken-ts";
 *
 * const tokenizer = createClaudeEstimationTokenizer();
 * const count = tokenizer.countTokens("Hello, Claude!");
 * // Returns a SAFE estimate (intentionally higher than actual)
 * ```
 */
export function createClaudeEstimationTokenizer() {
    const engine = new BPETokenizer(ENCODING_NAME);
    return {
        encodingName: ENCODING_NAME,
        encode(text) {
            return engine.encode(text);
        },
        decode(tokens) {
            return engine.decode(tokens);
        },
        countTokens(text) {
            return engine.countTokens(text);
        },
    };
}
58
/**
 * Average characters per token for Claude models.
 *
 * Research indicates Claude averages ~3.5 characters per token,
 * which is lower (less efficient) than GPT-4's ~4 characters per token.
 */
export const AVERAGE_CHARS_PER_TOKEN = 3.5;
/**
 * Estimated vocabulary size for Claude 3+.
 *
 * Based on reverse-engineering research by Sander Land, Claude 3's
 * vocabulary is estimated at ~22,000 tokens - remarkably small compared
 * to Mistral's 32K, GPT-4's 100K, or LLaMA 3's 128K.
 */
export const ESTIMATED_VOCAB_SIZE = 22000;
/**
 * Safety multiplier applied to token estimates.
 *
 * This multiplier ensures estimates err on over-counting to prevent
 * API truncation. Based on research showing Claude produces 16-30%
 * more tokens than GPT-4:
 * - English: +16%
 * - Math: +21%
 * - Code: +30%
 *
 * We use 1.25 (25%) as a safe middle ground.
 * Re-exported from bpe.js so the BPETokenizer and this module agree.
 */
export const SAFETY_MULTIPLIER = CLAUDE_SAFETY_MULTIPLIER;
/**
 * Token estimation adjustments for different content types.
 *
 * Claude's tokenizer has different efficiency characteristics than OpenAI's:
 * - Generally less efficient (more tokens per character)
 * - Especially less efficient for code (+30% vs GPT-4)
 * - CJK handling varies by input vs output context
 *
 * These multipliers are applied ON TOP of the base safety multiplier.
 */
export const CONTENT_TYPE_MULTIPLIERS = {
    prose: 1.0, // Base English text
    code: 1.1, // Code is particularly inefficient on Claude (+30% vs GPT-4)
    json: 1.0, // JSON follows prose patterns
    markdown: 1.0, // Markdown similar to prose
    html: 0.95, // HTML tags slightly more efficient
    math: 1.1, // Math equations use more tokens on Claude (+21% vs GPT-4)
};
104
/**
 * Get estimated token count with content-type awareness for Claude.
 *
 * The returned estimate includes both:
 * 1. The base safety multiplier (1.25x)
 * 2. Content-type specific adjustments
 *
 * @param text - Input text
 * @param contentType - Type of content (affects estimation accuracy)
 * @returns Safe estimated token count (intentionally over-counts)
 *
 * @example
 * ```typescript
 * import { estimateClaudeTokens } from "tiktoken-ts";
 *
 * // For English prose
 * const count = estimateClaudeTokens("Hello, world!", "prose");
 *
 * // For Python code (applies additional code multiplier)
 * const codeCount = estimateClaudeTokens(pythonCode, "code");
 * ```
 */
export function estimateClaudeTokens(text, contentType = "prose") {
    // countTokens already folds in the 1.25x base safety multiplier.
    const safeBase = createClaudeEstimationTokenizer().countTokens(text);
    const adjustment = CONTENT_TYPE_MULTIPLIERS[contentType] ?? 1.0;
    // Round up, never below one token.
    const scaled = Math.ceil(safeBase * adjustment);
    return scaled < 1 ? 1 : scaled;
}
132
/**
 * Get a CONSERVATIVE (high) token estimate for Claude.
 *
 * This function returns the highest reasonable estimate, suitable for:
 * - Calculating worst-case API costs
 * - Ensuring prompts fit within context limits
 * - Setting very safe max_tokens values
 *
 * It applies the maximum content multiplier (1.1 for code/math) on top
 * of the base safety multiplier (1.25x), resulting in ~1.375x over-estimation.
 *
 * @param text - Input text
 * @returns Conservative (high) token estimate
 */
export function estimateClaudeTokensConservative(text) {
    return estimateClaudeTokens(text, "code"); // "code" carries the highest multiplier (1.1)
}
149
/**
 * Check if Claude token estimation applies safety multiplier.
 *
 * This is a convenience function that always returns true, confirming
 * that Claude estimates include a safety margin.
 *
 * @returns Always true (Claude estimation always includes safety multiplier)
 */
export function includesSafetyMultiplier() {
    // The safety margin is baked into every estimate, so this is constant.
    const safetyMarginApplied = true;
    return safetyMarginApplied;
}
160
+ //# sourceMappingURL=claude-estimation.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"claude-estimation.js","sourceRoot":"","sources":["../../src/encodings/claude-estimation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAE,YAAY,EAAE,wBAAwB,EAAE,MAAM,WAAW,CAAC;AAGnE;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAiB,mBAAmB,CAAC;AAE/D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,+BAA+B;IAC7C,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,aAAa,CAAC,CAAC;IAE5C,OAAO;QACL,YAAY,EAAE,aAAa;QAC3B,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;QAC1C,MAAM,EAAE,CAAC,MAAgB,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC;QAChD,WAAW,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC;KACrD,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG,KAAK,CAAC;AAE1C;;;;;;;;;;;GAWG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG,wBAAwB,CAAC;AAE1D;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA2B;IAC9D,KAAK,EAAE,GAAG,EAAE,oBAAoB;IAChC,IAAI,EAAE,GAAG,EAAE,6DAA6D;IACxE,IAAI,EAAE,GAAG,EAAE,8BAA8B;IACzC,QAAQ,EAAE,GAAG,EAAE,4BAA4B;IAC3C,IAAI,EAAE,IAAI,EAAE,oCAAoC;IAChD,IAAI,EAAE,GAAG,EAAE,2DAA2D;CACvE,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,cAAqD,OAAO;IAE5D,MAAM,SAAS,GAAG,+BAA+B,EAAE,CAAC;IACpD,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,qCAAqC;IACpF,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC;IAEvE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,iBAAiB,CAAC,CAAC,CAAC;AAC/D,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,gCAAgC,CAAC,IAAY;IAC3D,OAAO,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC,0BAA0B;AACvE,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,wBAAwB;IACtC,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,9 @@
1
/**
 * Encoding Exports
 * Central export for all encoding implementations
 */
// cl100k_base (GPT-4, GPT-3.5-turbo)
export { createCL100kTokenizer, ENCODING_NAME as CL100K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as CL100K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as CL100K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as CL100K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateCL100kTokens, } from "./cl100k-base.js";
// o200k_base (GPT-4o, GPT-4.1, GPT-5)
export { createO200kTokenizer, ENCODING_NAME as O200K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as O200K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as O200K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as O200K_CONTENT_MULTIPLIERS, LANGUAGE_EFFICIENCY as O200K_LANGUAGE_EFFICIENCY, estimateTokensForContent as estimateO200kTokens, estimateTokensForLanguage as estimateO200kTokensForLanguage, } from "./o200k-base.js";
// p50k_base (Legacy Codex models)
export { createP50kTokenizer, ENCODING_NAME as P50K_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as P50K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as P50K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateP50kTokens, } from "./p50k-base.js";
// claude_estimation (Anthropic Claude models - estimation only)
export { createClaudeEstimationTokenizer, ENCODING_NAME as CLAUDE_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as CLAUDE_AVG_CHARS, ESTIMATED_VOCAB_SIZE as CLAUDE_VOCAB_SIZE, SAFETY_MULTIPLIER as CLAUDE_SAFETY_MULT, CONTENT_TYPE_MULTIPLIERS as CLAUDE_CONTENT_MULTIPLIERS, estimateClaudeTokens, estimateClaudeTokensConservative, includesSafetyMultiplier as claudeIncludesSafetyMultiplier, } from "./claude-estimation.js";
9
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/encodings/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EACL,qBAAqB,EACrB,aAAa,IAAI,oBAAoB,EACrC,qBAAqB,IAAI,qBAAqB,EAC9C,uBAAuB,IAAI,gBAAgB,EAC3C,wBAAwB,IAAI,0BAA0B,EACtD,wBAAwB,IAAI,oBAAoB,GACjD,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,oBAAoB,EACpB,aAAa,IAAI,mBAAmB,EACpC,qBAAqB,IAAI,oBAAoB,EAC7C,uBAAuB,IAAI,eAAe,EAC1C,wBAAwB,IAAI,yBAAyB,EACrD,mBAAmB,IAAI,yBAAyB,EAChD,wBAAwB,IAAI,mBAAmB,EAC/C,yBAAyB,IAAI,8BAA8B,GAC5D,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,mBAAmB,EACnB,aAAa,IAAI,kBAAkB,EACnC,uBAAuB,IAAI,cAAc,EACzC,wBAAwB,IAAI,wBAAwB,EACpD,wBAAwB,IAAI,kBAAkB,GAC/C,MAAM,gBAAgB,CAAC;AAGxB,OAAO,EACL,+BAA+B,EAC/B,aAAa,IAAI,oBAAoB,EACrC,uBAAuB,IAAI,gBAAgB,EAC3C,oBAAoB,IAAI,iBAAiB,EACzC,iBAAiB,IAAI,kBAAkB,EACvC,wBAAwB,IAAI,0BAA0B,EACtD,oBAAoB,EACpB,gCAAgC,EAChC,wBAAwB,IAAI,8BAA8B,GAC3D,MAAM,wBAAwB,CAAC"}
@@ -0,0 +1,13 @@
1
/**
 * Encoding Exports
 * Central export for all encoding implementations.
 * Each encoding's colliding names (ENCODING_NAME, multipliers, etc.) are
 * re-exported under prefixed aliases to keep this barrel unambiguous.
 */
// cl100k_base (GPT-4, GPT-3.5-turbo)
export { createCL100kTokenizer, ENCODING_NAME as CL100K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as CL100K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as CL100K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as CL100K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateCL100kTokens, } from "./cl100k-base.js";
// o200k_base (GPT-4o, GPT-4.1, GPT-5)
export { createO200kTokenizer, ENCODING_NAME as O200K_ENCODING_NAME, COMMON_TOKEN_PATTERNS as O200K_TOKEN_PATTERNS, AVERAGE_CHARS_PER_TOKEN as O200K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as O200K_CONTENT_MULTIPLIERS, LANGUAGE_EFFICIENCY as O200K_LANGUAGE_EFFICIENCY, estimateTokensForContent as estimateO200kTokens, estimateTokensForLanguage as estimateO200kTokensForLanguage, } from "./o200k-base.js";
// p50k_base (Legacy Codex models)
export { createP50kTokenizer, ENCODING_NAME as P50K_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as P50K_AVG_CHARS, CONTENT_TYPE_MULTIPLIERS as P50K_CONTENT_MULTIPLIERS, estimateTokensForContent as estimateP50kTokens, } from "./p50k-base.js";
// claude_estimation (Anthropic Claude models - estimation only)
export { createClaudeEstimationTokenizer, ENCODING_NAME as CLAUDE_ENCODING_NAME, AVERAGE_CHARS_PER_TOKEN as CLAUDE_AVG_CHARS, ESTIMATED_VOCAB_SIZE as CLAUDE_VOCAB_SIZE, SAFETY_MULTIPLIER as CLAUDE_SAFETY_MULT, CONTENT_TYPE_MULTIPLIERS as CLAUDE_CONTENT_MULTIPLIERS, estimateClaudeTokens, estimateClaudeTokensConservative, includesSafetyMultiplier as claudeIncludesSafetyMultiplier, } from "./claude-estimation.js";
13
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/encodings/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,qCAAqC;AACrC,OAAO,EACL,qBAAqB,EACrB,aAAa,IAAI,oBAAoB,EACrC,qBAAqB,IAAI,qBAAqB,EAC9C,uBAAuB,IAAI,gBAAgB,EAC3C,wBAAwB,IAAI,0BAA0B,EACtD,wBAAwB,IAAI,oBAAoB,GACjD,MAAM,kBAAkB,CAAC;AAE1B,sCAAsC;AACtC,OAAO,EACL,oBAAoB,EACpB,aAAa,IAAI,mBAAmB,EACpC,qBAAqB,IAAI,oBAAoB,EAC7C,uBAAuB,IAAI,eAAe,EAC1C,wBAAwB,IAAI,yBAAyB,EACrD,mBAAmB,IAAI,yBAAyB,EAChD,wBAAwB,IAAI,mBAAmB,EAC/C,yBAAyB,IAAI,8BAA8B,GAC5D,MAAM,iBAAiB,CAAC;AAEzB,kCAAkC;AAClC,OAAO,EACL,mBAAmB,EACnB,aAAa,IAAI,kBAAkB,EACnC,uBAAuB,IAAI,cAAc,EACzC,wBAAwB,IAAI,wBAAwB,EACpD,wBAAwB,IAAI,kBAAkB,GAC/C,MAAM,gBAAgB,CAAC;AAExB,gEAAgE;AAChE,OAAO,EACL,+BAA+B,EAC/B,aAAa,IAAI,oBAAoB,EACrC,uBAAuB,IAAI,gBAAgB,EAC3C,oBAAoB,IAAI,iBAAiB,EACzC,iBAAiB,IAAI,kBAAkB,EACvC,wBAAwB,IAAI,0BAA0B,EACtD,oBAAoB,EACpB,gCAAgC,EAChC,wBAAwB,IAAI,8BAA8B,GAC3D,MAAM,wBAAwB,CAAC"}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * o200k_base Encoding
3
+ * Used by GPT-4o, GPT-4.1, GPT-5 models
4
+ *
5
+ * This encoding has:
6
+ * - 200,000 tokens (larger vocabulary)
7
+ * - Better handling of non-English languages
8
+ * - Improved efficiency for common patterns
9
+ * - Better support for multimodal inputs
10
+ */
11
+ import type { Tokenizer, EncodingName } from "../types.js";
12
/**
 * Encoding name constant ("o200k_base").
 */
export declare const ENCODING_NAME: EncodingName;
/**
 * Create an o200k_base tokenizer instance
 *
 * @returns Tokenizer instance
 */
export declare function createO200kTokenizer(): Tokenizer;
/**
 * Pre-computed token counts for common patterns
 * o200k_base has a larger vocabulary, so more words are single tokens
 */
export declare const COMMON_TOKEN_PATTERNS: Record<string, number>;
/**
 * Average characters per token for o200k_base
 * Slightly higher than cl100k's 3.8 due to larger vocabulary
 */
export declare const AVERAGE_CHARS_PER_TOKEN = 4;
/**
 * Token estimation adjustments for different content types
 * o200k is generally more efficient across all types
 */
export declare const CONTENT_TYPE_MULTIPLIERS: Record<string, number>;
/**
 * Language-specific efficiency factors for o200k_base
 * The larger vocabulary handles more languages efficiently
 */
export declare const LANGUAGE_EFFICIENCY: Record<string, number>;
/**
 * Get estimated token count with content-type awareness
 *
 * @param text - Input text
 * @param contentType - Type of content
 * @returns Estimated token count
 */
export declare function estimateTokensForContent(text: string, contentType?: keyof typeof CONTENT_TYPE_MULTIPLIERS): number;
/**
 * Get estimated token count with language awareness
 *
 * @param text - Input text
 * @param language - ISO 639-1 language code
 * @returns Estimated token count
 */
export declare function estimateTokensForLanguage(text: string, language: string): number;
58
+ //# sourceMappingURL=o200k-base.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"o200k-base.d.ts","sourceRoot":"","sources":["../../src/encodings/o200k-base.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3D;;GAEG;AACH,eAAO,MAAM,aAAa,EAAE,YAA2B,CAAC;AAExD;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,SAAS,CAShD;AAED;;;GAGG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgGxD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,uBAAuB,IAAM,CAAC;AAE3C;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAQ3D,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAgBtD,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,MAAM,OAAO,wBAAkC,GAC3D,MAAM,CAMR;AAED;;;;;;GAMG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,MAAM,GACf,MAAM,CAMR"}