ai-token-estimator 1.0.3 → 1.1.0
This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- package/README.md +55 -0
- package/dist/index.cjs +142 -13
- package/dist/index.d.cts +61 -1
- package/dist/index.d.ts +61 -1
- package/dist/index.js +138 -13
- package/package.json +6 -2
package/README.md
CHANGED

@@ -39,6 +39,29 @@ console.log(getAvailableModels());
 // ['gpt-5.2', 'gpt-4o', 'claude-opus-4.5', 'gemini-3-pro', ...]
 ```
 
+## Exact OpenAI tokenization (BPE)
+
+This package includes **exact tokenization for OpenAI models** using a tiktoken-compatible BPE tokenizer (via `gpt-tokenizer`).
+
+Notes:
+- Encodings are **lazy-loaded on first use** (one-time cost per encoding).
+- Exact tokenization is **slower** than heuristic estimation; `estimate()` defaults to `'heuristic'` to keep existing behavior fast.
+- `encode` / `decode` and `estimate({ tokenizer: 'openai_exact' })` require **Node.js** (uses `node:module` under the hood).
+
+```ts
+import { encode, decode } from 'ai-token-estimator';
+
+const text = 'Hello, world!';
+const tokens = encode(text, { model: 'gpt-5.1' }); // exact OpenAI token IDs
+const roundTrip = decode(tokens, { model: 'gpt-5.1' });
+
+console.log(tokens.length);
+console.log(roundTrip); // "Hello, world!"
+```
+
+Supported encodings:
+`r50k_base`, `p50k_base`, `p50k_edit`, `cl100k_base`, `o200k_base`, `o200k_harmony`
+
 ## API Reference
 
 ### `estimate(input: EstimateInput): EstimateOutput`
@@ -52,6 +75,7 @@ interface EstimateInput {
   text: string; // The text to estimate tokens for
   model: string; // Model ID (e.g., 'gpt-4o', 'claude-opus-4.5')
   rounding?: 'ceil' | 'round' | 'floor'; // Rounding strategy (default: 'ceil')
+  tokenizer?: 'heuristic' | 'openai_exact' | 'auto'; // Token counting strategy (default: 'heuristic')
 }
 ```
 
@@ -64,13 +88,36 @@ interface EstimateOutput {
   estimatedTokens: number; // Estimated token count (integer)
   estimatedInputCost: number; // Estimated cost in USD
   charsPerToken: number; // The ratio used for this model
+  tokenizerMode?: 'heuristic' | 'openai_exact' | 'auto'; // Which strategy was used
+  encodingUsed?: string; // OpenAI encoding when using exact tokenization
 }
 ```
 
+### `countTokens(input: TokenCountInput): TokenCountOutput`
+
+Counts tokens for a given model:
+- OpenAI models: **exact** BPE tokenization
+- Other providers: heuristic estimate
+
+```ts
+import { countTokens } from 'ai-token-estimator';
+
+const result = countTokens({ text: 'Hello, world!', model: 'gpt-5.1' });
+// { tokens: 4, exact: true, encoding: 'o200k_base' }
+```
+
 ### `getAvailableModels(): string[]`
 
 Returns an array of all supported model IDs.
 
+### `encode(text: string, options?: EncodeOptions): number[]`
+
+Encodes text into **OpenAI token IDs** using tiktoken-compatible BPE tokenization.
+
+### `decode(tokens: Iterable<number>, options?: { encoding?: OpenAIEncoding; model?: string }): string`
+
+Decodes OpenAI token IDs back into text using the selected encoding/model.
+
 ### `getModelConfig(model: string): ModelConfig`
 
 Returns the configuration for a specific model. Throws if the model is not found.
@@ -108,6 +155,14 @@ This package counts Unicode code points, not UTF-16 code units. This means:
 - Accented characters count correctly
 - Most source code characters count as 1
 
+## Benchmarks (repo only)
+
+This repository includes a small benchmark script to compare heuristic vs exact OpenAI tokenization:
+
+```bash
+npm run benchmark:tokenizer
+```
+
 <!-- SUPPORTED_MODELS_START -->
 ## Supported Models
 
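The README additions above introduce the `tokenizer` option on `estimate()`. As a quick illustration of the documented behavior (a sketch based on the README; the logged values are illustrative, not captured output):

```ts
import { estimate } from 'ai-token-estimator';

// 'auto' tries exact BPE tokenization for OpenAI models and silently
// falls back to the chars-per-token heuristic for other providers.
const result = estimate({
  text: 'Hello, world!',
  model: 'gpt-5.1',
  tokenizer: 'auto',
});

console.log(result.estimatedTokens);
console.log(result.tokenizerMode); // 'openai_exact' for an OpenAI model
console.log(result.encodingUsed);  // e.g. 'o200k_base'
```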
package/dist/index.cjs
CHANGED

@@ -22,6 +22,9 @@ var index_exports = {};
 __export(index_exports, {
   DEFAULT_MODELS: () => DEFAULT_MODELS,
   LAST_UPDATED: () => LAST_UPDATED,
+  countTokens: () => countTokens,
+  decode: () => decode,
+  encode: () => encode,
   estimate: () => estimate,
   getAvailableModels: () => getAvailableModels,
   getModelConfig: () => getModelConfig
@@ -404,6 +407,79 @@ function getAvailableModels() {
   return Object.keys(DEFAULT_MODELS);
 }
 
+// src/openai-bpe.ts
+var import_node_module = require("module");
+var import_constants = require("gpt-tokenizer/constants");
+var import_mapping = require("gpt-tokenizer/mapping");
+var import_meta = {};
+var requireBase = typeof __filename === "string" && __filename.length > 0 ? __filename : import_meta.url;
+var NODE_REQUIRE = (0, import_node_module.createRequire)(requireBase);
+var ENCODING_MODULES = {
+  r50k_base: "gpt-tokenizer/cjs/encoding/r50k_base",
+  p50k_base: "gpt-tokenizer/cjs/encoding/p50k_base",
+  p50k_edit: "gpt-tokenizer/cjs/encoding/p50k_edit",
+  cl100k_base: "gpt-tokenizer/cjs/encoding/cl100k_base",
+  o200k_base: "gpt-tokenizer/cjs/encoding/o200k_base",
+  o200k_harmony: "gpt-tokenizer/cjs/encoding/o200k_harmony"
+};
+var encodingApiCache = /* @__PURE__ */ new Map();
+function getEncodingApi(encoding) {
+  const cached = encodingApiCache.get(encoding);
+  if (cached) return cached;
+  const modulePath = ENCODING_MODULES[encoding];
+  const mod = NODE_REQUIRE(modulePath);
+  const api = { encode: mod.encode, decode: mod.decode };
+  encodingApiCache.set(encoding, api);
+  return api;
+}
+function resolveEncoding(selector) {
+  if (selector?.encoding) {
+    return selector.encoding;
+  }
+  const model = selector?.model?.trim();
+  if (model) {
+    const mapped = import_mapping.modelToEncodingMap[model];
+    if (mapped) {
+      return mapped;
+    }
+  }
+  return import_mapping.DEFAULT_ENCODING;
+}
+function getOpenAIEncoding(selector) {
+  return resolveEncoding(selector);
+}
+function toGptTokenizerEncodeOptions(allowSpecial) {
+  const mode = allowSpecial ?? "none_raise";
+  switch (mode) {
+    case "all":
+      return {
+        allowedSpecial: import_constants.ALL_SPECIAL_TOKENS,
+        disallowedSpecial: /* @__PURE__ */ new Set()
+      };
+    case "none":
+      return {
+        allowedSpecial: /* @__PURE__ */ new Set(),
+        disallowedSpecial: /* @__PURE__ */ new Set()
+      };
+    case "none_raise":
+    default:
+      return {
+        disallowedSpecial: import_constants.ALL_SPECIAL_TOKENS
+      };
+  }
+}
+function encode(text, options) {
+  const encoding = resolveEncoding(options);
+  const api = getEncodingApi(encoding);
+  const encodeOptions = toGptTokenizerEncodeOptions(options?.allowSpecial);
+  return api.encode(text, encodeOptions);
+}
+function decode(tokens, options) {
+  const encoding = resolveEncoding(options);
+  const api = getEncodingApi(encoding);
+  return api.decode(tokens);
+}
+
 // src/estimator.ts
 function countCodePoints(text) {
   let count = 0;
@@ -413,21 +489,43 @@ function countCodePoints(text) {
   return count;
 }
 function estimate(input) {
-  const { text, model, rounding = "ceil" } = input;
+  const { text, model, rounding = "ceil", tokenizer = "heuristic" } = input;
   const config = getModelConfig(model);
   const characterCount = countCodePoints(text);
-  const rawTokens = characterCount / config.charsPerToken;
+  const isNonOpenAIModel2 = model.startsWith("claude-") || model.startsWith("gemini-");
   let estimatedTokens;
-  switch (rounding) {
-    case "floor":
-      estimatedTokens = Math.floor(rawTokens);
-      break;
-    case "round":
-      estimatedTokens = Math.round(rawTokens);
-      break;
-    case "ceil":
-    default:
-      estimatedTokens = Math.ceil(rawTokens);
+  let tokenizerModeUsed = "heuristic";
+  let encodingUsed;
+  const shouldTryExact = tokenizer === "openai_exact" || tokenizer === "auto";
+  if (shouldTryExact && !isNonOpenAIModel2) {
+    try {
+      estimatedTokens = encode(text, { model, allowSpecial: "none" }).length;
+      tokenizerModeUsed = "openai_exact";
+      encodingUsed = getOpenAIEncoding({ model });
+    } catch (error) {
+      if (tokenizer === "openai_exact") {
+        throw error;
+      }
+    }
+  } else if (tokenizer === "openai_exact" && isNonOpenAIModel2) {
+    throw new Error(
+      `Tokenizer mode "openai_exact" requested for non-OpenAI model: "${model}"`
+    );
+  }
+  if (estimatedTokens === void 0) {
+    const rawTokens = characterCount / config.charsPerToken;
+    switch (rounding) {
+      case "floor":
+        estimatedTokens = Math.floor(rawTokens);
+        break;
+      case "round":
+        estimatedTokens = Math.round(rawTokens);
+        break;
+      case "ceil":
+      default:
+        estimatedTokens = Math.ceil(rawTokens);
+    }
+    tokenizerModeUsed = "heuristic";
   }
   const estimatedInputCost = estimatedTokens * config.inputCostPerMillion / 1e6;
   return {
@@ -435,13 +533,44 @@ function estimate(input) {
     characterCount,
     estimatedTokens,
     estimatedInputCost,
-    charsPerToken: config.charsPerToken
+    charsPerToken: config.charsPerToken,
+    tokenizerMode: tokenizerModeUsed,
+    encodingUsed
   };
 }
+
+// src/token-counter.ts
+function isNonOpenAIModel(model) {
+  return model.startsWith("claude-") || model.startsWith("gemini-");
+}
+function countTokens(input) {
+  const { text, model } = input;
+  if (isNonOpenAIModel(model)) {
+    return {
+      tokens: estimate({ text, model }).estimatedTokens,
+      exact: false
+    };
+  }
+  try {
+    return {
+      tokens: encode(text, { model, allowSpecial: "none" }).length,
+      exact: true,
+      encoding: getOpenAIEncoding({ model })
+    };
+  } catch {
+    return {
+      tokens: estimate({ text, model }).estimatedTokens,
+      exact: false
+    };
+  }
+}
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   DEFAULT_MODELS,
   LAST_UPDATED,
+  countTokens,
+  decode,
+  encode,
   estimate,
   getAvailableModels,
   getModelConfig
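The `getEncodingApi` helper above is the lazy loading the README promises: each encoding's BPE tables are loaded via `createRequire` only on first use and then memoized. A condensed sketch of that pattern (the `loadOnce` name is ours; the real module paths are the `gpt-tokenizer/cjs/encoding/*` entries listed above):

```ts
import { createRequire } from 'node:module';

// Resolve requires relative to this module (the bundle uses __filename
// as the base in CJS and import.meta.url in ESM).
const nodeRequire = createRequire(import.meta.url);

const cache = new Map<string, unknown>();

function loadOnce(key: string, modulePath: string): unknown {
  const hit = cache.get(key);
  if (hit !== undefined) return hit;
  const mod = nodeRequire(modulePath); // heavy BPE tables load here, once
  cache.set(key, mod);
  return mod;
}
```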
package/dist/index.d.cts
CHANGED

@@ -7,6 +7,7 @@ interface ModelConfig {
     /** Cost in USD per 1 million input tokens */
     inputCostPerMillion: number;
 }
+type TokenizerMode = 'heuristic' | 'openai_exact' | 'auto';
 /**
  * Input parameters for the estimate function.
  */
@@ -17,6 +18,13 @@ interface EstimateInput {
     model: string;
     /** Rounding strategy for token count (default: 'ceil') */
     rounding?: 'ceil' | 'round' | 'floor';
+    /**
+     * Token counting strategy.
+     * - `heuristic` (default): use chars-per-token ratios
+     * - `openai_exact`: use OpenAI BPE tokenization (throws if non-OpenAI model)
+     * - `auto`: use OpenAI BPE for OpenAI models, otherwise heuristic
+     */
+    tokenizer?: TokenizerMode;
 }
 /**
  * Output from the estimate function.
@@ -32,6 +40,10 @@ interface EstimateOutput {
     estimatedInputCost: number;
     /** The chars-per-token ratio used */
     charsPerToken: number;
+    /** Which tokenizer strategy was used */
+    tokenizerMode?: TokenizerMode;
+    /** OpenAI encoding used when tokenizerMode is `openai_exact` */
+    encodingUsed?: string;
 }
 
 /**
@@ -81,4 +93,52 @@ declare function getModelConfig(model: string): ModelConfig;
  */
 declare function getAvailableModels(): string[];
 
-export { DEFAULT_MODELS, type EstimateInput, type EstimateOutput, LAST_UPDATED, type ModelConfig, estimate, getAvailableModels, getModelConfig };
+type OpenAIEncoding = 'r50k_base' | 'p50k_base' | 'p50k_edit' | 'cl100k_base' | 'o200k_base' | 'o200k_harmony';
+type SpecialTokenHandling = 'all' | 'none' | 'none_raise';
+interface EncodeOptions {
+    /**
+     * Explicit OpenAI encoding override.
+     * When provided, this takes precedence over `model`.
+     */
+    encoding?: OpenAIEncoding;
+    /**
+     * OpenAI model ID used to select the appropriate encoding.
+     */
+    model?: string;
+    /**
+     * How special tokens are handled.
+     * - `none_raise` (default): throw if special tokens appear
+     * - `none`: treat special tokens as regular text
+     * - `all`: allow special tokens and encode them as special token IDs
+     */
+    allowSpecial?: SpecialTokenHandling;
+}
+/**
+ * Encode text into OpenAI token IDs using tiktoken-compatible BPE encoding.
+ *
+ * This is exact tokenization for OpenAI models (unlike heuristic estimators).
+ */
+declare function encode(text: string, options?: EncodeOptions): number[];
+/**
+ * Decode OpenAI token IDs into text using tiktoken-compatible BPE encoding.
+ */
+declare function decode(tokens: Iterable<number>, options?: Pick<EncodeOptions, 'encoding' | 'model'>): string;
+
+interface TokenCountInput {
+    text: string;
+    model: string;
+}
+interface TokenCountOutput {
+    tokens: number;
+    exact: boolean;
+    encoding?: OpenAIEncoding;
+}
+/**
+ * Count tokens for a given model.
+ *
+ * - OpenAI models: exact BPE tokenization
+ * - Other providers: heuristic estimate (chars-per-token)
+ */
+declare function countTokens(input: TokenCountInput): TokenCountOutput;
+
+export { DEFAULT_MODELS, type EncodeOptions, type EstimateInput, type EstimateOutput, LAST_UPDATED, type ModelConfig, type OpenAIEncoding, type SpecialTokenHandling, type TokenCountInput, type TokenCountOutput, type TokenizerMode, countTokens, decode, encode, estimate, getAvailableModels, getModelConfig };
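Two details in the declarations above are easy to miss: `allowSpecial` decides what happens when input contains a special token such as `<|endoftext|>`, and an explicit `encoding` takes precedence over `model`. A sketch based on the doc comments (behavior as documented, not verified output):

```ts
import { encode } from 'ai-token-estimator';

// Default ('none_raise'): encode() throws if a special token appears.
// 'none': the special token is tokenized as ordinary text instead.
const asPlainText = encode('<|endoftext|>', {
  model: 'gpt-4o',
  allowSpecial: 'none',
});

// 'all': the special token becomes its single special token ID.
const asSpecialToken = encode('<|endoftext|>', {
  model: 'gpt-4o',
  allowSpecial: 'all',
});

// An explicit encoding wins over the model-based lookup.
const viaEncoding = encode('hello', { encoding: 'cl100k_base' });
```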
package/dist/index.d.ts
CHANGED

@@ -7,6 +7,7 @@ interface ModelConfig {
     /** Cost in USD per 1 million input tokens */
     inputCostPerMillion: number;
 }
+type TokenizerMode = 'heuristic' | 'openai_exact' | 'auto';
 /**
  * Input parameters for the estimate function.
  */
@@ -17,6 +18,13 @@ interface EstimateInput {
     model: string;
     /** Rounding strategy for token count (default: 'ceil') */
     rounding?: 'ceil' | 'round' | 'floor';
+    /**
+     * Token counting strategy.
+     * - `heuristic` (default): use chars-per-token ratios
+     * - `openai_exact`: use OpenAI BPE tokenization (throws if non-OpenAI model)
+     * - `auto`: use OpenAI BPE for OpenAI models, otherwise heuristic
+     */
+    tokenizer?: TokenizerMode;
 }
 /**
  * Output from the estimate function.
@@ -32,6 +40,10 @@ interface EstimateOutput {
     estimatedInputCost: number;
     /** The chars-per-token ratio used */
     charsPerToken: number;
+    /** Which tokenizer strategy was used */
+    tokenizerMode?: TokenizerMode;
+    /** OpenAI encoding used when tokenizerMode is `openai_exact` */
+    encodingUsed?: string;
 }
 
 /**
@@ -81,4 +93,52 @@ declare function getModelConfig(model: string): ModelConfig;
  */
 declare function getAvailableModels(): string[];
 
-export { DEFAULT_MODELS, type EstimateInput, type EstimateOutput, LAST_UPDATED, type ModelConfig, estimate, getAvailableModels, getModelConfig };
+type OpenAIEncoding = 'r50k_base' | 'p50k_base' | 'p50k_edit' | 'cl100k_base' | 'o200k_base' | 'o200k_harmony';
+type SpecialTokenHandling = 'all' | 'none' | 'none_raise';
+interface EncodeOptions {
+    /**
+     * Explicit OpenAI encoding override.
+     * When provided, this takes precedence over `model`.
+     */
+    encoding?: OpenAIEncoding;
+    /**
+     * OpenAI model ID used to select the appropriate encoding.
+     */
+    model?: string;
+    /**
+     * How special tokens are handled.
+     * - `none_raise` (default): throw if special tokens appear
+     * - `none`: treat special tokens as regular text
+     * - `all`: allow special tokens and encode them as special token IDs
+     */
+    allowSpecial?: SpecialTokenHandling;
+}
+/**
+ * Encode text into OpenAI token IDs using tiktoken-compatible BPE encoding.
+ *
+ * This is exact tokenization for OpenAI models (unlike heuristic estimators).
+ */
+declare function encode(text: string, options?: EncodeOptions): number[];
+/**
+ * Decode OpenAI token IDs into text using tiktoken-compatible BPE encoding.
+ */
+declare function decode(tokens: Iterable<number>, options?: Pick<EncodeOptions, 'encoding' | 'model'>): string;
+
+interface TokenCountInput {
+    text: string;
+    model: string;
+}
+interface TokenCountOutput {
+    tokens: number;
+    exact: boolean;
+    encoding?: OpenAIEncoding;
+}
+/**
+ * Count tokens for a given model.
+ *
+ * - OpenAI models: exact BPE tokenization
+ * - Other providers: heuristic estimate (chars-per-token)
+ */
+declare function countTokens(input: TokenCountInput): TokenCountOutput;
+
+export { DEFAULT_MODELS, type EncodeOptions, type EstimateInput, type EstimateOutput, LAST_UPDATED, type ModelConfig, type OpenAIEncoding, type SpecialTokenHandling, type TokenCountInput, type TokenCountOutput, type TokenizerMode, countTokens, decode, encode, estimate, getAvailableModels, getModelConfig };
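`index.d.ts` is the ESM twin of `index.d.cts`; the declarations are identical. The `exact` flag on `TokenCountOutput` is how callers distinguish the two `countTokens` paths, since `encoding` is only set on the exact one. A small sketch using the declared shape (model IDs taken from the README examples):

```ts
import { countTokens, type TokenCountOutput } from 'ai-token-estimator';

function describe(r: TokenCountOutput): string {
  return r.exact
    ? `exactly ${r.tokens} tokens via ${r.encoding}` // exact OpenAI path
    : `about ${r.tokens} tokens (chars-per-token heuristic)`; // fallback
}

console.log(describe(countTokens({ text: 'Hello!', model: 'gpt-4o' })));
console.log(describe(countTokens({ text: 'Hello!', model: 'claude-opus-4.5' })));
```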
package/dist/index.js
CHANGED

@@ -374,6 +374,78 @@ function getAvailableModels() {
   return Object.keys(DEFAULT_MODELS);
 }
 
+// src/openai-bpe.ts
+import { createRequire } from "module";
+import { ALL_SPECIAL_TOKENS } from "gpt-tokenizer/constants";
+import { DEFAULT_ENCODING, modelToEncodingMap } from "gpt-tokenizer/mapping";
+var requireBase = typeof __filename === "string" && __filename.length > 0 ? __filename : import.meta.url;
+var NODE_REQUIRE = createRequire(requireBase);
+var ENCODING_MODULES = {
+  r50k_base: "gpt-tokenizer/cjs/encoding/r50k_base",
+  p50k_base: "gpt-tokenizer/cjs/encoding/p50k_base",
+  p50k_edit: "gpt-tokenizer/cjs/encoding/p50k_edit",
+  cl100k_base: "gpt-tokenizer/cjs/encoding/cl100k_base",
+  o200k_base: "gpt-tokenizer/cjs/encoding/o200k_base",
+  o200k_harmony: "gpt-tokenizer/cjs/encoding/o200k_harmony"
+};
+var encodingApiCache = /* @__PURE__ */ new Map();
+function getEncodingApi(encoding) {
+  const cached = encodingApiCache.get(encoding);
+  if (cached) return cached;
+  const modulePath = ENCODING_MODULES[encoding];
+  const mod = NODE_REQUIRE(modulePath);
+  const api = { encode: mod.encode, decode: mod.decode };
+  encodingApiCache.set(encoding, api);
+  return api;
+}
+function resolveEncoding(selector) {
+  if (selector?.encoding) {
+    return selector.encoding;
+  }
+  const model = selector?.model?.trim();
+  if (model) {
+    const mapped = modelToEncodingMap[model];
+    if (mapped) {
+      return mapped;
+    }
+  }
+  return DEFAULT_ENCODING;
+}
+function getOpenAIEncoding(selector) {
+  return resolveEncoding(selector);
+}
+function toGptTokenizerEncodeOptions(allowSpecial) {
+  const mode = allowSpecial ?? "none_raise";
+  switch (mode) {
+    case "all":
+      return {
+        allowedSpecial: ALL_SPECIAL_TOKENS,
+        disallowedSpecial: /* @__PURE__ */ new Set()
+      };
+    case "none":
+      return {
+        allowedSpecial: /* @__PURE__ */ new Set(),
+        disallowedSpecial: /* @__PURE__ */ new Set()
+      };
+    case "none_raise":
+    default:
+      return {
+        disallowedSpecial: ALL_SPECIAL_TOKENS
+      };
+  }
+}
+function encode(text, options) {
+  const encoding = resolveEncoding(options);
+  const api = getEncodingApi(encoding);
+  const encodeOptions = toGptTokenizerEncodeOptions(options?.allowSpecial);
+  return api.encode(text, encodeOptions);
+}
+function decode(tokens, options) {
+  const encoding = resolveEncoding(options);
+  const api = getEncodingApi(encoding);
+  return api.decode(tokens);
+}
+
 // src/estimator.ts
 function countCodePoints(text) {
   let count = 0;
@@ -383,21 +455,43 @@ function countCodePoints(text) {
   return count;
 }
 function estimate(input) {
-  const { text, model, rounding = "ceil" } = input;
+  const { text, model, rounding = "ceil", tokenizer = "heuristic" } = input;
   const config = getModelConfig(model);
   const characterCount = countCodePoints(text);
-  const rawTokens = characterCount / config.charsPerToken;
+  const isNonOpenAIModel2 = model.startsWith("claude-") || model.startsWith("gemini-");
   let estimatedTokens;
-  switch (rounding) {
-    case "floor":
-      estimatedTokens = Math.floor(rawTokens);
-      break;
-    case "round":
-      estimatedTokens = Math.round(rawTokens);
-      break;
-    case "ceil":
-    default:
-      estimatedTokens = Math.ceil(rawTokens);
+  let tokenizerModeUsed = "heuristic";
+  let encodingUsed;
+  const shouldTryExact = tokenizer === "openai_exact" || tokenizer === "auto";
+  if (shouldTryExact && !isNonOpenAIModel2) {
+    try {
+      estimatedTokens = encode(text, { model, allowSpecial: "none" }).length;
+      tokenizerModeUsed = "openai_exact";
+      encodingUsed = getOpenAIEncoding({ model });
+    } catch (error) {
+      if (tokenizer === "openai_exact") {
+        throw error;
+      }
+    }
+  } else if (tokenizer === "openai_exact" && isNonOpenAIModel2) {
+    throw new Error(
+      `Tokenizer mode "openai_exact" requested for non-OpenAI model: "${model}"`
+    );
+  }
+  if (estimatedTokens === void 0) {
+    const rawTokens = characterCount / config.charsPerToken;
+    switch (rounding) {
+      case "floor":
+        estimatedTokens = Math.floor(rawTokens);
+        break;
+      case "round":
+        estimatedTokens = Math.round(rawTokens);
+        break;
+      case "ceil":
+      default:
+        estimatedTokens = Math.ceil(rawTokens);
+    }
+    tokenizerModeUsed = "heuristic";
   }
   const estimatedInputCost = estimatedTokens * config.inputCostPerMillion / 1e6;
   return {
@@ -405,12 +499,43 @@ function estimate(input) {
     characterCount,
     estimatedTokens,
     estimatedInputCost,
-    charsPerToken: config.charsPerToken
+    charsPerToken: config.charsPerToken,
+    tokenizerMode: tokenizerModeUsed,
+    encodingUsed
   };
 }
+
+// src/token-counter.ts
+function isNonOpenAIModel(model) {
+  return model.startsWith("claude-") || model.startsWith("gemini-");
+}
+function countTokens(input) {
+  const { text, model } = input;
+  if (isNonOpenAIModel(model)) {
+    return {
+      tokens: estimate({ text, model }).estimatedTokens,
+      exact: false
+    };
+  }
+  try {
+    return {
+      tokens: encode(text, { model, allowSpecial: "none" }).length,
+      exact: true,
+      encoding: getOpenAIEncoding({ model })
+    };
+  } catch {
+    return {
+      tokens: estimate({ text, model }).estimatedTokens,
+      exact: false
+    };
+  }
+}
 export {
   DEFAULT_MODELS,
   LAST_UPDATED,
+  countTokens,
+  decode,
+  encode,
   estimate,
   getAvailableModels,
   getModelConfig
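The ESM bundle mirrors the CJS one; the notable difference is the `requireBase` line, which has to pick a `createRequire` base that exists in whichever bundle the code lands in. A sketch of that guard (it works because `typeof` never throws on an undeclared identifier):

```ts
import { createRequire } from 'node:module';

// In the CJS output __filename is defined; in the ESM output it is not,
// so import.meta.url serves as the fallback resolution base.
const base =
  typeof __filename === 'string' && __filename.length > 0
    ? __filename
    : import.meta.url;

export const requireFromBundle = createRequire(base);
```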
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "ai-token-estimator",
-  "version": "1.0.3",
+  "version": "1.1.0",
   "description": "Estimate token counts and costs for LLM API calls",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -23,13 +23,17 @@
     "LICENSE",
     "README.md"
   ],
+  "dependencies": {
+    "gpt-tokenizer": "^3.4.0"
+  },
   "scripts": {
     "build": "tsup src/index.ts --format cjs,esm --dts",
     "test": "vitest run",
     "test:watch": "vitest",
     "lint": "eslint src tests",
     "prepublishOnly": "npm run lint && npm run test && npm run build",
-    "update-pricing": "tsx scripts/update-pricing.ts"
+    "update-pricing": "tsx scripts/update-pricing.ts",
+    "benchmark:tokenizer": "tsx benchmark/tokenizer.ts"
   },
   "keywords": [
     "llm",
|