@hyvmind/tiktoken-ts 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +557 -0
- package/dist/bpe.d.ts +171 -0
- package/dist/bpe.d.ts.map +1 -0
- package/dist/bpe.js +478 -0
- package/dist/bpe.js.map +1 -0
- package/dist/core/byte-pair-encoding.d.ts +49 -0
- package/dist/core/byte-pair-encoding.d.ts.map +1 -0
- package/dist/core/byte-pair-encoding.js +154 -0
- package/dist/core/byte-pair-encoding.js.map +1 -0
- package/dist/core/encoding-definitions.d.ts +95 -0
- package/dist/core/encoding-definitions.d.ts.map +1 -0
- package/dist/core/encoding-definitions.js +202 -0
- package/dist/core/encoding-definitions.js.map +1 -0
- package/dist/core/index.d.ts +12 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +17 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/model-to-encoding.d.ts +36 -0
- package/dist/core/model-to-encoding.d.ts.map +1 -0
- package/dist/core/model-to-encoding.js +299 -0
- package/dist/core/model-to-encoding.js.map +1 -0
- package/dist/core/tiktoken.d.ts +126 -0
- package/dist/core/tiktoken.d.ts.map +1 -0
- package/dist/core/tiktoken.js +295 -0
- package/dist/core/tiktoken.js.map +1 -0
- package/dist/core/vocab-loader.d.ts +77 -0
- package/dist/core/vocab-loader.d.ts.map +1 -0
- package/dist/core/vocab-loader.js +176 -0
- package/dist/core/vocab-loader.js.map +1 -0
- package/dist/encodings/cl100k-base.d.ts +43 -0
- package/dist/encodings/cl100k-base.d.ts.map +1 -0
- package/dist/encodings/cl100k-base.js +142 -0
- package/dist/encodings/cl100k-base.js.map +1 -0
- package/dist/encodings/claude-estimation.d.ts +136 -0
- package/dist/encodings/claude-estimation.d.ts.map +1 -0
- package/dist/encodings/claude-estimation.js +160 -0
- package/dist/encodings/claude-estimation.js.map +1 -0
- package/dist/encodings/index.d.ts +9 -0
- package/dist/encodings/index.d.ts.map +1 -0
- package/dist/encodings/index.js +13 -0
- package/dist/encodings/index.js.map +1 -0
- package/dist/encodings/o200k-base.d.ts +58 -0
- package/dist/encodings/o200k-base.d.ts.map +1 -0
- package/dist/encodings/o200k-base.js +191 -0
- package/dist/encodings/o200k-base.js.map +1 -0
- package/dist/encodings/p50k-base.d.ts +44 -0
- package/dist/encodings/p50k-base.d.ts.map +1 -0
- package/dist/encodings/p50k-base.js +64 -0
- package/dist/encodings/p50k-base.js.map +1 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +109 -0
- package/dist/index.js.map +1 -0
- package/dist/models.d.ts +92 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +320 -0
- package/dist/models.js.map +1 -0
- package/dist/tiktoken.d.ts +198 -0
- package/dist/tiktoken.d.ts.map +1 -0
- package/dist/tiktoken.js +331 -0
- package/dist/tiktoken.js.map +1 -0
- package/dist/tokenizer.d.ts +181 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +436 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +127 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +152 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +244 -0
- package/dist/utils.js.map +1 -0
- package/package.json +78 -0
package/dist/bpe.d.ts
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Byte-Pair Encoding (BPE) Implementation
|
|
3
|
+
* A pure TypeScript implementation of the BPE algorithm
|
|
4
|
+
*
|
|
5
|
+
* This implementation uses a hybrid approach:
|
|
6
|
+
* 1. Pattern-based tokenization for accuracy with special tokens
|
|
7
|
+
* 2. Statistical estimation for token counting (without full vocabulary)
|
|
8
|
+
*
|
|
9
|
+
* For production use with full accuracy, consider loading the actual
|
|
10
|
+
* vocabulary files from tiktoken.
|
|
11
|
+
*/
|
|
12
|
+
import type { EncodingConfig, EncodingName, SpecialTokens } from "./types.js";
|
|
13
|
+
/**
|
|
14
|
+
* Default special tokens for OpenAI chat format
|
|
15
|
+
*/
|
|
16
|
+
export declare const DEFAULT_SPECIAL_TOKENS: SpecialTokens;
|
|
17
|
+
/**
|
|
18
|
+
* cl100k_base encoding configuration
|
|
19
|
+
* Used by GPT-4, GPT-3.5-turbo
|
|
20
|
+
*/
|
|
21
|
+
export declare const CL100K_BASE_CONFIG: EncodingConfig;
|
|
22
|
+
/**
|
|
23
|
+
* o200k_base encoding configuration
|
|
24
|
+
* Used by GPT-4o, GPT-4.1, GPT-5 models
|
|
25
|
+
*/
|
|
26
|
+
export declare const O200K_BASE_CONFIG: EncodingConfig;
|
|
27
|
+
/**
|
|
28
|
+
* p50k_base encoding configuration (legacy)
|
|
29
|
+
* Used by older codex models
|
|
30
|
+
*/
|
|
31
|
+
export declare const P50K_BASE_CONFIG: EncodingConfig;
|
|
32
|
+
/**
|
|
33
|
+
* Claude estimation encoding configuration
|
|
34
|
+
*
|
|
35
|
+
* IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
|
|
36
|
+
*
|
|
37
|
+
* Claude uses a proprietary tokenizer that is NOT publicly available.
|
|
38
|
+
* This encoding provides "safe" estimates that intentionally over-count
|
|
39
|
+
* tokens to prevent API truncation issues.
|
|
40
|
+
*
|
|
41
|
+
* Based on research findings (see claude-tokenizer-research.md):
|
|
42
|
+
* - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
|
|
43
|
+
* - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content
|
|
44
|
+
* - Average ~3.5 characters per token (vs GPT-4's ~4)
|
|
45
|
+
*
|
|
46
|
+
* This encoding applies a 1.25x safety multiplier to cl100k_base estimates.
|
|
47
|
+
* For exact Claude token counts, use Anthropic's /v1/messages/count_tokens API.
|
|
48
|
+
*
|
|
49
|
+
* @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
|
|
50
|
+
*/
|
|
51
|
+
export declare const CLAUDE_ESTIMATION_CONFIG: EncodingConfig;
|
|
52
|
+
/**
|
|
53
|
+
* Get encoding configuration by name
|
|
54
|
+
*
|
|
55
|
+
* @param name - Encoding name
|
|
56
|
+
* @returns Encoding configuration
|
|
57
|
+
*/
|
|
58
|
+
export declare function getEncodingConfig(name: EncodingName): EncodingConfig;
|
|
59
|
+
/**
|
|
60
|
+
* Safety multiplier for Claude token estimation.
|
|
61
|
+
*
|
|
62
|
+
* Research shows Claude produces 16-30% more tokens than GPT-4 for equivalent content:
|
|
63
|
+
* - English articles: +16%
|
|
64
|
+
* - Math equations: +21%
|
|
65
|
+
* - Python code: +30%
|
|
66
|
+
*
|
|
67
|
+
* We use 1.25 (25% over-estimate) as a safe default that covers most cases
|
|
68
|
+
* without being excessively conservative.
|
|
69
|
+
*/
|
|
70
|
+
export declare const CLAUDE_SAFETY_MULTIPLIER = 1.25;
|
|
71
|
+
/**
 * BPE Tokenizer class
 * Implements the core BPE algorithm with statistical estimation
 *
 * NOTE(review): exact token IDs require the real tiktoken vocabulary, which
 * this package does not ship; IDs are deterministic hash-derived values and
 * counts are statistical estimates (see the method docs below).
 */
export declare class BPETokenizer {
    /** Encoding configuration (split pattern, vocab size, avg chars/token). */
    private readonly config;
    /** Map of special-token text to its reserved token ID. */
    private readonly specialTokenPatterns;
    /** Cache of previously encoded short segments (text -> token IDs). */
    private readonly bytePairCache;
    /** True when the "claude_estimation" encoding is active (safety multiplier applies). */
    private readonly isClaudeEstimation;
    /**
     * @param encoding - Encoding name; defaults to "o200k_base" in the implementation.
     */
    constructor(encoding?: EncodingName);
    /**
     * Initialize special token patterns
     * Assigns reserved IDs near the end of the vocabulary range.
     */
    private initializeSpecialTokens;
    /**
     * Get the encoding name
     */
    get encodingName(): EncodingName;
    /**
     * Encode text into token IDs
     * Uses a hybrid approach: special token detection + BPE encoding
     *
     * @param text - Input text
     * @returns Array of token IDs
     */
    encode(text: string): number[];
    /**
     * Decode token IDs back to text
     * Note: This is a best-effort decode for estimated tokens.
     * Special tokens decode to their original text; all other IDs become
     * "<token:N>" placeholders because the real vocabulary is unavailable.
     *
     * @param tokens - Array of token IDs
     * @returns Decoded text
     */
    decode(tokens: number[]): string;
    /**
     * Count the number of tokens in text
     * This uses statistical estimation for efficiency
     *
     * For Claude models (claude_estimation encoding), this applies a safety
     * multiplier to ensure estimates err on the side of over-counting,
     * preventing API truncation issues.
     *
     * @param text - Input text
     * @returns Estimated token count (always >= 1 for non-empty input)
     */
    countTokens(text: string): number;
    /**
     * Split text on special tokens while preserving them
     *
     * @param text - Input text
     * @returns Array of segments
     */
    private splitOnSpecialTokens;
    /**
     * Encode regular text using BPE
     * Uses pattern-based splitting and byte-pair encoding
     *
     * @param text - Text to encode (no special tokens)
     * @returns Array of token IDs
     */
    private encodeText;
    /**
     * Apply BPE to a byte sequence
     * Uses a simplified hash-based approach for estimation
     *
     * @param bytes - UTF-8 bytes
     * @returns Array of token IDs
     */
    private bytePairEncode;
    /**
     * Estimate the boundary of the next token
     * UTF-8-aware: multi-byte sequences are kept intact.
     *
     * @param bytes - Full byte sequence
     * @param start - Starting position
     * @returns Number of bytes for the next token
     */
    private estimateTokenBoundary;
    /**
     * Convert bytes to a deterministic token ID
     * Hash-based; IDs land below the reserved special-token range.
     *
     * @param bytes - Token bytes
     * @returns Token ID
     */
    private bytesToTokenId;
    /**
     * Estimate token count for text using statistical methods
     * This is the primary method for fast token counting
     *
     * ACCURACY NOTES (validated against tiktoken-rs):
     * - English text: +/-10-15% typically accurate
     * - Numbers: Now properly models 1-3 digit grouping
     * - CJK with o200k: Highly efficient (1 token per char typically)
     * - CJK with cl100k: Less efficient (1-2 tokens per char)
     * - Emoji: 2-3 tokens each
     *
     * @param text - Text to estimate
     * @returns Estimated token count
     */
    private estimateTokenCount;
}
|
|
171
|
+
//# sourceMappingURL=bpe.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bpe.d.ts","sourceRoot":"","sources":["../src/bpe.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAU9E;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,aAOpC,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,kBAAkB,EAAE,cAOhC,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,iBAAiB,EAAE,cAO/B,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,gBAAgB,EAAE,cAO9B,CAAC;AAEF;;;;;;;;;;;;;;;;;;GAkBG;AACH,eAAO,MAAM,wBAAwB,EAAE,cAUtC,CAAC;AAEF;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,YAAY,GAAG,cAAc,CAapE;AAED;;;;;;;;;;GAUG;AACH,eAAO,MAAM,wBAAwB,OAAO,CAAC;AAE7C;;;GAGG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAiB;IACxC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAsB;IAC3D,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAwB;IACtD,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAU;gBAEjC,QAAQ,GAAE,YAA2B;IAUjD;;OAEG;IACH,OAAO,CAAC,uBAAuB;IAgB/B;;OAEG;IACH,IAAI,YAAY,IAAI,YAAY,CAE/B;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE;IAqB9B;;;;;;OAMG;IACH,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM;IAoBhC;;;;;;;;;;OAUG;IACH,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IA2BjC;;;;;OAKG;IACH,OAAO,CAAC,oBAAoB;IAmC5B;;;;;;OAMG;IACH,OAAO,CAAC,UAAU;IA6BlB;;;;;;OAMG;IACH,OAAO,CAAC,cAAc;IA0BtB;;;;;;OAMG;IACH,OAAO,CAAC,qBAAqB;IAyC7B;;;;;OAKG;IACH,OAAO,CAAC,cAAc;IAStB;;;;;;;;;;;;;OAaG;IACH,OAAO,CAAC,kBAAkB;CA6F3B"}
|
package/dist/bpe.js
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Byte-Pair Encoding (BPE) Implementation
|
|
3
|
+
* A pure TypeScript implementation of the BPE algorithm
|
|
4
|
+
*
|
|
5
|
+
* This implementation uses a hybrid approach:
|
|
6
|
+
* 1. Pattern-based tokenization for accuracy with special tokens
|
|
7
|
+
* 2. Statistical estimation for token counting (without full vocabulary)
|
|
8
|
+
*
|
|
9
|
+
* For production use with full accuracy, consider loading the actual
|
|
10
|
+
* vocabulary files from tiktoken.
|
|
11
|
+
*/
|
|
12
|
+
import { stringToBytes, bytesToString, containsEmoji, isAscii, countCodePoints, hashString, } from "./utils.js";
|
|
13
|
+
/**
 * Default special tokens for OpenAI chat format.
 * These delimit messages and tool calls in chat transcripts.
 */
export const DEFAULT_SPECIAL_TOKENS = {
    endOfText: "<|endoftext|>",
    imStart: "<|im_start|>",
    imEnd: "<|im_end|>",
    imSep: "<|im_sep|>",
    toolCall: "<|tool_call|>",
    toolResult: "<|tool_result|>",
};
/**
 * cl100k_base encoding configuration
 * Used by GPT-4, GPT-3.5-turbo
 *
 * FIX: the contraction prefix is spelled with explicit [sS]-style character
 * classes instead of the `(?i:...)` inline-modifier group. `(?i:...)` is only
 * supported by very recent engines (V8 12.7+ / Node 23+) and throws a
 * SyntaxError at module load elsewhere. This matches the portable style
 * already used by CLAUDE_ESTIMATION_CONFIG below and is semantically
 * identical (case-insensitive 's/'t/'re/'ve/'m/'ll/'d).
 */
export const CL100K_BASE_CONFIG = {
    name: "cl100k_base",
    patternSplit: /(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/gu,
    specialTokens: DEFAULT_SPECIAL_TOKENS,
    vocabSize: 100256,
    averageCharsPerToken: 3.8,
};
/**
 * o200k_base encoding configuration
 * Used by GPT-4o, GPT-4.1, GPT-5 models
 *
 * FIX: same `(?i:...)` -> explicit case-alternation substitution as in
 * CL100K_BASE_CONFIG, applied to both optional contraction groups.
 */
export const O200K_BASE_CONFIG = {
    name: "o200k_base",
    patternSplit: /[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/gu,
    specialTokens: DEFAULT_SPECIAL_TOKENS,
    vocabSize: 200000,
    averageCharsPerToken: 4.0,
};
/**
 * p50k_base encoding configuration (legacy)
 * Used by older codex models
 */
export const P50K_BASE_CONFIG = {
    name: "p50k_base",
    patternSplit: /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu,
    specialTokens: DEFAULT_SPECIAL_TOKENS,
    vocabSize: 50257,
    averageCharsPerToken: 3.5,
};
/**
 * Claude estimation encoding configuration
 *
 * IMPORTANT: This is an ESTIMATION encoding, not an exact BPE tokenizer.
 *
 * Claude uses a proprietary tokenizer that is NOT publicly available.
 * This encoding provides "safe" estimates that intentionally over-count
 * tokens to prevent API truncation issues.
 *
 * Based on research findings (see claude-tokenizer-research.md):
 * - Claude 3+ uses ~22,000 token vocabulary (much smaller than OpenAI's 100K-200K)
 * - Claude produces 16-30% MORE tokens than GPT-4 for equivalent content
 * - Average ~3.5 characters per token (vs GPT-4's ~4)
 *
 * This encoding applies a 1.25x safety multiplier to cl100k_base estimates.
 * For exact Claude token counts, use Anthropic's /v1/messages/count_tokens API.
 *
 * @see https://docs.anthropic.com/en/docs/build-with-claude/token-counting
 */
export const CLAUDE_ESTIMATION_CONFIG = {
    name: "claude_estimation",
    // Use cl100k_base pattern as base (70% vocabulary overlap with Claude 1/2)
    patternSplit: /(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/gu,
    specialTokens: DEFAULT_SPECIAL_TOKENS,
    // Claude's estimated vocabulary (~22K) but we use this for informational purposes
    vocabSize: 22000,
    // Claude averages ~3.5 chars per token, lower than cl100k_base
    averageCharsPerToken: 3.5,
};
|
|
86
|
+
/**
 * Resolve an encoding configuration from its name.
 *
 * Unknown names fall back to o200k_base, matching the tokenizer's default
 * encoding.
 *
 * @param name - Encoding name
 * @returns Encoding configuration
 */
export function getEncodingConfig(name) {
    const byName = new Map([
        ["cl100k_base", CL100K_BASE_CONFIG],
        ["o200k_base", O200K_BASE_CONFIG],
        ["p50k_base", P50K_BASE_CONFIG],
        ["claude_estimation", CLAUDE_ESTIMATION_CONFIG],
    ]);
    return byName.get(name) ?? O200K_BASE_CONFIG;
}
/**
 * Safety multiplier for Claude token estimation.
 *
 * Research shows Claude produces 16-30% more tokens than GPT-4 for equivalent
 * content (English articles +16%, math equations +21%, Python code +30%).
 * A 25% over-estimate is a safe default that covers most cases without being
 * excessively conservative.
 */
export const CLAUDE_SAFETY_MULTIPLIER = 1.25;
|
|
118
|
+
/**
 * BPE Tokenizer class
 * Implements the core BPE algorithm with statistical estimation.
 *
 * Exact token IDs require the real tiktoken vocabulary, which is not shipped;
 * IDs are deterministic hash-derived values and counts are statistical
 * estimates, which is sufficient for budgeting / limit-checking use cases.
 */
export class BPETokenizer {
    // Encoding configuration (split pattern, vocab size, avg chars/token).
    config;
    // Map of special-token text -> reserved token ID.
    specialTokenPatterns;
    // Cache of previously encoded short segments (text -> token IDs).
    bytePairCache;
    // True when using the claude_estimation encoding (safety multiplier applies).
    isClaudeEstimation;
    /**
     * @param encoding - Encoding name; defaults to "o200k_base".
     */
    constructor(encoding = "o200k_base") {
        this.config = getEncodingConfig(encoding);
        this.specialTokenPatterns = new Map();
        this.bytePairCache = new Map();
        this.isClaudeEstimation = encoding === "claude_estimation";
        // Pre-compute special token patterns and their token IDs
        this.initializeSpecialTokens();
    }
    /**
     * Initialize special token patterns.
     * Assigns sequential IDs starting 10 below the vocab size, so special
     * tokens occupy the reserved tail of the ID range.
     */
    initializeSpecialTokens() {
        const tokens = this.config.specialTokens;
        let tokenId = this.config.vocabSize - 10; // Reserve IDs at end of vocab
        this.specialTokenPatterns.set(tokens.endOfText, tokenId++);
        this.specialTokenPatterns.set(tokens.imStart, tokenId++);
        this.specialTokenPatterns.set(tokens.imEnd, tokenId++);
        this.specialTokenPatterns.set(tokens.imSep, tokenId++);
        if (tokens.toolCall) {
            this.specialTokenPatterns.set(tokens.toolCall, tokenId++);
        }
        if (tokens.toolResult) {
            this.specialTokenPatterns.set(tokens.toolResult, tokenId++);
        }
    }
    /**
     * Get the encoding name
     */
    get encodingName() {
        return this.config.name;
    }
    /**
     * Encode text into token IDs.
     * Uses a hybrid approach: special token detection + BPE encoding.
     *
     * @param text - Input text
     * @returns Array of token IDs
     */
    encode(text) {
        if (!text)
            return [];
        const tokens = [];
        // First, split on special tokens
        const segments = this.splitOnSpecialTokens(text);
        for (const segment of segments) {
            if (this.specialTokenPatterns.has(segment)) {
                // Special token - use its pre-assigned ID
                tokens.push(this.specialTokenPatterns.get(segment));
            }
            else {
                // Regular text - apply BPE
                tokens.push(...this.encodeText(segment));
            }
        }
        return tokens;
    }
    /**
     * Decode token IDs back to text.
     * Note: This is a best-effort decode for estimated tokens.
     *
     * BUG FIX: the previous implementation `break`-ed out of the special-token
     * lookup but then fell through and ALSO pushed the `<token:N>` placeholder,
     * so every special token decoded twice (text + placeholder). Special tokens
     * now decode to their text only.
     *
     * @param tokens - Array of token IDs
     * @returns Decoded text
     */
    decode(tokens) {
        const parts = [];
        for (const token of tokens) {
            // Check if it's a special token
            let specialText = null;
            for (const [text, id] of this.specialTokenPatterns) {
                if (id === token) {
                    specialText = text;
                    break;
                }
            }
            if (specialText !== null) {
                parts.push(specialText);
                continue;
            }
            // For regular tokens, we'd need the vocabulary to decode properly.
            // Since we don't have it, we return a placeholder.
            parts.push(`<token:${token}>`);
        }
        return parts.join("");
    }
    /**
     * Count the number of tokens in text.
     * This uses statistical estimation for efficiency.
     *
     * For Claude models (claude_estimation encoding), this applies a safety
     * multiplier to ensure estimates err on the side of over-counting,
     * preventing API truncation issues.
     *
     * @param text - Input text
     * @returns Estimated token count (>= 1 for non-empty input)
     */
    countTokens(text) {
        if (!text)
            return 0;
        let count = 0;
        // Split on special tokens first: each special token costs exactly 1.
        const segments = this.splitOnSpecialTokens(text);
        for (const segment of segments) {
            if (this.specialTokenPatterns.has(segment)) {
                count += 1;
            }
            else {
                count += this.estimateTokenCount(segment);
            }
        }
        let result = Math.max(1, Math.round(count));
        // Apply Claude safety multiplier for Claude estimation encoding.
        // This ensures we over-estimate to prevent API truncation.
        if (this.isClaudeEstimation) {
            result = Math.ceil(result * CLAUDE_SAFETY_MULTIPLIER);
        }
        return result;
    }
    /**
     * Split text on special tokens while preserving them.
     *
     * @param text - Input text
     * @returns Array of non-empty segments (special tokens appear as their own segments)
     */
    splitOnSpecialTokens(text) {
        const segments = [];
        // Build regex pattern for all special tokens (regex-escaped).
        const specialTokensList = Array.from(this.specialTokenPatterns.keys());
        if (specialTokensList.length === 0) {
            return [text];
        }
        const pattern = new RegExp(`(${specialTokensList.map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})`, "g");
        let lastIndex = 0;
        let match;
        while ((match = pattern.exec(text)) !== null) {
            // Add text before the special token
            if (match.index > lastIndex) {
                segments.push(text.slice(lastIndex, match.index));
            }
            // Add the special token
            segments.push(match[0]);
            lastIndex = match.index + match[0].length;
        }
        // Add remaining text
        if (lastIndex < text.length) {
            segments.push(text.slice(lastIndex));
        }
        return segments.filter((s) => s.length > 0);
    }
    /**
     * Encode regular text using BPE.
     * Uses pattern-based splitting and byte-pair encoding.
     *
     * @param text - Text to encode (no special tokens)
     * @returns Array of token IDs
     */
    encodeText(text) {
        const tokens = [];
        // Split text using the encoding's pattern
        const matches = text.match(this.config.patternSplit) || [text];
        for (const match of matches) {
            // Check cache first
            const cached = this.bytePairCache.get(match);
            if (cached) {
                tokens.push(...cached);
                continue;
            }
            // Convert to bytes and apply BPE
            const bytes = stringToBytes(match);
            const encodedTokens = this.bytePairEncode(bytes);
            // Cache only short segments to bound memory growth.
            if (match.length < 100) {
                this.bytePairCache.set(match, encodedTokens);
            }
            tokens.push(...encodedTokens);
        }
        return tokens;
    }
    /**
     * Apply BPE to a byte sequence.
     * Uses a simplified hash-based approach for estimation:
     * 1. Groups bytes into likely token boundaries
     * 2. Assigns consistent token IDs based on content
     *
     * @param bytes - UTF-8 bytes
     * @returns Array of token IDs
     */
    bytePairEncode(bytes) {
        if (bytes.length === 0)
            return [];
        const tokens = [];
        let i = 0;
        while (i < bytes.length) {
            // Determine likely token boundary (1-6 bytes typically)
            const tokenLength = this.estimateTokenBoundary(bytes, i);
            const tokenBytes = bytes.slice(i, i + tokenLength);
            // Generate a consistent token ID from the bytes
            const tokenId = this.bytesToTokenId(tokenBytes);
            tokens.push(tokenId);
            i += tokenLength;
        }
        return tokens;
    }
    /**
     * Estimate the boundary of the next token.
     * UTF-8 aware: multi-byte sequences are never split.
     *
     * @param bytes - Full byte sequence
     * @param start - Starting position
     * @returns Number of bytes for the next token
     */
    estimateTokenBoundary(bytes, start) {
        const remaining = bytes.length - start;
        if (remaining <= 0)
            return 0;
        const firstByte = bytes[start];
        // ASCII printable characters often merge together: extend the boundary
        // over a run of alphanumerics, capped at 6 bytes.
        if (firstByte >= 0x20 && firstByte <= 0x7e) {
            let end = start + 1;
            while (end < bytes.length && end - start < 6) {
                const b = bytes[end];
                if (b >= 0x41 && b <= 0x5a) {
                    // A-Z
                    end++;
                }
                else if (b >= 0x61 && b <= 0x7a) {
                    // a-z
                    end++;
                }
                else if (b >= 0x30 && b <= 0x39) {
                    // 0-9
                    end++;
                }
                else {
                    break;
                }
            }
            return Math.min(end - start, remaining);
        }
        // Multi-byte UTF-8 sequences: lead-byte prefix determines length.
        if ((firstByte & 0xe0) === 0xc0)
            return Math.min(2, remaining);
        if ((firstByte & 0xf0) === 0xe0)
            return Math.min(3, remaining);
        if ((firstByte & 0xf8) === 0xf0)
            return Math.min(4, remaining);
        return 1;
    }
    /**
     * Convert bytes to a deterministic token ID.
     *
     * @param bytes - Token bytes
     * @returns Token ID in [0, vocabSize - 20), below the reserved special-token range
     */
    bytesToTokenId(bytes) {
        // Use a hash to generate consistent token IDs
        const text = bytesToString(bytes);
        const hash = hashString(text);
        // Map to valid token range (avoid special tokens at the end)
        return hash % (this.config.vocabSize - 20);
    }
    /**
     * Estimate token count for text using statistical methods.
     * This is the primary method for fast token counting.
     *
     * ACCURACY NOTES (validated against tiktoken-rs):
     * - English text: +/-10-15% typically accurate
     * - Numbers: Now properly models 1-3 digit grouping
     * - CJK with o200k: Highly efficient (1 token per char typically)
     * - CJK with cl100k: Less efficient (1-2 tokens per char)
     * - Emoji: 2-3 tokens each
     *
     * @param text - Text to estimate
     * @returns Estimated token count
     */
    estimateTokenCount(text) {
        if (!text)
            return 0;
        const codePoints = countCodePoints(text);
        const byteLength = stringToBytes(text).length;
        // Build the estimate additively from each content class.
        let estimate = 0;
        // Handle numbers separately - tiktoken splits into 1-3 digit groups.
        // This is CRITICAL: "12345678901234567890" = 7 tokens, not 40.
        const numberMatches = text.match(/\d+/g);
        let numberCharsCount = 0;
        let numberTokenCount = 0;
        if (numberMatches) {
            for (const num of numberMatches) {
                numberCharsCount += num.length;
                // tiktoken groups digits into 1-3 digit tokens
                numberTokenCount += Math.ceil(num.length / 3);
            }
        }
        // Handle CJK characters - behavior differs by encoding
        const cjkChars = text.match(/[\u4e00-\u9fff\u3400-\u4dbf\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/g);
        let cjkCharsCount = 0;
        let cjkTokenCount = 0;
        if (cjkChars) {
            cjkCharsCount = cjkChars.length;
            if (this.config.name === "o200k_base") {
                // o200k_base is HIGHLY efficient for CJK:
                // typically 1-3 CJK chars per token.
                cjkTokenCount = Math.ceil(cjkCharsCount / 2.5);
            }
            else {
                // cl100k_base: CJK chars are typically 1-2 tokens each
                cjkTokenCount = Math.ceil(cjkCharsCount * 1.2);
            }
        }
        // Handle emoji - typically 2-3 tokens each.
        // BUG FIX: the previous /\p{Emoji}/gu also matches ASCII digits, '#'
        // and '*' (they carry Emoji=Yes per UTS #51), double-counting digits
        // already handled above. \p{Extended_Pictographic} matches only actual
        // pictographic characters.
        let emojiCount = 0;
        let emojiTokenCount = 0;
        if (containsEmoji(text)) {
            const emojis = text.match(/\p{Extended_Pictographic}/gu);
            if (emojis) {
                emojiCount = emojis.length;
                // Each emoji is typically 2-3 tokens
                emojiTokenCount = Math.ceil(emojiCount * 2.5);
            }
        }
        // Calculate remaining non-special characters
        const specialCharsCount = numberCharsCount + cjkCharsCount + emojiCount;
        const remainingChars = codePoints - specialCharsCount;
        // Base estimate for remaining (mostly ASCII/Latin) text
        if (remainingChars > 0) {
            estimate = remainingChars / this.config.averageCharsPerToken;
            // Whitespace sequences add minimal overhead
            const whitespaceSeqs = text.match(/\s+/g);
            if (whitespaceSeqs) {
                estimate += whitespaceSeqs.length * 0.1;
            }
        }
        // Add special character token counts
        estimate += numberTokenCount;
        estimate += cjkTokenCount;
        estimate += emojiTokenCount;
        // Non-ASCII penalty for remaining text (excluding CJK which we handled)
        if (!isAscii(text) && remainingChars > 0) {
            const nonAsciiRatio = (byteLength - codePoints) / byteLength;
            // Only apply penalty to the base estimate portion
            const basePortion = remainingChars / this.config.averageCharsPerToken;
            estimate += basePortion * nonAsciiRatio * 0.15;
        }
        // o200k_base tends to be slightly more efficient overall
        if (this.config.name === "o200k_base" && remainingChars > 0) {
            const basePortion = remainingChars / this.config.averageCharsPerToken;
            estimate -= basePortion * 0.05;
        }
        return Math.max(1, Math.round(estimate));
    }
}
|
|
478
|
+
//# sourceMappingURL=bpe.js.map
|
package/dist/bpe.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bpe.js","sourceRoot":"","sources":["../src/bpe.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAGH,OAAO,EACL,aAAa,EACb,aAAa,EACb,aAAa,EACb,OAAO,EACP,eAAe,EACf,UAAU,GACX,MAAM,YAAY,CAAC;AAEpB;;GAEG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAAkB;IACnD,SAAS,EAAE,eAAe;IAC1B,OAAO,EAAE,cAAc;IACvB,KAAK,EAAE,YAAY;IACnB,KAAK,EAAE,YAAY;IACnB,QAAQ,EAAE,eAAe;IACzB,UAAU,EAAE,iBAAiB;CAC9B,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAmB;IAChD,IAAI,EAAE,aAAa;IACnB,YAAY,EACV,uHAAuH;IACzH,aAAa,EAAE,sBAAsB;IACrC,SAAS,EAAE,MAAM;IACjB,oBAAoB,EAAE,GAAG;CAC1B,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAmB;IAC/C,IAAI,EAAE,YAAY;IAClB,YAAY,EACV,qRAAqR;IACvR,aAAa,EAAE,sBAAsB;IACrC,SAAS,EAAE,MAAM;IACjB,oBAAoB,EAAE,GAAG;CAC1B,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAmB;IAC9C,IAAI,EAAE,WAAW;IACjB,YAAY,EACV,8EAA8E;IAChF,aAAa,EAAE,sBAAsB;IACrC,SAAS,EAAE,KAAK;IAChB,oBAAoB,EAAE,GAAG;CAC1B,CAAC;AAEF;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAmB;IACtD,IAAI,EAAE,mBAAmB;IACzB,2EAA2E;IAC3E,YAAY,EACV,oJAAoJ;IACtJ,aAAa,EAAE,sBAAsB;IACrC,kFAAkF;IAClF,SAAS,EAAE,KAAK;IAChB,+DAA+D;IAC/D,oBAAoB,EAAE,GAAG;CAC1B,CAAC;AAEF;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAkB;IAClD,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,aAAa;YAChB,OAAO,kBAAkB,CAAC;QAC5B,KAAK,YAAY;YACf,OAAO,iBAAiB,CAAC;QAC3B,KAAK,WAAW;YACd,OAAO,gBAAgB,CAAC;QAC1B,KAAK,mBAAmB;YACtB,OAAO,wBAAwB,CAAC;QAClC;YACE,OAAO,iBAAiB,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAG,IAAI,CAAC;AAE7C;;;GAGG;AACH,MAAM,OAAO,YAAY;IACN,MAAM,CAAiB;IACvB,oBAAoB,CAAsB;IAC1C,aAAa,CAAwB;IACrC,kBAAkB,CAAU;IAE7C,YAAY,WAAyB,YAAY;QAC/C,IAAI,CAAC,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAC1C,IAAI,CAAC,oBAAoB,GAAG,IAAI,GAAG,EAAE,CAAC;QACtC,IAAI,CAAC,aAAa,GAAG,IAAI,GAAG,EAAE,CAAC;QAC/B,IAAI,CAAC,kBAAkB,GAAG,QAAQ,KAAK,mBAAmB,CAAC;QAE3D,yDAAyD;QACzD,IAAI,CAAC,uBAAuB,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,uBAAuB;QAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC;QACzC,IAAI,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,EAAE,CAAC,CAAC,8BAA8B;QAExE,IAAI,CAAC,oBAAoB
,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,OAAO,EAAE,CAAC,CAAC;QAC3D,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,CAAC,CAAC;QACzD,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;QACvD,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;YACpB,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAC5D,CAAC;QACD,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YACtB,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,EAAE,CAAC,CAAC;QAC9D,CAAC;IACH,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC;IAC1B,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY;QACjB,IAAI,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAErB,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,iCAAiC;QACjC,MAAM,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAEjD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC3C,0CAA0C;gBAC1C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAE,CAAC,CAAC;YACvD,CAAC;iBAAM,CAAC;gBACN,2BAA2B;gBAC3B,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;YAC3C,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,MAAgB;QACrB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,gCAAgC;YAChC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;gBACnD,IAAI,EAAE,KAAK,KAAK,EAAE,CAAC;oBACjB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACjB,MAAM;gBACR,CAAC;YACH,CAAC;YAED,kEAAkE;YAClE,kDAAkD;YAClD,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,GAAG,CAAC,CAAC;QACjC,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACxB,CAAC;IAED;;;;;;;;;;OAUG;IACH,WAAW,CAAC,IAAY;QACtB,IAAI,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QAEpB,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAEjD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC3C,KAAK,IAAI,CAAC,CAAC;YACb,CAAC;iBAAM,CAAC;gBACN,KAAK,IAAI,IAAI,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC;YAC5C
,CAAC;QACH,CAAC;QAED,IAAI,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;QAE5C,gEAAgE;QAChE,0DAA0D;QAC1D,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,wBAAwB,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;OAKG;IACK,oBAAoB,CAAC,IAAY;QACvC,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,6CAA6C;QAC7C,MAAM,iBAAiB,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC,CAAC;QACvE,IAAI,iBAAiB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC,CAAC;QAChB,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,MAAM,CACxB,IAAI,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EACvF,GAAG,CACJ,CAAC;QAEF,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,KAAK,CAAC;QAEV,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7C,oCAAoC;YACpC,IAAI,KAAK,CAAC,KAAK,GAAG,SAAS,EAAE,CAAC;gBAC5B,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;YACpD,CAAC;YACD,wBAAwB;YACxB,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACxB,SAAS,GAAG,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC5C,CAAC;QAED,qBAAqB;QACrB,IAAI,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC5B,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC;QACvC,CAAC;QAED,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED;;;;;;OAMG;IACK,UAAU,CAAC,IAAY;QAC7B,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,0CAA0C;QAC1C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE/D,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,oBAAoB;YACpB,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAC7C,IAAI,MAAM,EAAE,CAAC;gBACX,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;gBACvB,SAAS;YACX,CAAC;YAED,iCAAiC;YACjC,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;YACnC,MAAM,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;YAEjD,mBAAmB;YACnB,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBACvB,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,EAAE,a
AAa,CAAC,CAAC;YAC/C,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,CAAC;QAChC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACK,cAAc,CAAC,KAAiB;QACtC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAElC,kEAAkE;QAClE,4DAA4D;QAC5D,+CAA+C;QAC/C,mDAAmD;QAEnD,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,GAAG,CAAC,CAAC;QAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,wDAAwD;YACxD,MAAM,WAAW,GAAG,IAAI,CAAC,qBAAqB,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YACzD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC;YAEnD,gDAAgD;YAChD,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;YAChD,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAErB,CAAC,IAAI,WAAW,CAAC;QACnB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACK,qBAAqB,CAAC,KAAiB,EAAE,KAAa;QAC5D,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC;QACvC,IAAI,SAAS,IAAI,CAAC;YAAE,OAAO,CAAC,CAAC;QAE7B,iCAAiC;QACjC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC;QAE/B,kDAAkD;QAClD,IAAI,SAAS,IAAI,IAAI,IAAI,SAAS,IAAI,IAAI,EAAE,CAAC;YAC3C,4BAA4B;YAC5B,IAAI,GAAG,GAAG,KAAK,GAAG,CAAC,CAAC;YAEpB,OAAO,GAAG,GAAG,KAAK,CAAC,MAAM,IAAI,GAAG,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC;gBAC7C,MAAM,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBAErB,qCAAqC;gBACrC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;oBAC3B,MAAM;oBACN,GAAG,EAAE,CAAC;gBACR,CAAC;qBAAM,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;oBAClC,MAAM;oBACN,GAAG,EAAE,CAAC;gBACR,CAAC;qBAAM,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;oBAClC,MAAM;oBACN,GAAG,EAAE,CAAC;gBACR,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YAED,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,EAAE,SAAS,CAAC,CAAC;QAC1C,CAAC;QAED,6BAA6B;QAC7B,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,KAAK,IAAI;YAAE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAC/D,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,KAAK,IAAI;YAAE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAC/D,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,KAAK,IAAI;YAAE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAE/D,OAAO,CAAC,CAAC;IACX,CAAC;IAED;;;;;OAKG;IACK,cAAc,CAAC,KAAiB;QACtC,8CAA8C;QAC9C,MAAM,IAAI,GAAG,aAA
a,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;QAE9B,6DAA6D;QAC7D,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,EAAE,CAAC,CAAC;IAC7C,CAAC;IAED;;;;;;;;;;;;;OAaG;IACK,kBAAkB,CAAC,IAAY;QACrC,IAAI,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QAEpB,MAAM,UAAU,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QAE9C,gEAAgE;QAChE,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,oEAAoE;QACpE,8DAA8D;QAC9D,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACzC,IAAI,gBAAgB,GAAG,CAAC,CAAC;QACzB,IAAI,gBAAgB,GAAG,CAAC,CAAC;QAEzB,IAAI,aAAa,EAAE,CAAC;YAClB,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,gBAAgB,IAAI,GAAG,CAAC,MAAM,CAAC;gBAC/B,+CAA+C;gBAC/C,gBAAgB,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAChD,CAAC;QACH,CAAC;QAED,uDAAuD;QACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CACzB,sEAAsE,CACvE,CAAC;QACF,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,aAAa,GAAG,CAAC,CAAC;QAEtB,IAAI,QAAQ,EAAE,CAAC;YACb,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC;YAEhC,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBACtC,yCAAyC;gBACzC,+BAA+B;gBAC/B,oCAAoC;gBACpC,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,CAAC;YACjD,CAAC;iBAAM,CAAC;gBACN,uDAAuD;gBACvD,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QAED,2CAA2C;QAC3C,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,eAAe,GAAG,CAAC,CAAC;QAExB,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;YACzC,IAAI,MAAM,EAAE,CAAC;gBACX,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC;gBAC3B,qCAAqC;gBACrC,eAAe,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,CAAC;YAChD,CAAC;QACH,CAAC;QAED,6CAA6C;QAC7C,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,aAAa,GAAG,UAAU,CAAC;QACxE,MAAM,cAAc,GAAG,UAAU,GAAG,iBAAiB,CAAC;QAEtD,wDAAwD;QACxD,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC;YACvB,QAAQ,GAAG,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC;YAE7D,4CAA4C;YAC5C,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YAC1C,IAAI,cAAc,EAAE,CAAC;gBACnB,QAAQ,IAAI,cAAc,CAAC,MAAM,GAAG,GAAG,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,QAAQ,IAAI,gBAAgB,CAAC;QAC7B,QAAQ,IAAI,aAAa,
CAAC;QAC1B,QAAQ,IAAI,eAAe,CAAC;QAE5B,wEAAwE;QACxE,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC;YACzC,MAAM,aAAa,GAAG,CAAC,UAAU,GAAG,UAAU,CAAC,GAAG,UAAU,CAAC;YAC7D,kDAAkD;YAClD,MAAM,WAAW,GAAG,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC;YACtE,QAAQ,IAAI,WAAW,GAAG,aAAa,GAAG,IAAI,CAAC;QACjD,CAAC;QAED,yDAAyD;QACzD,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,KAAK,YAAY,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC;YAC5D,MAAM,WAAW,GAAG,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC;YACtE,QAAQ,IAAI,WAAW,GAAG,IAAI,CAAC;QACjC,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC3C,CAAC;CACF"}
|