@softerist/heuristic-mcp 3.0.15 → 3.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -104
- package/config.jsonc +173 -173
- package/features/ann-config.js +131 -0
- package/features/clear-cache.js +84 -0
- package/features/find-similar-code.js +291 -0
- package/features/hybrid-search.js +544 -0
- package/features/index-codebase.js +3268 -0
- package/features/lifecycle.js +1189 -0
- package/features/package-version.js +302 -0
- package/features/register.js +408 -0
- package/features/resources.js +156 -0
- package/features/set-workspace.js +265 -0
- package/index.js +96 -96
- package/lib/cache-ops.js +22 -22
- package/lib/cache-utils.js +565 -565
- package/lib/cache.js +1870 -1870
- package/lib/call-graph.js +396 -396
- package/lib/cli.js +1 -1
- package/lib/config.js +517 -517
- package/lib/constants.js +39 -39
- package/lib/embed-query-process.js +7 -7
- package/lib/embedding-process.js +7 -7
- package/lib/embedding-worker.js +299 -299
- package/lib/ignore-patterns.js +316 -316
- package/lib/json-worker.js +14 -14
- package/lib/json-writer.js +337 -337
- package/lib/logging.js +164 -164
- package/lib/memory-logger.js +13 -13
- package/lib/onnx-backend.js +193 -193
- package/lib/project-detector.js +84 -84
- package/lib/server-lifecycle.js +165 -165
- package/lib/settings-editor.js +754 -754
- package/lib/tokenizer.js +256 -256
- package/lib/utils.js +428 -428
- package/lib/vector-store-binary.js +627 -627
- package/lib/vector-store-sqlite.js +95 -95
- package/lib/workspace-env.js +28 -28
- package/mcp_config.json +9 -9
- package/package.json +86 -75
- package/scripts/clear-cache.js +20 -0
- package/scripts/download-model.js +43 -0
- package/scripts/mcp-launcher.js +49 -0
- package/scripts/postinstall.js +12 -0
- package/search-configs.js +36 -36
- package/.prettierrc +0 -7
- package/debug-pids.js +0 -30
- package/eslint.config.js +0 -36
- package/specs/plan.md +0 -23
- package/vitest.config.js +0 -39
package/lib/tokenizer.js
CHANGED
|
@@ -1,256 +1,256 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Token estimation and limits for embedding models
|
|
3
|
-
*
|
|
4
|
-
* Performance:
|
|
5
|
-
* - O(1) model lookups with precomputed maps
|
|
6
|
-
* - Zero regex / Zero allocations in hot loop
|
|
7
|
-
* - Proper LRU cache eviction
|
|
8
|
-
* - Optimized Unicode whitespace detection (ordered by probability)
|
|
9
|
-
* - Eliminated double toLowerCase() calls
|
|
10
|
-
* - Type-safe guard rails on all public APIs
|
|
11
|
-
* - Branchless special character counting
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';
|
|
15
|
-
|
|
16
|
-
const MODEL_TOKEN_LIMITS_RAW = {
|
|
17
|
-
// NOTE: While jina-embeddings-v2-base-code supports 8192 tokens, ONNX runtime
|
|
18
|
-
// allocates O(n²) memory for attention. Using 512 tokens for optimal speed
|
|
19
|
-
// with 4 ONNX threads (~1.5GB RAM, fastest inference).
|
|
20
|
-
'jinaai/jina-embeddings-v2-base-code': 512,
|
|
21
|
-
default: 512, // Safe default for BERT-like models
|
|
22
|
-
};
|
|
23
|
-
|
|
24
|
-
export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
|
|
25
|
-
? { ...MODEL_TOKEN_LIMITS_RAW }
|
|
26
|
-
: Object.freeze({ ...MODEL_TOKEN_LIMITS_RAW });
|
|
27
|
-
|
|
28
|
-
const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* Precomputed case-insensitive lookup
|
|
32
|
-
*/
|
|
33
|
-
const MODEL_LIMITS_LC = new Map();
|
|
34
|
-
for (const [k, v] of Object.entries(MODEL_TOKEN_LIMITS)) {
|
|
35
|
-
MODEL_LIMITS_LC.set(k.toLowerCase(), v);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* Internal helper: get model limit from pre-normalized key
|
|
40
|
-
* Avoids double toLowerCase() when called from cache flow
|
|
41
|
-
* @param {string} lowerName - Pre-normalized lowercase model name
|
|
42
|
-
* @param {*} originalName - Original model name (may not be a string)
|
|
43
|
-
* @returns {number} Token limit
|
|
44
|
-
*/
|
|
45
|
-
function getModelTokenLimitFromLower(lowerName, originalName) {
|
|
46
|
-
// Fast path: try exact match first (only if original is a string)
|
|
47
|
-
if (typeof originalName === 'string') {
|
|
48
|
-
const direct = MODEL_TOKEN_LIMITS[originalName];
|
|
49
|
-
if (direct !== undefined) return direct;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
// Slow path: use pre-normalized key
|
|
53
|
-
const exact = MODEL_LIMITS_LC.get(lowerName);
|
|
54
|
-
if (exact !== undefined) return exact;
|
|
55
|
-
|
|
56
|
-
// Heuristics for common models (use conservative limits for ONNX speed)
|
|
57
|
-
// 512 tokens = fastest, 1024 = 4x more compute due to O(n²) attention
|
|
58
|
-
if (
|
|
59
|
-
lowerName.includes('jina') ||
|
|
60
|
-
lowerName.includes('nomic') ||
|
|
61
|
-
lowerName.includes('gte-large')
|
|
62
|
-
) {
|
|
63
|
-
return 512;
|
|
64
|
-
}
|
|
65
|
-
if (lowerName.includes('gte-base') || lowerName.includes('gte-small')) {
|
|
66
|
-
return 512;
|
|
67
|
-
}
|
|
68
|
-
if (lowerName.includes('minilm')) {
|
|
69
|
-
return 512;
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
return DEFAULT_LIMIT;
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
/**
|
|
76
|
-
* Get the maximum token limit for a given model
|
|
77
|
-
* @param {string} modelName - The model name
|
|
78
|
-
* @returns {number} Maximum tokens supported by the model
|
|
79
|
-
*/
|
|
80
|
-
export function getModelTokenLimit(modelName) {
|
|
81
|
-
// Guard clause for non-string or empty inputs
|
|
82
|
-
if (typeof modelName !== 'string' || modelName.length === 0) return DEFAULT_LIMIT;
|
|
83
|
-
|
|
84
|
-
const direct = MODEL_TOKEN_LIMITS[modelName];
|
|
85
|
-
if (direct !== undefined) return direct;
|
|
86
|
-
|
|
87
|
-
const lower = modelName.toLowerCase();
|
|
88
|
-
return getModelTokenLimitFromLower(lower, modelName);
|
|
89
|
-
}
|
|
90
|
-
/**
|
|
91
|
-
* LRU cache for chunking parameters
|
|
92
|
-
* @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
|
|
93
|
-
*/
|
|
94
|
-
import { CHUNKING_PARAMS_CACHE_SIZE as MAX_CACHE_SIZE } from './constants.js';
|
|
95
|
-
const chunkingParamsCache = new Map();
|
|
96
|
-
|
|
97
|
-
/**
|
|
98
|
-
* Get chunking parameters for a model
|
|
99
|
-
* @param {string} modelName - The model name
|
|
100
|
-
* @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
|
|
101
|
-
*/
|
|
102
|
-
export function getChunkingParams(modelName) {
|
|
103
|
-
const key = typeof modelName === 'string' && modelName.length ? modelName.toLowerCase() : '';
|
|
104
|
-
|
|
105
|
-
// Fast path for invalid inputs: don't consume cache slots
|
|
106
|
-
if (key === '') {
|
|
107
|
-
const maxTokens = DEFAULT_LIMIT;
|
|
108
|
-
const targetTokens = Math.trunc(maxTokens * 0.85);
|
|
109
|
-
const overlapTokens = Math.trunc(targetTokens * 0.18);
|
|
110
|
-
return { maxTokens, targetTokens, overlapTokens };
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// LRU pattern: delete-and-reinsert to mark as most recently used.
|
|
114
|
-
// Note: This creates minor GC pressure due to Map key reallocation, but is
|
|
115
|
-
// acceptable for MAX_CACHE_SIZE=100. For larger caches (1000+), consider
|
|
116
|
-
// a doubly-linked-list LRU implementation for O(1) access without reallocation.
|
|
117
|
-
const cached = chunkingParamsCache.get(key);
|
|
118
|
-
if (cached) {
|
|
119
|
-
chunkingParamsCache.delete(key);
|
|
120
|
-
chunkingParamsCache.set(key, cached);
|
|
121
|
-
return cached;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
// Cache miss: compute new params (avoid double toLowerCase)
|
|
125
|
-
const maxTokens = getModelTokenLimitFromLower(key, modelName);
|
|
126
|
-
const targetTokens = Math.trunc(maxTokens * 0.85);
|
|
127
|
-
const overlapTokens = Math.trunc(targetTokens * 0.18);
|
|
128
|
-
|
|
129
|
-
const params = { maxTokens, targetTokens, overlapTokens };
|
|
130
|
-
|
|
131
|
-
// LRU eviction: remove oldest entry if at capacity
|
|
132
|
-
if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
|
|
133
|
-
const oldestKey = chunkingParamsCache.keys().next().value;
|
|
134
|
-
chunkingParamsCache.delete(oldestKey);
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
chunkingParamsCache.set(key, params);
|
|
138
|
-
return params;
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
/**
|
|
142
|
-
* ASCII whitespace lookup table
|
|
143
|
-
*/
|
|
144
|
-
const WS = new Uint8Array(128);
|
|
145
|
-
WS[9] = 1; // \t (horizontal tab)
|
|
146
|
-
WS[10] = 1; // \n (line feed)
|
|
147
|
-
WS[11] = 1; // \v (vertical tab)
|
|
148
|
-
WS[12] = 1; // \f (form feed)
|
|
149
|
-
WS[13] = 1; // \r (carriage return)
|
|
150
|
-
WS[32] = 1; // space
|
|
151
|
-
|
|
152
|
-
/**
|
|
153
|
-
* ASCII special character lookup table
|
|
154
|
-
*/
|
|
155
|
-
const SPECIAL = new Uint8Array(128);
|
|
156
|
-
const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
|
|
157
|
-
for (let i = 0; i < SPECIAL_CHARS.length; i++) {
|
|
158
|
-
SPECIAL[SPECIAL_CHARS.charCodeAt(i)] = 1;
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
/**
|
|
162
|
-
* Calculate token count for a word of given length
|
|
163
|
-
* This function will be inlined by V8
|
|
164
|
-
* @param {number} len - Word length in characters
|
|
165
|
-
* @returns {number} Estimated token count
|
|
166
|
-
*/
|
|
167
|
-
function calcWordTokens(len) {
|
|
168
|
-
if (len <= 4) return 1;
|
|
169
|
-
if (len <= 10) return 2;
|
|
170
|
-
return (len + 3) >> 2; // ceil(len / 4)
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
/**
|
|
174
|
-
* Estimate token count for text (conservative estimate for code)
|
|
175
|
-
*
|
|
176
|
-
* Performance optimizations:
|
|
177
|
-
* - No regex (pure integer comparisons)
|
|
178
|
-
* - No string allocations (charCodeAt only)
|
|
179
|
-
* - Inlined word token calculation
|
|
180
|
-
* - Unicode checks ordered by frequency
|
|
181
|
-
* - Branchless special character counting
|
|
182
|
-
*
|
|
183
|
-
* @param {string} text - The text to estimate tokens for
|
|
184
|
-
* @param {object} [options]
|
|
185
|
-
* @param {boolean} [options.includeSpecialTokens=true] - Whether to include [CLS]/[SEP]
|
|
186
|
-
* @returns {number} Estimated token count
|
|
187
|
-
*/
|
|
188
|
-
export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
|
|
189
|
-
// Type-safe guard: prevents crashes from non-string inputs
|
|
190
|
-
if (typeof text !== 'string' || text.length === 0) return 0;
|
|
191
|
-
|
|
192
|
-
const len = text.length;
|
|
193
|
-
let tokenCount = includeSpecialTokens ? 2 : 0; // [CLS] + [SEP]
|
|
194
|
-
let specialCount = 0;
|
|
195
|
-
let wordStart = -1;
|
|
196
|
-
|
|
197
|
-
for (let i = 0; i < len; i++) {
|
|
198
|
-
const code = text.charCodeAt(i);
|
|
199
|
-
|
|
200
|
-
// ASCII fast path (most common for code)
|
|
201
|
-
if (code < 128) {
|
|
202
|
-
if (WS[code]) {
|
|
203
|
-
if (wordStart !== -1) {
|
|
204
|
-
tokenCount += calcWordTokens(i - wordStart);
|
|
205
|
-
wordStart = -1;
|
|
206
|
-
}
|
|
207
|
-
} else {
|
|
208
|
-
// Branchless: add 0 or 1 based on SPECIAL[code]
|
|
209
|
-
specialCount += SPECIAL[code];
|
|
210
|
-
if (wordStart === -1) wordStart = i;
|
|
211
|
-
}
|
|
212
|
-
continue;
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
// Unicode whitespace: ordered by frequency for real-world text
|
|
216
|
-
// Note: Includes legacy 0x180E for tokenization compatibility even though
|
|
217
|
-
// modern JS \s doesn't consider it whitespace (ES2016+)
|
|
218
|
-
const isUnicodeWS =
|
|
219
|
-
code === 0x00a0 || // NBSP (most common)
|
|
220
|
-
code === 0x202f || // NARROW NO-BREAK SPACE
|
|
221
|
-
(code >= 0x2000 && code <= 0x200a) || // EN QUAD..HAIR SPACE
|
|
222
|
-
code === 0x3000 || // IDEOGRAPHIC SPACE (CJK)
|
|
223
|
-
code === 0x2028 || // LINE SEPARATOR
|
|
224
|
-
code === 0x2029 || // PARAGRAPH SEPARATOR
|
|
225
|
-
code === 0x205f || // MEDIUM MATHEMATICAL SPACE
|
|
226
|
-
code === 0x1680 || // OGHAM SPACE MARK
|
|
227
|
-
code === 0x180e || // MONGOLIAN VOWEL SEPARATOR (legacy)
|
|
228
|
-
code === 0x0085 || // NEXT LINE (NEL)
|
|
229
|
-
code === 0xfeff; // ZERO WIDTH NO-BREAK SPACE / BOM
|
|
230
|
-
|
|
231
|
-
if (isUnicodeWS) {
|
|
232
|
-
if (wordStart !== -1) {
|
|
233
|
-
tokenCount += calcWordTokens(i - wordStart);
|
|
234
|
-
wordStart = -1;
|
|
235
|
-
}
|
|
236
|
-
} else {
|
|
237
|
-
// Non-ASCII, non-whitespace (e.g., CJK, emojis, accented chars)
|
|
238
|
-
// Conservative estimate: treat each as 1 token
|
|
239
|
-
if (wordStart !== -1) {
|
|
240
|
-
tokenCount += calcWordTokens(i - wordStart);
|
|
241
|
-
wordStart = -1;
|
|
242
|
-
}
|
|
243
|
-
tokenCount++;
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
// Flush final word
|
|
248
|
-
if (wordStart !== -1) {
|
|
249
|
-
tokenCount += calcWordTokens(len - wordStart);
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
// Add ~50% of special chars as tokens
|
|
253
|
-
tokenCount += specialCount >> 1;
|
|
254
|
-
|
|
255
|
-
return tokenCount;
|
|
256
|
-
}
|
|
1
|
+
/**
 * Token estimation and limits for embedding models.
 *
 * Performance characteristics:
 * - O(1) model lookups via precomputed maps
 * - Zero regex / zero allocations in the hot loop
 * - Proper LRU cache eviction
 * - Unicode whitespace checks ordered by expected frequency
 * - Single toLowerCase() per lookup (no double normalization)
 * - Type-safe guards on all public APIs
 * - Branchless special-character counting
 */

const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';

const MODEL_TOKEN_LIMITS_RAW = {
  // NOTE: jina-embeddings-v2-base-code supports 8192 tokens, but the ONNX
  // runtime allocates O(n²) memory for attention. 512 tokens is the
  // speed-optimal window with 4 ONNX threads (~1.5GB RAM, fastest inference).
  'jinaai/jina-embeddings-v2-base-code': 512,
  default: 512, // safe default for BERT-like models
};

// Tests get a mutable copy so they can patch limits; production is frozen.
export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
  ? { ...MODEL_TOKEN_LIMITS_RAW }
  : Object.freeze({ ...MODEL_TOKEN_LIMITS_RAW });

const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;

/**
 * Precomputed case-insensitive lookup (lowercased name → limit).
 */
const MODEL_LIMITS_LC = new Map(
  Object.entries(MODEL_TOKEN_LIMITS).map(([name, limit]) => [name.toLowerCase(), limit])
);
|
|
37
|
+
|
|
38
|
+
/**
 * Resolve a model's token limit from an already-lowercased name.
 * Skips the second toLowerCase() when invoked from the cache flow.
 * @param {string} lowerName - Pre-normalized lowercase model name
 * @param {*} originalName - Caller-supplied name (may not be a string)
 * @returns {number} Token limit
 */
function getModelTokenLimitFromLower(lowerName, originalName) {
  // Fast path: exact (case-sensitive) hit on the original spelling.
  if (typeof originalName === 'string' && MODEL_TOKEN_LIMITS[originalName] !== undefined) {
    return MODEL_TOKEN_LIMITS[originalName];
  }

  // Slow path: case-insensitive table hit on the pre-normalized key.
  const tableHit = MODEL_LIMITS_LC.get(lowerName);
  if (tableHit !== undefined) {
    return tableHit;
  }

  // Substring heuristics for well-known model families. All resolve to 512:
  // ONNX attention is O(n²), so 512 tokens is the speed-optimal window
  // (1024 would cost ~4x the compute).
  if (
    lowerName.includes('jina') ||
    lowerName.includes('nomic') ||
    lowerName.includes('gte-large') ||
    lowerName.includes('gte-base') ||
    lowerName.includes('gte-small') ||
    lowerName.includes('minilm')
  ) {
    return 512;
  }

  return DEFAULT_LIMIT;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Get the maximum token limit for a given model.
 * @param {string} modelName - The model name
 * @returns {number} Maximum tokens supported by the model
 */
export function getModelTokenLimit(modelName) {
  // Non-string or empty input: fall back to the safe default.
  if (typeof modelName !== 'string' || modelName.length === 0) {
    return DEFAULT_LIMIT;
  }

  const exact = MODEL_TOKEN_LIMITS[modelName];
  return exact !== undefined
    ? exact
    : getModelTokenLimitFromLower(modelName.toLowerCase(), modelName);
}
|
|
90
|
+
import { CHUNKING_PARAMS_CACHE_SIZE as MAX_CACHE_SIZE } from './constants.js';

/**
 * LRU cache of computed chunking parameters, keyed by lowercased model name.
 * @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
 */
const chunkingParamsCache = new Map();
|
|
96
|
+
|
|
97
|
+
/**
 * Get chunking parameters for a model (LRU-cached).
 * @param {string} modelName - The model name
 * @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
 */
export function getChunkingParams(modelName) {
  const isValidName = typeof modelName === 'string' && modelName.length > 0;

  // Invalid input: compute defaults directly so bad keys never consume
  // cache slots.
  if (!isValidName) {
    const maxTokens = DEFAULT_LIMIT;
    const targetTokens = Math.trunc(maxTokens * 0.85);
    const overlapTokens = Math.trunc(targetTokens * 0.18);
    return { maxTokens, targetTokens, overlapTokens };
  }

  const key = modelName.toLowerCase();

  // LRU hit: delete-and-reinsert marks the entry most recently used.
  // This causes minor GC pressure from Map key churn, which is fine at
  // MAX_CACHE_SIZE=100; a linked-list LRU would be warranted at 1000+.
  const hit = chunkingParamsCache.get(key);
  if (hit !== undefined) {
    chunkingParamsCache.delete(key);
    chunkingParamsCache.set(key, hit);
    return hit;
  }

  // Cache miss: derive params from the model limit. The key is already
  // lowercased, so pass it straight through (no double toLowerCase).
  const maxTokens = getModelTokenLimitFromLower(key, modelName);
  const targetTokens = Math.trunc(maxTokens * 0.85);
  const overlapTokens = Math.trunc(targetTokens * 0.18);
  const params = { maxTokens, targetTokens, overlapTokens };

  // At capacity, evict the least recently used (oldest-inserted) entry.
  if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
    chunkingParamsCache.delete(chunkingParamsCache.keys().next().value);
  }

  chunkingParamsCache.set(key, params);
  return params;
}
|
|
140
|
+
|
|
141
|
+
/**
 * ASCII whitespace lookup table (1 = whitespace).
 * Covers \t, \n, \v, \f, \r, and space.
 */
const WS = new Uint8Array(128);
for (const code of [9, 10, 11, 12, 13, 32]) {
  WS[code] = 1;
}
|
|
151
|
+
|
|
152
|
+
/**
 * ASCII special-character lookup table (1 = punctuation/operator char).
 */
const SPECIAL = new Uint8Array(128);
const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
for (const ch of SPECIAL_CHARS) {
  SPECIAL[ch.charCodeAt(0)] = 1;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Estimate the token count of a single word from its character length.
 * Small enough for V8 to inline into the hot loop.
 * @param {number} len - Word length in characters
 * @returns {number} Estimated token count
 */
function calcWordTokens(len) {
  // Short words → 1 token; medium → 2; long → ceil(len / 4) via bit shift.
  return len <= 4 ? 1 : len <= 10 ? 2 : (len + 3) >> 2;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Estimate token count for text (conservative estimate for code).
 *
 * Performance notes:
 * - No regex; pure integer comparisons on char codes
 * - No string allocations (charCodeAt only)
 * - Word token calculation is inlinable
 * - Unicode whitespace checks ordered by expected frequency
 * - Branchless special-character counting
 *
 * @param {string} text - The text to estimate tokens for
 * @param {object} [options]
 * @param {boolean} [options.includeSpecialTokens=true] - Whether to include [CLS]/[SEP]
 * @returns {number} Estimated token count
 */
export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
  // Guard against non-string input so callers can't crash the hot path.
  if (typeof text !== 'string' || text.length === 0) return 0;

  const len = text.length;
  let tokens = includeSpecialTokens ? 2 : 0; // [CLS] + [SEP]
  let punctSeen = 0; // running count of ASCII special characters
  let runStart = -1; // index where the current word began, or -1 if none

  for (let i = 0; i < len; i++) {
    const code = text.charCodeAt(i);

    // ASCII fast path (dominant case for source code).
    if (code < 128) {
      if (WS[code] === 0) {
        punctSeen += SPECIAL[code]; // branchless: adds 0 or 1
        if (runStart < 0) runStart = i;
      } else if (runStart >= 0) {
        // Whitespace terminates the current word.
        tokens += calcWordTokens(i - runStart);
        runStart = -1;
      }
      continue;
    }

    // Unicode whitespace, ordered by frequency in real-world text.
    // Legacy 0x180E is kept for tokenizer compatibility even though modern
    // JS \s stopped matching it (ES2016+).
    const unicodeSpace =
      code === 0x00a0 || // NBSP (most common)
      code === 0x202f || // NARROW NO-BREAK SPACE
      (code >= 0x2000 && code <= 0x200a) || // EN QUAD..HAIR SPACE
      code === 0x3000 || // IDEOGRAPHIC SPACE (CJK)
      code === 0x2028 || // LINE SEPARATOR
      code === 0x2029 || // PARAGRAPH SEPARATOR
      code === 0x205f || // MEDIUM MATHEMATICAL SPACE
      code === 0x1680 || // OGHAM SPACE MARK
      code === 0x180e || // MONGOLIAN VOWEL SEPARATOR (legacy)
      code === 0x0085 || // NEXT LINE (NEL)
      code === 0xfeff; // ZERO WIDTH NO-BREAK SPACE / BOM

    if (!unicodeSpace) {
      // Non-ASCII, non-whitespace (CJK, emoji surrogates, accented chars):
      // flush any pending word, then count the character as one token.
      if (runStart >= 0) {
        tokens += calcWordTokens(i - runStart);
        runStart = -1;
      }
      tokens++;
    } else if (runStart >= 0) {
      // Unicode whitespace terminates the current word.
      tokens += calcWordTokens(i - runStart);
      runStart = -1;
    }
  }

  // Flush a word that runs to end-of-text.
  if (runStart >= 0) {
    tokens += calcWordTokens(len - runStart);
  }

  // Roughly half of the special characters become their own tokens.
  tokens += punctSeen >> 1;

  return tokens;
}
|