@softerist/heuristic-mcp 2.1.47 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/workflows/code-review.md +60 -0
- package/.prettierrc +7 -0
- package/ARCHITECTURE.md +105 -170
- package/CONTRIBUTING.md +32 -113
- package/GEMINI.md +73 -0
- package/LICENSE +21 -21
- package/README.md +161 -54
- package/config.json +876 -75
- package/debug-pids.js +27 -0
- package/eslint.config.js +36 -0
- package/features/ann-config.js +37 -26
- package/features/clear-cache.js +28 -19
- package/features/find-similar-code.js +142 -66
- package/features/hybrid-search.js +253 -93
- package/features/index-codebase.js +1455 -394
- package/features/lifecycle.js +813 -180
- package/features/register.js +58 -52
- package/index.js +450 -306
- package/lib/cache-ops.js +22 -0
- package/lib/cache-utils.js +68 -0
- package/lib/cache.js +1392 -587
- package/lib/call-graph.js +165 -50
- package/lib/cli.js +154 -0
- package/lib/config.js +462 -121
- package/lib/embedding-process.js +77 -0
- package/lib/embedding-worker.js +545 -30
- package/lib/ignore-patterns.js +61 -59
- package/lib/json-worker.js +14 -0
- package/lib/json-writer.js +344 -0
- package/lib/logging.js +88 -0
- package/lib/memory-logger.js +13 -0
- package/lib/project-detector.js +13 -17
- package/lib/server-lifecycle.js +38 -0
- package/lib/settings-editor.js +645 -0
- package/lib/tokenizer.js +207 -104
- package/lib/utils.js +273 -198
- package/lib/vector-store-binary.js +592 -0
- package/mcp_config.example.json +13 -0
- package/package.json +13 -2
- package/scripts/clear-cache.js +6 -17
- package/scripts/download-model.js +14 -9
- package/scripts/postinstall.js +5 -5
- package/search-configs.js +36 -0
- package/test/ann-config.test.js +179 -0
- package/test/ann-fallback.test.js +6 -6
- package/test/binary-store.test.js +69 -0
- package/test/cache-branches.test.js +120 -0
- package/test/cache-errors.test.js +264 -0
- package/test/cache-extra.test.js +300 -0
- package/test/cache-helpers.test.js +205 -0
- package/test/cache-hnsw-failure.test.js +40 -0
- package/test/cache-json-worker.test.js +190 -0
- package/test/cache-worker.test.js +102 -0
- package/test/cache.test.js +443 -0
- package/test/call-graph.test.js +103 -4
- package/test/clear-cache.test.js +69 -68
- package/test/code-review-workflow.test.js +50 -0
- package/test/config.test.js +418 -0
- package/test/coverage-gap.test.js +497 -0
- package/test/coverage-maximizer.test.js +236 -0
- package/test/debug-analysis.js +107 -0
- package/test/embedding-model.test.js +173 -103
- package/test/embedding-worker-extra.test.js +272 -0
- package/test/embedding-worker.test.js +158 -0
- package/test/features.test.js +139 -0
- package/test/final-boost.test.js +271 -0
- package/test/final-polish.test.js +183 -0
- package/test/final.test.js +95 -0
- package/test/find-similar-code.test.js +191 -0
- package/test/helpers.js +92 -11
- package/test/helpers.test.js +46 -0
- package/test/hybrid-search-basic.test.js +62 -0
- package/test/hybrid-search-branch.test.js +202 -0
- package/test/hybrid-search-callgraph.test.js +229 -0
- package/test/hybrid-search-extra.test.js +81 -0
- package/test/hybrid-search.test.js +484 -71
- package/test/index-cli.test.js +520 -0
- package/test/index-codebase-batch.test.js +119 -0
- package/test/index-codebase-branches.test.js +585 -0
- package/test/index-codebase-core.test.js +1032 -0
- package/test/index-codebase-edge-cases.test.js +254 -0
- package/test/index-codebase-errors.test.js +132 -0
- package/test/index-codebase-gap.test.js +239 -0
- package/test/index-codebase-lines.test.js +151 -0
- package/test/index-codebase-watcher.test.js +259 -0
- package/test/index-codebase-zone.test.js +259 -0
- package/test/index-codebase.test.js +371 -69
- package/test/index-memory.test.js +220 -0
- package/test/indexer-detailed.test.js +176 -0
- package/test/integration.test.js +148 -92
- package/test/json-worker.test.js +50 -0
- package/test/lifecycle.test.js +541 -0
- package/test/master.test.js +198 -0
- package/test/perfection.test.js +349 -0
- package/test/project-detector.test.js +65 -0
- package/test/register.test.js +262 -0
- package/test/tokenizer.test.js +55 -93
- package/test/ultra-maximizer.test.js +116 -0
- package/test/utils-branches.test.js +161 -0
- package/test/utils-extra.test.js +116 -0
- package/test/utils.test.js +131 -0
- package/test/verify_fixes.js +76 -0
- package/test/worker-errors.test.js +96 -0
- package/test/worker-init.test.js +102 -0
- package/test/worker_throttling.test.js +93 -0
- package/tools/scripts/benchmark-search.js +95 -0
- package/tools/scripts/cache-stats.js +71 -0
- package/tools/scripts/manual-search.js +34 -0
- package/vitest.config.js +19 -9
package/lib/tokenizer.js
CHANGED
|
@@ -1,142 +1,245 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Token estimation and limits for embedding models
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* Performance:
|
|
5
|
+
* - O(1) model lookups with precomputed maps
|
|
6
|
+
* - Zero regex / Zero allocations in hot loop
|
|
7
|
+
* - Proper LRU cache eviction
|
|
8
|
+
* - Optimized Unicode whitespace detection (ordered by probability)
|
|
9
|
+
* - Eliminated double toLowerCase() calls
|
|
10
|
+
* - Type-safe guard rails on all public APIs
|
|
11
|
+
* - Branchless special character counting
|
|
6
12
|
*/
|
|
7
13
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
"Xenova/all-MiniLM-L6-v2": 256,
|
|
15
|
-
"Xenova/all-MiniLM-L12-v2": 256,
|
|
16
|
-
"Xenova/paraphrase-MiniLM-L6-v2": 128,
|
|
17
|
-
"Xenova/paraphrase-MiniLM-L3-v2": 128,
|
|
14
|
+
// True when running under Vitest (VITEST=true) or with NODE_ENV=test.
const IS_TEST_ENV = process.env.NODE_ENV === 'test' || process.env.VITEST === 'true';

// Source-of-truth table: exact model name -> maximum input tokens.
const MODEL_TOKEN_LIMITS_RAW = {
  'jinaai/jina-embeddings-v2-base-code': 8192,
  default: 512, // Safe default for BERT-like models
};

/**
 * Public copy of the limits table.
 * Outside the test environment the copy is frozen; in the test environment it
 * is left as a plain mutable object (presumably so tests can patch entries).
 */
export const MODEL_TOKEN_LIMITS = !IS_TEST_ENV
  ? Object.freeze({ ...MODEL_TOKEN_LIMITS_RAW })
  : { ...MODEL_TOKEN_LIMITS_RAW };

// Limit used whenever a model cannot be resolved; falls back to 512 if the
// table has no `default` entry.
const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;

/**
 * Case-insensitive view of the table: lowercase model name -> limit.
 * Built once at module load from the entries present at that moment.
 */
const MODEL_LIMITS_LC = new Map(
  Object.entries(MODEL_TOKEN_LIMITS).map(([name, limit]) => [name.toLowerCase(), limit])
);
|
|
30
34
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
+
/**
 * Internal helper: resolve a model's token limit when the lowercase form of
 * the name is already available, so callers that lowercase the name for
 * another reason (e.g. to build a cache key) do not lowercase it twice.
 *
 * Resolution order:
 *   1. exact-case own entry of MODEL_TOKEN_LIMITS (only if originalName is a string)
 *   2. case-insensitive entry of MODEL_LIMITS_LC
 *   3. substring heuristics for well-known model families
 *   4. DEFAULT_LIMIT
 *
 * @param {string} lowerName - Pre-normalized lowercase model name
 * @param {*} originalName - Original model name (may not be a string)
 * @returns {number} Token limit
 */
function getModelTokenLimitFromLower(lowerName, originalName) {
  // Fast path: exact-case match. The own-property check matters: a bare
  // MODEL_TOKEN_LIMITS[name] also finds members inherited from
  // Object.prototype, so names such as "constructor" or "toString" would
  // yield a function instead of a number.
  if (
    typeof originalName === 'string' &&
    Object.prototype.hasOwnProperty.call(MODEL_TOKEN_LIMITS, originalName)
  ) {
    const direct = MODEL_TOKEN_LIMITS[originalName];
    if (direct !== undefined) return direct;
  }

  // Slow path: case-insensitive match on the pre-normalized key.
  const exact = MODEL_LIMITS_LC.get(lowerName);
  if (exact !== undefined) return exact;

  // Heuristics for common high-context models
  if (lowerName.includes('jina') || lowerName.includes('nomic') || lowerName.includes('gte-large')) {
    return 8192;
  }
  if (lowerName.includes('gte-base') || lowerName.includes('gte-small')) {
    return 512;
  }
  if (lowerName.includes('minilm')) {
    return 512;
  }

  return DEFAULT_LIMIT;
}
|
|
58
66
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
67
|
+
/**
 * Get the maximum token limit for a given model.
 *
 * Non-string or empty names resolve to DEFAULT_LIMIT. Otherwise an exact-case
 * entry of MODEL_TOKEN_LIMITS wins; failing that, the lowercase name is
 * resolved by getModelTokenLimitFromLower.
 *
 * @param {string} modelName - The model name
 * @returns {number} Maximum tokens supported by the model
 */
export function getModelTokenLimit(modelName) {
  // Guard clause for non-string or empty inputs
  if (typeof modelName !== 'string' || modelName.length === 0) return DEFAULT_LIMIT;

  // Own-property check: a bare MODEL_TOKEN_LIMITS[modelName] would also find
  // inherited Object.prototype members ("constructor", "toString", ...) and
  // return a function instead of a number.
  if (Object.prototype.hasOwnProperty.call(MODEL_TOKEN_LIMITS, modelName)) {
    const direct = MODEL_TOKEN_LIMITS[modelName];
    if (direct !== undefined) return direct;
  }

  // The exact-case lookup has already been done above, so pass a non-string
  // originalName: the helper only runs its own exact-case fast path when
  // originalName is a string, which makes it skip the redundant lookup.
  return getModelTokenLimitFromLower(modelName.toLowerCase(), undefined);
}
|
|
82
|
+
/** Maximum number of distinct model names kept in the chunking-params cache. */
const MAX_CACHE_SIZE = 100;

/**
 * LRU cache for chunking parameters, keyed by lowercase model name.
 * Relies on Map iterating in insertion order: the first key is the least
 * recently used entry, and a hit is re-inserted to move it to the end.
 * @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
 */
const chunkingParamsCache = new Map();

/** targetTokens is this fraction of maxTokens (truncated). */
const TARGET_TOKEN_RATIO = 0.85;

/** overlapTokens is this fraction of targetTokens (truncated). */
const OVERLAP_TOKEN_RATIO = 0.18;

/**
 * Derive the chunking parameters from a model's maximum token count.
 * @param {number} maxTokens - Model token limit
 * @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
 */
function buildChunkingParams(maxTokens) {
  const targetTokens = Math.trunc(maxTokens * TARGET_TOKEN_RATIO);
  const overlapTokens = Math.trunc(targetTokens * OVERLAP_TOKEN_RATIO);
  return { maxTokens, targetTokens, overlapTokens };
}

/**
 * Get chunking parameters for a model.
 *
 * Results are cached per lowercase model name; repeated calls for the same
 * model return the same object instance, so callers should treat it as
 * read-only. Non-string or empty names get freshly built default parameters
 * and never occupy a cache slot.
 *
 * @param {string} modelName - The model name
 * @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
 */
export function getChunkingParams(modelName) {
  const key = typeof modelName === 'string' && modelName.length ? modelName.toLowerCase() : '';

  // Fast path for invalid inputs: don't consume cache slots
  if (key === '') {
    return buildChunkingParams(DEFAULT_LIMIT);
  }

  // LRU: on a hit, delete and re-insert to mark the entry as most recently used
  const cached = chunkingParamsCache.get(key);
  if (cached !== undefined) {
    chunkingParamsCache.delete(key);
    chunkingParamsCache.set(key, cached);
    return cached;
  }

  // Cache miss: the key is already lowercase, so reuse it for the lookup
  const params = buildChunkingParams(getModelTokenLimitFromLower(key, modelName));

  // LRU eviction: drop the oldest (first-inserted) entry when at capacity
  if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
    const oldestKey = chunkingParamsCache.keys().next().value;
    chunkingParamsCache.delete(oldestKey);
  }

  chunkingParamsCache.set(key, params);
  return params;
}
|
|
131
|
+
|
|
132
|
+
/**
 * ASCII whitespace lookup table.
 * WS[code] is 1 when the ASCII code is whitespace, 0 otherwise.
 */
const WS = new Uint8Array(128);
const ASCII_WHITESPACE_CODES = [
  9, // \t (horizontal tab)
  10, // \n (line feed)
  11, // \v (vertical tab)
  12, // \f (form feed)
  13, // \r (carriage return)
  32, // space
];
for (const code of ASCII_WHITESPACE_CODES) {
  WS[code] = 1;
}

/**
 * ASCII special character lookup table.
 * SPECIAL[code] is 1 for every character listed in SPECIAL_CHARS, 0 otherwise.
 */
const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
const SPECIAL = new Uint8Array(128);
for (const ch of SPECIAL_CHARS) {
  SPECIAL[ch.charCodeAt(0)] = 1;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Estimate how many tokens a single whitespace-delimited word contributes.
 *
 * Up to 4 characters count as 1 token, 5-10 characters as 2 tokens, and
 * longer words as one token per 4 characters, rounded up.
 *
 * @param {number} len - Word length in characters
 * @returns {number} Estimated token count
 */
function calcWordTokens(len) {
  // (len + 3) >> 2 is integer ceil(len / 4)
  return len <= 4 ? 1 : len <= 10 ? 2 : (len + 3) >> 2;
}
|
|
91
163
|
|
|
92
164
|
/**
 * Whitespace test for code units >= 128.
 * Includes legacy U+180E for tokenization compatibility even though modern
 * JS \s does not consider it whitespace (ES2016+).
 * @param {number} code - UTF-16 code unit
 * @returns {boolean} True when the code unit is treated as whitespace
 */
function isNonAsciiWhitespace(code) {
  // EN QUAD..HAIR SPACE
  if (code >= 0x2000 && code <= 0x200a) return true;

  switch (code) {
    case 0x00a0: // NBSP
    case 0x202f: // NARROW NO-BREAK SPACE
    case 0x3000: // IDEOGRAPHIC SPACE (CJK)
    case 0x2028: // LINE SEPARATOR
    case 0x2029: // PARAGRAPH SEPARATOR
    case 0x205f: // MEDIUM MATHEMATICAL SPACE
    case 0x1680: // OGHAM SPACE MARK
    case 0x180e: // MONGOLIAN VOWEL SEPARATOR (legacy)
    case 0x0085: // NEXT LINE (NEL)
    case 0xfeff: // ZERO WIDTH NO-BREAK SPACE / BOM
      return true;
    default:
      return false;
  }
}

/**
 * Estimate token count for text (conservative estimate for code).
 *
 * Single pass over the UTF-16 code units using charCodeAt only:
 * - starts at 2 to account for [CLS] + [SEP]
 * - each run of non-whitespace ASCII is a "word" scored by calcWordTokens
 * - each non-ASCII, non-whitespace code unit counts as 1 token and ends the
 *   current word (so a character encoded as a surrogate pair counts as 2)
 * - half of the ASCII special characters (rounded down) are added on top
 *
 * @param {string} text - The text to estimate tokens for
 * @returns {number} Estimated token count (0 for non-string or empty input)
 */
export function estimateTokens(text) {
  // Type-safe guard: non-string and empty inputs estimate to zero
  if (typeof text !== 'string' || text.length === 0) return 0;

  const end = text.length;
  let total = 2; // [CLS] + [SEP]
  let specials = 0;
  let runStart = -1; // index where the current ASCII word began, -1 if none

  for (let pos = 0; pos < end; pos++) {
    const unit = text.charCodeAt(pos);
    const isAscii = unit < 128;

    // ASCII non-whitespace: extend (or open) the current word
    if (isAscii && !WS[unit]) {
      specials += SPECIAL[unit]; // table holds 0 or 1
      if (runStart === -1) runStart = pos;
      continue;
    }

    // Anything else (whitespace or non-ASCII) closes the current word
    if (runStart !== -1) {
      total += calcWordTokens(pos - runStart);
      runStart = -1;
    }

    // Non-ASCII, non-whitespace (e.g., CJK, emojis, accented chars):
    // conservative estimate of one token per code unit
    if (!isAscii && !isNonAsciiWhitespace(unit)) {
      total += 1;
    }
  }

  // Flush final word
  if (runStart !== -1) {
    total += calcWordTokens(end - runStart);
  }

  // Add ~50% of special chars as tokens
  return total + (specials >> 1);
}
|