@softerist/heuristic-mcp 3.0.17 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.jsonc +23 -6
- package/features/ann-config.js +7 -14
- package/features/clear-cache.js +3 -3
- package/features/find-similar-code.js +17 -22
- package/features/hybrid-search.js +59 -67
- package/features/index-codebase.js +305 -268
- package/features/lifecycle.js +370 -176
- package/features/package-version.js +15 -26
- package/features/register.js +75 -57
- package/features/resources.js +21 -47
- package/features/set-workspace.js +31 -43
- package/index.js +818 -172
- package/lib/cache-utils.js +95 -99
- package/lib/cache.js +121 -166
- package/lib/cli.js +246 -238
- package/lib/config.js +232 -62
- package/lib/constants.js +22 -2
- package/lib/embed-query-process.js +13 -29
- package/lib/embedding-process.js +29 -19
- package/lib/embedding-worker.js +166 -149
- package/lib/ignore-patterns.js +39 -39
- package/lib/json-writer.js +7 -34
- package/lib/logging.js +11 -42
- package/lib/onnx-backend.js +4 -4
- package/lib/path-utils.js +4 -21
- package/lib/project-detector.js +3 -3
- package/lib/server-lifecycle.js +109 -15
- package/lib/settings-editor.js +25 -18
- package/lib/slice-normalize.js +6 -16
- package/lib/tokenizer.js +56 -109
- package/lib/utils.js +62 -81
- package/lib/vector-store-binary.js +7 -7
- package/lib/vector-store-sqlite.js +35 -67
- package/lib/workspace-cache-key.js +36 -0
- package/lib/workspace-env.js +55 -14
- package/package.json +86 -86
package/lib/settings-editor.js
CHANGED
|
@@ -734,21 +734,28 @@ export function upsertMcpServerEntryInToml(text, serverName, serverConfig) {
|
|
|
734
734
|
return `${withTrailingNewline}${newline}${section}${newline}`;
|
|
735
735
|
}
|
|
736
736
|
|
|
737
|
-
export function setMcpServerDisabledInToml(text, serverName, disabled) {
|
|
738
|
-
const source = String(text || '');
|
|
739
|
-
const sectionName = `mcp_servers.${serverName}`;
|
|
740
|
-
const range = findTomlSectionRange(source, sectionName);
|
|
741
|
-
|
|
742
|
-
if (!range) {
|
|
743
|
-
return source;
|
|
744
|
-
}
|
|
745
|
-
|
|
746
|
-
const sectionBlock = source.slice(range.start, range.end);
|
|
747
|
-
const newline = detectNewline(sectionBlock || '\n');
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
}
|
|
737
|
+
export function setMcpServerDisabledInToml(text, serverName, disabled) {
|
|
738
|
+
const source = String(text || '');
|
|
739
|
+
const sectionName = `mcp_servers.${serverName}`;
|
|
740
|
+
const range = findTomlSectionRange(source, sectionName);
|
|
741
|
+
|
|
742
|
+
if (!range) {
|
|
743
|
+
return source;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
const sectionBlock = source.slice(range.start, range.end);
|
|
747
|
+
const newline = detectNewline(sectionBlock || '\n');
|
|
748
|
+
if (disabled) {
|
|
749
|
+
const disabledLine = `disabled = true`;
|
|
750
|
+
const updatedSection = /^\s*disabled\s*=.*$/m.test(sectionBlock)
|
|
751
|
+
? sectionBlock.replace(/^\s*disabled\s*=.*$/m, disabledLine)
|
|
752
|
+
: `${sectionBlock.trimEnd()}${newline}${disabledLine}${newline}`;
|
|
753
|
+
|
|
754
|
+
return `${source.slice(0, range.start)}${updatedSection}${source.slice(range.end)}`;
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
const cleanedSection = sectionBlock
|
|
758
|
+
.replace(/^\s*disabled\s*=.*$/m, '')
|
|
759
|
+
.replace(/\n\s*\n$/, '\n');
|
|
760
|
+
return `${source.slice(0, range.start)}${cleanedSection}${source.slice(range.end)}`;
|
|
761
|
+
}
|
package/lib/slice-normalize.js
CHANGED
|
@@ -1,19 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
* Slice and L2-normalize a vector for MRL (Matryoshka Representation Learning).
|
|
3
|
-
* If targetDim is null/undefined or >= vector length, returns the original vector unchanged.
|
|
4
|
-
* @param {Float32Array} vector - The full embedding vector
|
|
5
|
-
* @param {number|null} targetDim - Target dimension (64/128/256/512/768 or null)
|
|
6
|
-
* @returns {Float32Array} - Sliced and normalized vector, or original if no slicing
|
|
7
|
-
*/
|
|
1
|
+
|
|
8
2
|
export function sliceAndNormalize(vector, targetDim) {
|
|
9
3
|
if (!targetDim || targetDim >= vector.length) {
|
|
10
4
|
return vector;
|
|
11
5
|
}
|
|
12
6
|
|
|
13
|
-
|
|
7
|
+
|
|
14
8
|
const sliced = vector.slice(0, targetDim);
|
|
15
9
|
|
|
16
|
-
|
|
10
|
+
|
|
17
11
|
let sumSquares = 0;
|
|
18
12
|
for (let i = 0; i < targetDim; i++) {
|
|
19
13
|
sumSquares += sliced[i] * sliced[i];
|
|
@@ -29,13 +23,9 @@ export function sliceAndNormalize(vector, targetDim) {
|
|
|
29
23
|
return sliced;
|
|
30
24
|
}
|
|
31
25
|
|
|
32
|
-
|
|
33
|
-
* Convert any array-like to Float32Array (always creates a copy).
|
|
34
|
-
* @param {ArrayLike<number>} vector - Input vector
|
|
35
|
-
* @returns {Float32Array} - Copy as Float32Array
|
|
36
|
-
*/
|
|
26
|
+
|
|
37
27
|
export function toFloat32Array(vector) {
|
|
38
|
-
|
|
39
|
-
|
|
28
|
+
|
|
29
|
+
|
|
40
30
|
return new Float32Array(vector);
|
|
41
31
|
}
|
package/lib/tokenizer.js
CHANGED
|
@@ -1,24 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
* Token estimation and limits for embedding models
|
|
3
|
-
*
|
|
4
|
-
* Performance:
|
|
5
|
-
* - O(1) model lookups with precomputed maps
|
|
6
|
-
* - Zero regex / Zero allocations in hot loop
|
|
7
|
-
* - Proper LRU cache eviction
|
|
8
|
-
* - Optimized Unicode whitespace detection (ordered by probability)
|
|
9
|
-
* - Eliminated double toLowerCase() calls
|
|
10
|
-
* - Type-safe guard rails on all public APIs
|
|
11
|
-
* - Branchless special character counting
|
|
12
|
-
*/
|
|
1
|
+
|
|
13
2
|
|
|
14
3
|
const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';
|
|
15
4
|
|
|
16
5
|
const MODEL_TOKEN_LIMITS_RAW = {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
20
9
|
'jinaai/jina-embeddings-v2-base-code': 512,
|
|
21
|
-
default: 512,
|
|
10
|
+
default: 512,
|
|
22
11
|
};
|
|
23
12
|
|
|
24
13
|
export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
|
|
@@ -27,34 +16,26 @@ export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
|
|
|
27
16
|
|
|
28
17
|
const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;
|
|
29
18
|
|
|
30
|
-
|
|
31
|
-
* Precomputed case-insensitive lookup
|
|
32
|
-
*/
|
|
19
|
+
|
|
33
20
|
const MODEL_LIMITS_LC = new Map();
|
|
34
21
|
for (const [k, v] of Object.entries(MODEL_TOKEN_LIMITS)) {
|
|
35
22
|
MODEL_LIMITS_LC.set(k.toLowerCase(), v);
|
|
36
23
|
}
|
|
37
24
|
|
|
38
|
-
|
|
39
|
-
* Internal helper: get model limit from pre-normalized key
|
|
40
|
-
* Avoids double toLowerCase() when called from cache flow
|
|
41
|
-
* @param {string} lowerName - Pre-normalized lowercase model name
|
|
42
|
-
* @param {*} originalName - Original model name (may not be a string)
|
|
43
|
-
* @returns {number} Token limit
|
|
44
|
-
*/
|
|
25
|
+
|
|
45
26
|
function getModelTokenLimitFromLower(lowerName, originalName) {
|
|
46
|
-
|
|
27
|
+
|
|
47
28
|
if (typeof originalName === 'string') {
|
|
48
29
|
const direct = MODEL_TOKEN_LIMITS[originalName];
|
|
49
30
|
if (direct !== undefined) return direct;
|
|
50
31
|
}
|
|
51
32
|
|
|
52
|
-
|
|
33
|
+
|
|
53
34
|
const exact = MODEL_LIMITS_LC.get(lowerName);
|
|
54
35
|
if (exact !== undefined) return exact;
|
|
55
36
|
|
|
56
|
-
|
|
57
|
-
|
|
37
|
+
|
|
38
|
+
|
|
58
39
|
if (
|
|
59
40
|
lowerName.includes('jina') ||
|
|
60
41
|
lowerName.includes('nomic') ||
|
|
@@ -72,13 +53,9 @@ function getModelTokenLimitFromLower(lowerName, originalName) {
|
|
|
72
53
|
return DEFAULT_LIMIT;
|
|
73
54
|
}
|
|
74
55
|
|
|
75
|
-
|
|
76
|
-
* Get the maximum token limit for a given model
|
|
77
|
-
* @param {string} modelName - The model name
|
|
78
|
-
* @returns {number} Maximum tokens supported by the model
|
|
79
|
-
*/
|
|
56
|
+
|
|
80
57
|
export function getModelTokenLimit(modelName) {
|
|
81
|
-
|
|
58
|
+
|
|
82
59
|
if (typeof modelName !== 'string' || modelName.length === 0) return DEFAULT_LIMIT;
|
|
83
60
|
|
|
84
61
|
const direct = MODEL_TOKEN_LIMITS[modelName];
|
|
@@ -87,22 +64,15 @@ export function getModelTokenLimit(modelName) {
|
|
|
87
64
|
const lower = modelName.toLowerCase();
|
|
88
65
|
return getModelTokenLimitFromLower(lower, modelName);
|
|
89
66
|
}
|
|
90
|
-
|
|
91
|
-
* LRU cache for chunking parameters
|
|
92
|
-
* @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
|
|
93
|
-
*/
|
|
67
|
+
|
|
94
68
|
import { CHUNKING_PARAMS_CACHE_SIZE as MAX_CACHE_SIZE } from './constants.js';
|
|
95
69
|
const chunkingParamsCache = new Map();
|
|
96
70
|
|
|
97
|
-
|
|
98
|
-
* Get chunking parameters for a model
|
|
99
|
-
* @param {string} modelName - The model name
|
|
100
|
-
* @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
|
|
101
|
-
*/
|
|
71
|
+
|
|
102
72
|
export function getChunkingParams(modelName) {
|
|
103
73
|
const key = typeof modelName === 'string' && modelName.length ? modelName.toLowerCase() : '';
|
|
104
74
|
|
|
105
|
-
|
|
75
|
+
|
|
106
76
|
if (key === '') {
|
|
107
77
|
const maxTokens = DEFAULT_LIMIT;
|
|
108
78
|
const targetTokens = Math.trunc(maxTokens * 0.85);
|
|
@@ -110,10 +80,10 @@ export function getChunkingParams(modelName) {
|
|
|
110
80
|
return { maxTokens, targetTokens, overlapTokens };
|
|
111
81
|
}
|
|
112
82
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
|
|
117
87
|
const cached = chunkingParamsCache.get(key);
|
|
118
88
|
if (cached) {
|
|
119
89
|
chunkingParamsCache.delete(key);
|
|
@@ -121,14 +91,14 @@ export function getChunkingParams(modelName) {
|
|
|
121
91
|
return cached;
|
|
122
92
|
}
|
|
123
93
|
|
|
124
|
-
|
|
94
|
+
|
|
125
95
|
const maxTokens = getModelTokenLimitFromLower(key, modelName);
|
|
126
96
|
const targetTokens = Math.trunc(maxTokens * 0.85);
|
|
127
97
|
const overlapTokens = Math.trunc(targetTokens * 0.18);
|
|
128
98
|
|
|
129
99
|
const params = { maxTokens, targetTokens, overlapTokens };
|
|
130
100
|
|
|
131
|
-
|
|
101
|
+
|
|
132
102
|
if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
|
|
133
103
|
const oldestKey = chunkingParamsCache.keys().next().value;
|
|
134
104
|
chunkingParamsCache.delete(oldestKey);
|
|
@@ -138,66 +108,43 @@ export function getChunkingParams(modelName) {
|
|
|
138
108
|
return params;
|
|
139
109
|
}
|
|
140
110
|
|
|
141
|
-
|
|
142
|
-
* ASCII whitespace lookup table
|
|
143
|
-
*/
|
|
111
|
+
|
|
144
112
|
const WS = new Uint8Array(128);
|
|
145
|
-
WS[9] = 1;
|
|
146
|
-
WS[10] = 1;
|
|
147
|
-
WS[11] = 1;
|
|
148
|
-
WS[12] = 1;
|
|
149
|
-
WS[13] = 1;
|
|
150
|
-
WS[32] = 1;
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
* ASCII special character lookup table
|
|
154
|
-
*/
|
|
113
|
+
WS[9] = 1;
|
|
114
|
+
WS[10] = 1;
|
|
115
|
+
WS[11] = 1;
|
|
116
|
+
WS[12] = 1;
|
|
117
|
+
WS[13] = 1;
|
|
118
|
+
WS[32] = 1;
|
|
119
|
+
|
|
120
|
+
|
|
155
121
|
const SPECIAL = new Uint8Array(128);
|
|
156
122
|
const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
|
|
157
123
|
for (let i = 0; i < SPECIAL_CHARS.length; i++) {
|
|
158
124
|
SPECIAL[SPECIAL_CHARS.charCodeAt(i)] = 1;
|
|
159
125
|
}
|
|
160
126
|
|
|
161
|
-
|
|
162
|
-
* Calculate token count for a word of given length
|
|
163
|
-
* This function will be inlined by V8
|
|
164
|
-
* @param {number} len - Word length in characters
|
|
165
|
-
* @returns {number} Estimated token count
|
|
166
|
-
*/
|
|
127
|
+
|
|
167
128
|
function calcWordTokens(len) {
|
|
168
129
|
if (len <= 4) return 1;
|
|
169
130
|
if (len <= 10) return 2;
|
|
170
|
-
return (len + 3) >> 2;
|
|
131
|
+
return (len + 3) >> 2;
|
|
171
132
|
}
|
|
172
133
|
|
|
173
|
-
|
|
174
|
-
* Estimate token count for text (conservative estimate for code)
|
|
175
|
-
*
|
|
176
|
-
* Performance optimizations:
|
|
177
|
-
* - No regex (pure integer comparisons)
|
|
178
|
-
* - No string allocations (charCodeAt only)
|
|
179
|
-
* - Inlined word token calculation
|
|
180
|
-
* - Unicode checks ordered by frequency
|
|
181
|
-
* - Branchless special character counting
|
|
182
|
-
*
|
|
183
|
-
* @param {string} text - The text to estimate tokens for
|
|
184
|
-
* @param {object} [options]
|
|
185
|
-
* @param {boolean} [options.includeSpecialTokens=true] - Whether to include [CLS]/[SEP]
|
|
186
|
-
* @returns {number} Estimated token count
|
|
187
|
-
*/
|
|
134
|
+
|
|
188
135
|
export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
|
|
189
|
-
|
|
136
|
+
|
|
190
137
|
if (typeof text !== 'string' || text.length === 0) return 0;
|
|
191
138
|
|
|
192
139
|
const len = text.length;
|
|
193
|
-
let tokenCount = includeSpecialTokens ? 2 : 0;
|
|
140
|
+
let tokenCount = includeSpecialTokens ? 2 : 0;
|
|
194
141
|
let specialCount = 0;
|
|
195
142
|
let wordStart = -1;
|
|
196
143
|
|
|
197
144
|
for (let i = 0; i < len; i++) {
|
|
198
145
|
const code = text.charCodeAt(i);
|
|
199
146
|
|
|
200
|
-
|
|
147
|
+
|
|
201
148
|
if (code < 128) {
|
|
202
149
|
if (WS[code]) {
|
|
203
150
|
if (wordStart !== -1) {
|
|
@@ -205,28 +152,28 @@ export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
|
|
|
205
152
|
wordStart = -1;
|
|
206
153
|
}
|
|
207
154
|
} else {
|
|
208
|
-
|
|
155
|
+
|
|
209
156
|
specialCount += SPECIAL[code];
|
|
210
157
|
if (wordStart === -1) wordStart = i;
|
|
211
158
|
}
|
|
212
159
|
continue;
|
|
213
160
|
}
|
|
214
161
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
|
|
218
165
|
const isUnicodeWS =
|
|
219
|
-
code === 0x00a0 ||
|
|
220
|
-
code === 0x202f ||
|
|
221
|
-
(code >= 0x2000 && code <= 0x200a) ||
|
|
222
|
-
code === 0x3000 ||
|
|
223
|
-
code === 0x2028 ||
|
|
224
|
-
code === 0x2029 ||
|
|
225
|
-
code === 0x205f ||
|
|
226
|
-
code === 0x1680 ||
|
|
227
|
-
code === 0x180e ||
|
|
228
|
-
code === 0x0085 ||
|
|
229
|
-
code === 0xfeff;
|
|
166
|
+
code === 0x00a0 ||
|
|
167
|
+
code === 0x202f ||
|
|
168
|
+
(code >= 0x2000 && code <= 0x200a) ||
|
|
169
|
+
code === 0x3000 ||
|
|
170
|
+
code === 0x2028 ||
|
|
171
|
+
code === 0x2029 ||
|
|
172
|
+
code === 0x205f ||
|
|
173
|
+
code === 0x1680 ||
|
|
174
|
+
code === 0x180e ||
|
|
175
|
+
code === 0x0085 ||
|
|
176
|
+
code === 0xfeff;
|
|
230
177
|
|
|
231
178
|
if (isUnicodeWS) {
|
|
232
179
|
if (wordStart !== -1) {
|
|
@@ -234,8 +181,8 @@ export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
|
|
|
234
181
|
wordStart = -1;
|
|
235
182
|
}
|
|
236
183
|
} else {
|
|
237
|
-
|
|
238
|
-
|
|
184
|
+
|
|
185
|
+
|
|
239
186
|
if (wordStart !== -1) {
|
|
240
187
|
tokenCount += calcWordTokens(i - wordStart);
|
|
241
188
|
wordStart = -1;
|
|
@@ -244,12 +191,12 @@ export function estimateTokens(text, { includeSpecialTokens = true } = {}) {
|
|
|
244
191
|
}
|
|
245
192
|
}
|
|
246
193
|
|
|
247
|
-
|
|
194
|
+
|
|
248
195
|
if (wordStart !== -1) {
|
|
249
196
|
tokenCount += calcWordTokens(len - wordStart);
|
|
250
197
|
}
|
|
251
198
|
|
|
252
|
-
|
|
199
|
+
|
|
253
200
|
tokenCount += specialCount >> 1;
|
|
254
201
|
|
|
255
202
|
return tokenCount;
|