@softerist/heuristic-mcp 2.1.47 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/.agent/workflows/code-review.md +60 -0
  2. package/.prettierrc +7 -0
  3. package/ARCHITECTURE.md +105 -170
  4. package/CONTRIBUTING.md +32 -113
  5. package/GEMINI.md +73 -0
  6. package/LICENSE +21 -21
  7. package/README.md +161 -54
  8. package/config.json +876 -75
  9. package/debug-pids.js +27 -0
  10. package/eslint.config.js +36 -0
  11. package/features/ann-config.js +37 -26
  12. package/features/clear-cache.js +28 -19
  13. package/features/find-similar-code.js +142 -66
  14. package/features/hybrid-search.js +253 -93
  15. package/features/index-codebase.js +1455 -394
  16. package/features/lifecycle.js +813 -180
  17. package/features/register.js +58 -52
  18. package/index.js +450 -306
  19. package/lib/cache-ops.js +22 -0
  20. package/lib/cache-utils.js +68 -0
  21. package/lib/cache.js +1392 -587
  22. package/lib/call-graph.js +165 -50
  23. package/lib/cli.js +154 -0
  24. package/lib/config.js +462 -121
  25. package/lib/embedding-process.js +77 -0
  26. package/lib/embedding-worker.js +545 -30
  27. package/lib/ignore-patterns.js +61 -59
  28. package/lib/json-worker.js +14 -0
  29. package/lib/json-writer.js +344 -0
  30. package/lib/logging.js +88 -0
  31. package/lib/memory-logger.js +13 -0
  32. package/lib/project-detector.js +13 -17
  33. package/lib/server-lifecycle.js +38 -0
  34. package/lib/settings-editor.js +645 -0
  35. package/lib/tokenizer.js +207 -104
  36. package/lib/utils.js +273 -198
  37. package/lib/vector-store-binary.js +592 -0
  38. package/mcp_config.example.json +13 -0
  39. package/package.json +13 -2
  40. package/scripts/clear-cache.js +6 -17
  41. package/scripts/download-model.js +14 -9
  42. package/scripts/postinstall.js +5 -5
  43. package/search-configs.js +36 -0
  44. package/test/ann-config.test.js +179 -0
  45. package/test/ann-fallback.test.js +6 -6
  46. package/test/binary-store.test.js +69 -0
  47. package/test/cache-branches.test.js +120 -0
  48. package/test/cache-errors.test.js +264 -0
  49. package/test/cache-extra.test.js +300 -0
  50. package/test/cache-helpers.test.js +205 -0
  51. package/test/cache-hnsw-failure.test.js +40 -0
  52. package/test/cache-json-worker.test.js +190 -0
  53. package/test/cache-worker.test.js +102 -0
  54. package/test/cache.test.js +443 -0
  55. package/test/call-graph.test.js +103 -4
  56. package/test/clear-cache.test.js +69 -68
  57. package/test/code-review-workflow.test.js +50 -0
  58. package/test/config.test.js +418 -0
  59. package/test/coverage-gap.test.js +497 -0
  60. package/test/coverage-maximizer.test.js +236 -0
  61. package/test/debug-analysis.js +107 -0
  62. package/test/embedding-model.test.js +173 -103
  63. package/test/embedding-worker-extra.test.js +272 -0
  64. package/test/embedding-worker.test.js +158 -0
  65. package/test/features.test.js +139 -0
  66. package/test/final-boost.test.js +271 -0
  67. package/test/final-polish.test.js +183 -0
  68. package/test/final.test.js +95 -0
  69. package/test/find-similar-code.test.js +191 -0
  70. package/test/helpers.js +92 -11
  71. package/test/helpers.test.js +46 -0
  72. package/test/hybrid-search-basic.test.js +62 -0
  73. package/test/hybrid-search-branch.test.js +202 -0
  74. package/test/hybrid-search-callgraph.test.js +229 -0
  75. package/test/hybrid-search-extra.test.js +81 -0
  76. package/test/hybrid-search.test.js +484 -71
  77. package/test/index-cli.test.js +520 -0
  78. package/test/index-codebase-batch.test.js +119 -0
  79. package/test/index-codebase-branches.test.js +585 -0
  80. package/test/index-codebase-core.test.js +1032 -0
  81. package/test/index-codebase-edge-cases.test.js +254 -0
  82. package/test/index-codebase-errors.test.js +132 -0
  83. package/test/index-codebase-gap.test.js +239 -0
  84. package/test/index-codebase-lines.test.js +151 -0
  85. package/test/index-codebase-watcher.test.js +259 -0
  86. package/test/index-codebase-zone.test.js +259 -0
  87. package/test/index-codebase.test.js +371 -69
  88. package/test/index-memory.test.js +220 -0
  89. package/test/indexer-detailed.test.js +176 -0
  90. package/test/integration.test.js +148 -92
  91. package/test/json-worker.test.js +50 -0
  92. package/test/lifecycle.test.js +541 -0
  93. package/test/master.test.js +198 -0
  94. package/test/perfection.test.js +349 -0
  95. package/test/project-detector.test.js +65 -0
  96. package/test/register.test.js +262 -0
  97. package/test/tokenizer.test.js +55 -93
  98. package/test/ultra-maximizer.test.js +116 -0
  99. package/test/utils-branches.test.js +161 -0
  100. package/test/utils-extra.test.js +116 -0
  101. package/test/utils.test.js +131 -0
  102. package/test/verify_fixes.js +76 -0
  103. package/test/worker-errors.test.js +96 -0
  104. package/test/worker-init.test.js +102 -0
  105. package/test/worker_throttling.test.js +93 -0
  106. package/tools/scripts/benchmark-search.js +95 -0
  107. package/tools/scripts/cache-stats.js +71 -0
  108. package/tools/scripts/manual-search.js +34 -0
  109. package/vitest.config.js +19 -9
package/lib/tokenizer.js CHANGED
@@ -1,142 +1,245 @@
1
1
  /**
2
2
  * Token estimation and limits for embedding models
3
3
  *
4
- * This module provides token counting utilities and model-specific limits
5
- * to ensure text chunks don't exceed the model's maximum sequence length.
4
+ * Performance:
5
+ * - O(1) model lookups with precomputed maps
6
+ * - Zero regex / Zero allocations in hot loop
7
+ * - Proper LRU cache eviction
8
+ * - Optimized Unicode whitespace detection (ordered by probability)
9
+ * - Eliminated double toLowerCase() calls
10
+ * - Type-safe guard rails on all public APIs
11
+ * - Branchless special character counting
6
12
  */
7
13
 
8
- /**
9
- * Token limits for supported embedding models
10
- * Each model has its own maximum sequence length
11
- */
12
- export const MODEL_TOKEN_LIMITS = {
13
- // Sentence Transformers / MiniLM family
14
- "Xenova/all-MiniLM-L6-v2": 256,
15
- "Xenova/all-MiniLM-L12-v2": 256,
16
- "Xenova/paraphrase-MiniLM-L6-v2": 128,
17
- "Xenova/paraphrase-MiniLM-L3-v2": 128,
14
+ const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';
15
+
16
+ const MODEL_TOKEN_LIMITS_RAW = {
17
+ 'jinaai/jina-embeddings-v2-base-code': 8192,
18
+ default: 512, // Safe default for BERT-like models
19
+ };
18
20
 
19
- // MPNet models
20
- "Xenova/all-mpnet-base-v2": 384,
21
- "Xenova/paraphrase-mpnet-base-v2": 384,
21
+ export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
22
+ ? { ...MODEL_TOKEN_LIMITS_RAW }
23
+ : Object.freeze({ ...MODEL_TOKEN_LIMITS_RAW });
22
24
 
23
- // Multilingual models
24
- "Xenova/paraphrase-multilingual-MiniLM-L12-v2": 128,
25
- "Xenova/paraphrase-multilingual-mpnet-base-v2": 256,
25
+ const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;
26
26
 
27
- // Code-specific models
28
- "Xenova/codebert-base": 512,
29
- "Xenova/graphcodebert-base": 512,
27
+ /**
28
+ * Precomputed case-insensitive lookup
29
+ */
30
+ const MODEL_LIMITS_LC = new Map();
31
+ for (const [k, v] of Object.entries(MODEL_TOKEN_LIMITS)) {
32
+ MODEL_LIMITS_LC.set(k.toLowerCase(), v);
33
+ }
30
34
 
31
- // E5 models
32
- "Xenova/e5-small-v2": 512,
33
- "Xenova/e5-base-v2": 512,
34
- "Xenova/e5-large-v2": 512,
35
+ /**
36
+ * Internal helper: get model limit from pre-normalized key
37
+ * Avoids double toLowerCase() when called from cache flow
38
+ * @param {string} lowerName - Pre-normalized lowercase model name
39
+ * @param {*} originalName - Original model name (may not be a string)
40
+ * @returns {number} Token limit
41
+ */
42
+ function getModelTokenLimitFromLower(lowerName, originalName) {
43
+ // Fast path: try exact match first (only if original is a string)
44
+ if (typeof originalName === 'string') {
45
+ const direct = MODEL_TOKEN_LIMITS[originalName];
46
+ if (direct !== undefined) return direct;
47
+ }
35
48
 
36
- // BGE models
37
- "Xenova/bge-small-en-v1.5": 512,
38
- "Xenova/bge-base-en-v1.5": 512,
39
- "Xenova/bge-large-en-v1.5": 512,
49
+ // Slow path: use pre-normalized key
50
+ const exact = MODEL_LIMITS_LC.get(lowerName);
51
+ if (exact !== undefined) return exact;
40
52
 
41
- // Default fallback
42
- "default": 256
43
- };
44
-
45
- /**
46
- * Get the maximum token limit for a given model
47
- * Case-insensitive lookup for robustness
48
- * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
49
- * @returns {number} Maximum tokens supported by the model
50
- */
51
- export function getModelTokenLimit(modelName) {
52
- if (!modelName) return MODEL_TOKEN_LIMITS["default"];
53
+ // Heuristics for common high-context models
54
+ if (lowerName.includes('jina') || lowerName.includes('nomic') || lowerName.includes('gte-large')) {
55
+ return 8192;
56
+ }
57
+ if (lowerName.includes('gte-base') || lowerName.includes('gte-small')) {
58
+ return 512;
59
+ }
60
+ if (lowerName.includes('minilm')) {
61
+ return 512;
62
+ }
53
63
 
54
- // Direct match first (fastest)
55
- if (MODEL_TOKEN_LIMITS[modelName] !== undefined) {
56
- return MODEL_TOKEN_LIMITS[modelName];
64
+ return DEFAULT_LIMIT;
57
65
  }
58
66
 
59
- // Case-insensitive search
60
- const normalizedName = modelName.toLowerCase();
61
- for (const [key, value] of Object.entries(MODEL_TOKEN_LIMITS)) {
62
- if (key.toLowerCase() === normalizedName) {
63
- return value;
64
- }
65
- }
67
+ /**
68
+ * Get the maximum token limit for a given model
69
+ * @param {string} modelName - The model name
70
+ * @returns {number} Maximum tokens supported by the model
71
+ */
72
+ export function getModelTokenLimit(modelName) {
73
+ // Guard clause for non-string or empty inputs
74
+ if (typeof modelName !== 'string' || modelName.length === 0) return DEFAULT_LIMIT;
66
75
 
67
- return MODEL_TOKEN_LIMITS["default"];
68
- }
76
+ const direct = MODEL_TOKEN_LIMITS[modelName];
77
+ if (direct !== undefined) return direct;
78
+
79
+ const lower = modelName.toLowerCase();
80
+ return getModelTokenLimitFromLower(lower, modelName);
81
+ }
82
+ /**
83
+ * LRU cache for chunking parameters
84
+ * @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
85
+ */
86
+ const MAX_CACHE_SIZE = 100;
87
+ const chunkingParamsCache = new Map();
69
88
 
70
89
  /**
71
90
  * Get chunking parameters for a model
72
- * Returns target and overlap tokens based on the model's limit
73
91
  * @param {string} modelName - The model name
74
- * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
92
+ * @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
75
93
  */
76
94
  export function getChunkingParams(modelName) {
77
- const maxTokens = getModelTokenLimit(modelName);
78
-
79
- // Target: 85% of max to leave safety buffer
80
- const targetTokens = Math.floor(maxTokens * 0.85);
95
+ const key = (typeof modelName === 'string' && modelName.length)
96
+ ? modelName.toLowerCase()
97
+ : '';
98
+
99
+ // Fast path for invalid inputs: don't consume cache slots
100
+ if (key === '') {
101
+ const maxTokens = DEFAULT_LIMIT;
102
+ const targetTokens = Math.trunc(maxTokens * 0.85);
103
+ const overlapTokens = Math.trunc(targetTokens * 0.18);
104
+ return { maxTokens, targetTokens, overlapTokens };
105
+ }
81
106
 
82
- // Overlap: 15-20% of target for context continuity
83
- const overlapTokens = Math.floor(targetTokens * 0.18);
107
+ // LRU: If hit, delete and re-insert to mark as most recently used
108
+ const cached = chunkingParamsCache.get(key);
109
+ if (cached) {
110
+ chunkingParamsCache.delete(key);
111
+ chunkingParamsCache.set(key, cached);
112
+ return cached;
113
+ }
114
+
115
+ // Cache miss: compute new params (avoid double toLowerCase)
116
+ const maxTokens = getModelTokenLimitFromLower(key, modelName);
117
+ const targetTokens = Math.trunc(maxTokens * 0.85);
118
+ const overlapTokens = Math.trunc(targetTokens * 0.18);
84
119
 
85
- return {
86
- maxTokens,
87
- targetTokens,
88
- overlapTokens
89
- };
120
+ const params = { maxTokens, targetTokens, overlapTokens };
121
+
122
+ // LRU eviction: remove oldest entry if at capacity
123
+ if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
124
+ const oldestKey = chunkingParamsCache.keys().next().value;
125
+ chunkingParamsCache.delete(oldestKey);
126
+ }
127
+
128
+ chunkingParamsCache.set(key, params);
129
+ return params;
130
+ }
131
+
132
+ /**
133
+ * ASCII whitespace lookup table
134
+ */
135
+ const WS = new Uint8Array(128);
136
+ WS[9] = 1; // \t (horizontal tab)
137
+ WS[10] = 1; // \n (line feed)
138
+ WS[11] = 1; // \v (vertical tab)
139
+ WS[12] = 1; // \f (form feed)
140
+ WS[13] = 1; // \r (carriage return)
141
+ WS[32] = 1; // space
142
+
143
+ /**
144
+ * ASCII special character lookup table
145
+ */
146
+ const SPECIAL = new Uint8Array(128);
147
+ const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
148
+ for (let i = 0; i < SPECIAL_CHARS.length; i++) {
149
+ SPECIAL[SPECIAL_CHARS.charCodeAt(i)] = 1;
150
+ }
151
+
152
+ /**
153
+ * Calculate token count for a word of given length
154
+ * This function will be inlined by V8
155
+ * @param {number} len - Word length in characters
156
+ * @returns {number} Estimated token count
157
+ */
158
+ function calcWordTokens(len) {
159
+ if (len <= 4) return 1;
160
+ if (len <= 10) return 2;
161
+ return (len + 3) >> 2; // ceil(len / 4)
90
162
  }
91
163
 
92
164
  /**
93
165
  * Estimate token count for text (conservative estimate for code)
94
- * Uses a simple heuristic: counts words, special characters, and estimates subwords
95
166
  *
96
- * This is conservative - actual tokenizers may produce fewer tokens.
97
- * For most accurate results, use the actual tokenizer, but this is much faster.
167
+ * Performance optimizations:
168
+ * - No regex (pure integer comparisons)
169
+ * - No string allocations (charCodeAt only)
170
+ * - Inlined word token calculation
171
+ * - Unicode checks ordered by frequency
172
+ * - Branchless special character counting
98
173
  *
99
174
  * @param {string} text - The text to estimate tokens for
100
175
  * @returns {number} Estimated token count
101
176
  */
102
177
  export function estimateTokens(text) {
103
- if (!text || text.length === 0) return 0;
104
-
105
- // Count words (split by whitespace)
106
- const words = text.split(/\s+/).filter(w => w.length > 0);
107
-
108
- // Count special characters/punctuation that often become separate tokens
109
- const specialChars = (text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g) || []).length;
110
-
111
- // Estimate: words + special chars + 2 (for [CLS] and [SEP] special tokens)
112
- // For long words, add extra tokens due to subword tokenization
113
- let tokenCount = 2; // [CLS] and [SEP]
114
-
115
- for (const word of words) {
116
- if (word.length <= 4) {
117
- tokenCount += 1;
118
- } else if (word.length <= 10) {
119
- tokenCount += 2;
178
+ // Type-safe guard: prevents crashes from non-string inputs
179
+ if (typeof text !== 'string' || text.length === 0) return 0;
180
+
181
+ const len = text.length;
182
+ let tokenCount = 2; // [CLS] + [SEP]
183
+ let specialCount = 0;
184
+ let wordStart = -1;
185
+
186
+ for (let i = 0; i < len; i++) {
187
+ const code = text.charCodeAt(i);
188
+
189
+ // ASCII fast path (most common for code)
190
+ if (code < 128) {
191
+ if (WS[code]) {
192
+ if (wordStart !== -1) {
193
+ tokenCount += calcWordTokens(i - wordStart);
194
+ wordStart = -1;
195
+ }
196
+ } else {
197
+ // Branchless: add 0 or 1 based on SPECIAL[code]
198
+ specialCount += SPECIAL[code];
199
+ if (wordStart === -1) wordStart = i;
200
+ }
201
+ continue;
202
+ }
203
+
204
+ // Unicode whitespace: ordered by frequency for real-world text
205
+ // Note: Includes legacy 0x180E for tokenization compatibility even though
206
+ // modern JS \s doesn't consider it whitespace (ES2016+)
207
+ const isUnicodeWS =
208
+ code === 0x00a0 || // NBSP (most common)
209
+ code === 0x202f || // NARROW NO-BREAK SPACE
210
+ (code >= 0x2000 && code <= 0x200a) || // EN QUAD..HAIR SPACE
211
+ code === 0x3000 || // IDEOGRAPHIC SPACE (CJK)
212
+ code === 0x2028 || // LINE SEPARATOR
213
+ code === 0x2029 || // PARAGRAPH SEPARATOR
214
+ code === 0x205f || // MEDIUM MATHEMATICAL SPACE
215
+ code === 0x1680 || // OGHAM SPACE MARK
216
+ code === 0x180e || // MONGOLIAN VOWEL SEPARATOR (legacy)
217
+ code === 0x0085 || // NEXT LINE (NEL)
218
+ code === 0xfeff; // ZERO WIDTH NO-BREAK SPACE / BOM
219
+
220
+ if (isUnicodeWS) {
221
+ if (wordStart !== -1) {
222
+ tokenCount += calcWordTokens(i - wordStart);
223
+ wordStart = -1;
224
+ }
120
225
  } else {
121
- // Long words get split into ~4-char subwords
122
- tokenCount += Math.ceil(word.length / 4);
226
+ // Non-ASCII, non-whitespace (e.g., CJK, emojis, accented chars)
227
+ // Conservative estimate: treat each as 1 token
228
+ if (wordStart !== -1) {
229
+ tokenCount += calcWordTokens(i - wordStart);
230
+ wordStart = -1;
231
+ }
232
+ tokenCount++;
123
233
  }
124
234
  }
125
-
126
- // Many special chars merge with adjacent tokens, so count ~50%
127
- tokenCount += Math.floor(specialChars * 0.5);
128
-
129
- return tokenCount;
130
- }
131
235
 
132
- /**
133
- * Check if text exceeds the token limit for a model
134
- * @param {string} text - The text to check
135
- * @param {string} modelName - The model name
136
- * @returns {boolean} True if the text exceeds the limit
137
- */
138
- export function exceedsTokenLimit(text, modelName) {
139
- const limit = getModelTokenLimit(modelName);
140
- const tokens = estimateTokens(text);
141
- return tokens > limit;
236
+ // Flush final word
237
+ if (wordStart !== -1) {
238
+ tokenCount += calcWordTokens(len - wordStart);
239
+ }
240
+
241
+ // Add ~50% of special chars as tokens
242
+ tokenCount += specialCount >> 1;
243
+
244
+ return tokenCount;
142
245
  }