@softerist/heuristic-mcp 3.0.15 → 3.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +104 -104
  2. package/config.jsonc +173 -173
  3. package/features/ann-config.js +131 -0
  4. package/features/clear-cache.js +84 -0
  5. package/features/find-similar-code.js +291 -0
  6. package/features/hybrid-search.js +544 -0
  7. package/features/index-codebase.js +3268 -0
  8. package/features/lifecycle.js +1189 -0
  9. package/features/package-version.js +302 -0
  10. package/features/register.js +408 -0
  11. package/features/resources.js +156 -0
  12. package/features/set-workspace.js +265 -0
  13. package/index.js +96 -96
  14. package/lib/cache-ops.js +22 -22
  15. package/lib/cache-utils.js +565 -565
  16. package/lib/cache.js +1870 -1870
  17. package/lib/call-graph.js +396 -396
  18. package/lib/cli.js +1 -1
  19. package/lib/config.js +517 -517
  20. package/lib/constants.js +39 -39
  21. package/lib/embed-query-process.js +7 -7
  22. package/lib/embedding-process.js +7 -7
  23. package/lib/embedding-worker.js +299 -299
  24. package/lib/ignore-patterns.js +316 -316
  25. package/lib/json-worker.js +14 -14
  26. package/lib/json-writer.js +337 -337
  27. package/lib/logging.js +164 -164
  28. package/lib/memory-logger.js +13 -13
  29. package/lib/onnx-backend.js +193 -193
  30. package/lib/project-detector.js +84 -84
  31. package/lib/server-lifecycle.js +165 -165
  32. package/lib/settings-editor.js +754 -754
  33. package/lib/tokenizer.js +256 -256
  34. package/lib/utils.js +428 -428
  35. package/lib/vector-store-binary.js +627 -627
  36. package/lib/vector-store-sqlite.js +95 -95
  37. package/lib/workspace-env.js +28 -28
  38. package/mcp_config.json +9 -9
  39. package/package.json +86 -75
  40. package/scripts/clear-cache.js +20 -0
  41. package/scripts/download-model.js +43 -0
  42. package/scripts/mcp-launcher.js +49 -0
  43. package/scripts/postinstall.js +12 -0
  44. package/search-configs.js +36 -36
  45. package/.prettierrc +0 -7
  46. package/debug-pids.js +0 -30
  47. package/eslint.config.js +0 -36
  48. package/specs/plan.md +0 -23
  49. package/vitest.config.js +0 -39
package/lib/utils.js CHANGED
@@ -1,428 +1,428 @@
1
- import crypto from 'crypto';
2
- import path from 'path';
3
- import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
-
5
- // Re-export tokenizer utilities
6
- export {
7
- estimateTokens,
8
- getChunkingParams,
9
- getModelTokenLimit,
10
- MODEL_TOKEN_LIMITS,
11
- } from './tokenizer.js';
12
-
13
- // Minimum text length for a chunk to be considered valid (avoids tiny fragments)
14
- import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
15
-
16
- /**
17
- * Fast similarity for normalized vectors (dot product).
18
- * Uses loop unrolling for performance on large vectors.
19
- * NOTE: For very large codebases (10k+ chunks), consider WebAssembly SIMD
20
- * for ~2-4x speedup on 768-dim vectors.
21
- * @param {Float32Array} a - First normalized vector
22
- * @param {Float32Array} b - Second normalized vector
23
- * @returns {number} Dot product similarity score (-1 to 1 for normalized vectors)
24
- * @throws {Error} If vectors are null/undefined or have different dimensions
25
- */
26
- export function dotSimilarity(a, b) {
27
- if (!a || !b) {
28
- throw new Error(
29
- 'dotSimilarity requires two non-null vectors. ' +
30
- 'This may indicate a missing embedding or corrupted cache entry.'
31
- );
32
- }
33
- if (a.length !== b.length) {
34
- throw new Error(
35
- `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
36
- 'This may indicate an embedding dimension configuration change. Consider reindexing.'
37
- );
38
- }
39
- let dot = 0;
40
- let i = 0;
41
- const len = a.length;
42
- const m = len % 4;
43
-
44
- while (i < m) {
45
- dot += a[i] * b[i];
46
- i++;
47
- }
48
-
49
- while (i < len) {
50
- dot += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
51
- i += 4;
52
- }
53
-
54
- return dot;
55
- }
56
-
57
- /**
58
- * Generate hash for file content to detect changes
59
- */
60
- export function hashContent(content) {
61
- return crypto.createHash('md5').update(content).digest('hex');
62
- }
63
-
64
- // Language-specific patterns for function/class detection
65
- const patterns = {
66
- // JavaScript/TypeScript
67
- js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
68
- jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
69
- ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
70
- tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
71
- mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
72
- cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
73
-
74
- // Python
75
- py: /^(class|def|async\s+def)\s+\w+/,
76
- pyw: /^(class|def|async\s+def)\s+\w+/,
77
- pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
78
-
79
- // Java/Kotlin/Scala
80
- java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
81
- kt: /^(class|interface|object|fun|val|var)\s+\w+/,
82
- kts: /^(class|interface|object|fun|val|var)\s+\w+/,
83
- scala: /^(class|object|trait|def|val|var)\s+\w+/,
84
-
85
- // C/C++
86
- c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
87
- cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
88
- cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
89
- cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
90
- h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
91
- hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
92
- hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
93
-
94
- // C#
95
- cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
96
- csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
97
-
98
- // Go
99
- go: /^(func|type|const|var)\s+\w+/,
100
-
101
- // Rust
102
- rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
103
-
104
- // PHP
105
- php: /^(class|interface|trait|function|const)\s+\w+/,
106
- phtml: /^(<\?php|class|interface|trait|function)\s*/,
107
-
108
- // Ruby
109
- rb: /^(class|module|def)\s+\w+/,
110
- rake: /^(class|module|def|task|namespace)\s+\w+/,
111
-
112
- // Swift
113
- swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
114
-
115
- // R
116
- r: /^(\w+)\s*(<-|=)\s*function/,
117
- R: /^(\w+)\s*(<-|=)\s*function/,
118
-
119
- // Lua
120
- lua: /^(function|local\s+function)\s+\w+/,
121
-
122
- // Shell scripts
123
- sh: /^(\w+\s*\(\)|function\s+\w+)/,
124
- bash: /^(\w+\s*\(\)|function\s+\w+)/,
125
- zsh: /^(\w+\s*\(\)|function\s+\w+)/,
126
- fish: /^function\s+\w+/,
127
-
128
- // CSS/Styles
129
- css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
130
- scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
131
- sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
132
- less: /^(@\w+:|\.|#|@media)\s*/,
133
- styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,
134
-
135
- // Markup/HTML
136
- html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
137
- htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
138
- xml: /^(<\w+|\s*<!\[CDATA\[)/,
139
- svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
140
-
141
- // Config files
142
- json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
143
- yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
144
- yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
145
- toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
146
- ini: /^(\[\w+\]|\w+\s*=)/,
147
- env: /^[A-Z_][A-Z0-9_]*=/,
148
-
149
- // Makefile
150
- makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
151
- mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
152
-
153
- // Docker
154
- dockerfile:
155
- /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,
156
-
157
- // Documentation
158
- md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
159
- mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
160
- txt: /^.{50,}/, // Split on long paragraphs
161
- rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
162
-
163
- // Database
164
- sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
165
-
166
- // Perl
167
- pl: /^(sub|package|use|require)\s+\w+/,
168
- pm: /^(sub|package|use|require)\s+\w+/,
169
-
170
- // Vim
171
- vim: /^(function|command|autocmd|let\s+g:)\s*/,
172
- };
173
-
174
- /**
175
- * Intelligent chunking with token limit awareness
176
- * Tries to split by function/class boundaries while respecting token limits
177
- *
178
- * @param {string} content - File content to chunk
179
- * @param {string} file - File path (for language detection)
180
- * @param {object} config - Configuration object with embeddingModel
181
- * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
182
- */
183
- export function smartChunk(content, file, config) {
184
- const lines = content.split('\n');
185
- const chunks = [];
186
- const ext = path.extname(file).toLowerCase();
187
- const base = path.basename(file).toLowerCase();
188
- const SPECIAL_TOKENS = 2; // [CLS] + [SEP] accounted once per chunk
189
-
190
- // Get model-specific chunking parameters with optional user overrides
191
- let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
192
- if (config.maxTokens) maxTokens = config.maxTokens;
193
- if (config.targetTokens) targetTokens = config.targetTokens;
194
- if (config.overlapTokens) overlapTokens = config.overlapTokens;
195
-
196
- let langPattern = patterns[ext.slice(1)];
197
- if (!langPattern) {
198
- if (base === 'dockerfile') langPattern = patterns.dockerfile;
199
- else if (base === 'makefile') langPattern = patterns.makefile;
200
- else if (base.startsWith('.env')) langPattern = patterns.env;
201
- }
202
- if (!langPattern || typeof langPattern.test !== 'function') {
203
- langPattern = patterns.js; // Default fallback
204
- }
205
- let currentChunk = [];
206
- let chunkStartLine = 0;
207
- let lineTokenCounts = []; // Cache token counts for overlap calculation
208
-
209
- let currentTokenCount = 0;
210
-
211
- // Track bracket depth for better boundary detection
212
- let bracketDepth = 0;
213
- let braceDepth = 0;
214
- let parenDepth = 0;
215
- let inString = false;
216
- let inComment = false;
217
- let stringChar = null; // ' or " or `
218
-
219
- const splitOversizedLine = (line, lineTokens) => {
220
- const charsPerToken = line.length / Math.max(1, lineTokens);
221
- const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
222
- const segments = [];
223
-
224
- for (let start = 0; start < line.length; start += segmentSize) {
225
- segments.push(line.slice(start, start + segmentSize));
226
- }
227
-
228
- return segments;
229
- };
230
-
231
- for (let i = 0; i < lines.length; i++) {
232
- const line = lines[i];
233
- const lineTokens = estimateTokens(line, { includeSpecialTokens: false });
234
-
235
- let j = 0;
236
-
237
- // Simple state tracking for heuristics (not a full parser)
238
- if (inComment) {
239
- // Look for end of block comment
240
- const endIdx = line.indexOf('*/');
241
- if (endIdx !== -1) {
242
- inComment = false;
243
- j = endIdx + 2;
244
- } else {
245
- // Skip whole line
246
- j = line.length;
247
- }
248
- }
249
-
250
- const scanLine = j < line.length ? line.slice(j) : '';
251
- const trimmed = scanLine.trim();
252
-
253
- for (; j < line.length; j++) {
254
- const char = line[j];
255
- const nextChar = line[j + 1];
256
-
257
- if (inString) {
258
- if (char === '\\') {
259
- j++; // Skip escaped char
260
- } else if (char === stringChar) {
261
- inString = false;
262
- stringChar = null;
263
- }
264
- } else {
265
- // Check for comment start
266
- if (char === '/' && nextChar === '*') {
267
- inComment = true;
268
- j++;
269
- // Check if it ends on same line
270
- const endIdx = line.indexOf('*/', j);
271
- if (endIdx !== -1) {
272
- inComment = false;
273
- j = endIdx + 1;
274
- } else {
275
- break; // Rest of line is comment
276
- }
277
- } else if (char === '/' && nextChar === '/') {
278
- break; // Skip rest of line (line comment)
279
- } else if (char === "'" || char === '"' || char === '`') {
280
- inString = true;
281
- stringChar = char;
282
- } else {
283
- // Only count brackets if not in string or comment
284
- if (char === '{') braceDepth++;
285
- else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
286
- else if (char === '[') bracketDepth++;
287
- else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
288
- else if (char === '(') parenDepth++;
289
- else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
290
- }
291
- }
292
- }
293
-
294
- // Split lines that are too large to ever fit in a single chunk
295
- if (lineTokens + SPECIAL_TOKENS > maxTokens) {
296
- if (currentChunk.length > 0) {
297
- const chunkText = currentChunk.join('\n');
298
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
299
- chunks.push({
300
- text: chunkText,
301
- startLine: chunkStartLine + 1,
302
- endLine: i,
303
- tokenCount: currentTokenCount + SPECIAL_TOKENS,
304
- });
305
- }
306
- }
307
-
308
- const parts = splitOversizedLine(line, lineTokens);
309
- for (const part of parts) {
310
- if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
311
- const partTokens = estimateTokens(part, { includeSpecialTokens: false });
312
- chunks.push({
313
- text: part,
314
- startLine: i + 1,
315
- endLine: i + 1,
316
- tokenCount: partTokens + SPECIAL_TOKENS,
317
- });
318
- }
319
-
320
- currentChunk = [];
321
- lineTokenCounts = [];
322
- currentTokenCount = 0;
323
- chunkStartLine = i + 1;
324
- continue;
325
- }
326
-
327
- // Check if adding this line would exceed token limit
328
- const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
329
- const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;
330
-
331
- // Check if this is a good split point using multiple heuristics
332
- const matchesPattern = langPattern.test(trimmed);
333
- const atTopLevel =
334
- braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
335
- const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
336
- const isEmptyLine = trimmed.length === 0;
337
- const prevWasEmpty =
338
- i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
339
- const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);
340
-
341
- const isGoodSplitPoint =
342
- currentChunk.length > 3 &&
343
- ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
344
- (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
345
- (prevWasEmpty && (matchesPattern || isCommentStart)));
346
-
347
- const shouldSplit =
348
- wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);
349
-
350
- // Avoid splitting in weird states if possible
351
- const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;
352
-
353
- if (shouldSplit && safeToSplit && currentChunk.length > 0) {
354
- const chunkText = currentChunk.join('\n');
355
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
356
- chunks.push({
357
- text: chunkText,
358
- startLine: chunkStartLine + 1,
359
- endLine: i,
360
- tokenCount: currentTokenCount,
361
- });
362
- }
363
-
364
- let overlapLines = [];
365
- let overlapTokensCount = 0;
366
- let overlapStartOffset = 0; // Track how many lines back we went
367
- const MAX_OVERLAP_ITERATIONS = 50; // Absolute limit to prevent unbounded loops
368
- let overlapIterations = 0;
369
- for (
370
- let k = currentChunk.length - 1;
371
- k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
372
- k--
373
- ) {
374
- overlapIterations++;
375
- // Use cached token count instead of re-estimating
376
- const lineT = lineTokenCounts[k] ?? 0;
377
- // Guard against infinite loops: if lineT is 0, count the line but don't loop forever
378
- if (lineT <= 0) {
379
- // Include zero-token lines (e.g., empty lines) but limit to prevent infinite spin
380
- // Also guard with overlapStartOffset < 20 to prevent excessive lines even if under 10 in overlapLines
381
- if (overlapLines.length < 10 && overlapStartOffset < 20) {
382
- overlapLines.unshift(currentChunk[k]);
383
- overlapStartOffset++;
384
- }
385
- continue;
386
- }
387
- if (overlapTokensCount + lineT <= overlapTokens) {
388
- overlapLines.unshift(currentChunk[k]);
389
- overlapTokensCount += lineT;
390
- overlapStartOffset++;
391
- } else {
392
- break;
393
- }
394
- }
395
-
396
- currentChunk = overlapLines;
397
- // Rebuild lineTokenCounts for the overlap lines
398
- lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
399
- currentTokenCount = overlapTokensCount;
400
- // The new chunk starts from where the overlap begins in the original file
401
- // i is the current line we're about to process, overlap lines are from before
402
- // Ensure non-negative to handle edge cases where overlapStartOffset > i
403
- chunkStartLine = Math.max(0, i - overlapStartOffset);
404
- }
405
-
406
- currentChunk.push(line);
407
- lineTokenCounts.push(lineTokens);
408
- currentTokenCount += lineTokens;
409
-
410
- if (chunks.length >= (config.maxChunksPerFile || 1000)) {
411
- // Hard limit to prevent memory explosion on minified/data files
412
- break;
413
- }
414
- }
415
-
416
- // Add remaining chunk
417
- const chunkText = currentChunk.join('\n');
418
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
419
- chunks.push({
420
- text: chunkText,
421
- startLine: chunkStartLine + 1,
422
- endLine: lines.length,
423
- tokenCount: currentTokenCount + SPECIAL_TOKENS,
424
- });
425
- }
426
-
427
- return chunks;
428
- }
1
+ import crypto from 'crypto';
2
+ import path from 'path';
3
+ import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
+
5
+ // Re-export tokenizer utilities
6
+ export {
7
+ estimateTokens,
8
+ getChunkingParams,
9
+ getModelTokenLimit,
10
+ MODEL_TOKEN_LIMITS,
11
+ } from './tokenizer.js';
12
+
13
+ // Minimum text length for a chunk to be considered valid (avoids tiny fragments)
14
+ import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
15
+
16
/**
 * Fast similarity for normalized vectors (dot product).
 * Uses 4-wide loop unrolling for performance on large vectors.
 * NOTE: For very large codebases (10k+ chunks), consider WebAssembly SIMD
 * for ~2-4x speedup on 768-dim vectors.
 * @param {Float32Array} a - First normalized vector
 * @param {Float32Array} b - Second normalized vector
 * @returns {number} Dot product similarity score (-1 to 1 for normalized vectors)
 * @throws {Error} If vectors are null/undefined or have different dimensions
 */
export function dotSimilarity(a, b) {
  if (!a || !b) {
    throw new Error(
      'dotSimilarity requires two non-null vectors. ' +
        'This may indicate a missing embedding or corrupted cache entry.'
    );
  }
  if (a.length !== b.length) {
    throw new Error(
      `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
        'This may indicate an embedding dimension configuration change. Consider reindexing.'
    );
  }

  const len = a.length;
  const tail = len % 4;
  let acc = 0;
  let idx = 0;

  // Consume the leading remainder one element at a time so the main loop
  // can step in strides of exactly four (len - tail is divisible by 4).
  for (; idx < tail; idx++) {
    acc += a[idx] * b[idx];
  }

  // Unrolled main loop: four multiply-adds per iteration, same summation
  // order as a scalar loop so floating-point results are unchanged.
  for (; idx < len; idx += 4) {
    acc += a[idx] * b[idx] + a[idx + 1] * b[idx + 1] + a[idx + 2] * b[idx + 2] + a[idx + 3] * b[idx + 3];
  }

  return acc;
}
56
+
57
/**
 * Generate hash for file content to detect changes.
 * MD5 is used purely as a fast change-detection fingerprint here
 * (cache invalidation), not for any security purpose.
 * @param {string|Buffer} content - File content to fingerprint
 * @returns {string} 32-character lowercase hex MD5 digest
 */
export function hashContent(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
63
+
64
// Language-specific patterns for function/class detection.
// Keyed by lowercase file extension (without the dot); each regex matches
// the start of a line that likely opens a new top-level definition.
const patterns = {
  // JavaScript family — ESM/CJS variants share the same declaration forms
  js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,

  // Python — pyx additionally covers Cython's cdef/cpdef
  py: /^(class|def|async\s+def)\s+\w+/,
  pyw: /^(class|def|async\s+def)\s+\w+/,
  pyx: /^(cdef|cpdef|def|class)\s+\w+/,

  // JVM languages
  java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
  kt: /^(class|interface|object|fun|val|var)\s+\w+/,
  kts: /^(class|interface|object|fun|val|var)\s+\w+/,
  scala: /^(class|object|trait|def|val|var)\s+\w+/,

  // C and C++ — header extensions reuse the C++ pattern
  c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
  cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

  // C#
  cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
  csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

  // Go
  go: /^(func|type|const|var)\s+\w+/,

  // Rust
  rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

  // PHP
  php: /^(class|interface|trait|function|const)\s+\w+/,
  phtml: /^(<\?php|class|interface|trait|function)\s*/,

  // Ruby
  rb: /^(class|module|def)\s+\w+/,
  rake: /^(class|module|def|task|namespace)\s+\w+/,

  // Swift
  swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

  // R — both `<-` and `=` function assignment styles
  r: /^(\w+)\s*(<-|=)\s*function/,
  R: /^(\w+)\s*(<-|=)\s*function/,

  // Lua
  lua: /^(function|local\s+function)\s+\w+/,

  // Shell scripts — `name()` or `function name` definitions
  sh: /^(\w+\s*\(\)|function\s+\w+)/,
  bash: /^(\w+\s*\(\)|function\s+\w+)/,
  zsh: /^(\w+\s*\(\)|function\s+\w+)/,
  fish: /^function\s+\w+/,

  // Stylesheets
  css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
  scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
  sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
  less: /^(@\w+:|\.|#|@media)\s*/,
  styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,

  // Markup / HTML
  html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  xml: /^(<\w+|\s*<!\[CDATA\[)/,
  svg: /^(<svg|<g|<path|<defs|<symbol)\b/,

  // Config formats
  json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
  yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
  ini: /^(\[\w+\]|\w+\s*=)/,
  env: /^[A-Z_][A-Z0-9_]*=/,

  // Makefiles — rule targets, excluding `:=` variable assignments
  makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
  mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,

  // Dockerfile instructions (case-insensitive)
  dockerfile:
    /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,

  // Documentation formats
  md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
  mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
  txt: /^.{50,}/, // Split on long paragraphs
  rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,

  // SQL (case-insensitive statement keywords)
  sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,

  // Perl
  pl: /^(sub|package|use|require)\s+\w+/,
  pm: /^(sub|package|use|require)\s+\w+/,

  // Vimscript
  vim: /^(function|command|autocmd|let\s+g:)\s*/,
};
173
+
174
/**
 * Intelligent chunking with token limit awareness.
 * Tries to split by function/class boundaries while respecting token limits.
 *
 * Fix: chunks emitted at heuristic split points now include the
 * SPECIAL_TOKENS ([CLS]/[SEP]) allowance in `tokenCount`, matching every
 * other chunk this function emits (previously underreported by 2).
 *
 * @param {string} content - File content to chunk
 * @param {string} file - File path (for language detection)
 * @param {object} config - Configuration object with embeddingModel
 * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
 */
export function smartChunk(content, file, config) {
  const lines = content.split('\n');
  const chunks = [];
  const ext = path.extname(file).toLowerCase();
  const base = path.basename(file).toLowerCase();
  const SPECIAL_TOKENS = 2; // [CLS] + [SEP] accounted once per chunk

  // Get model-specific chunking parameters with optional user overrides
  let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
  if (config.maxTokens) maxTokens = config.maxTokens;
  if (config.targetTokens) targetTokens = config.targetTokens;
  if (config.overlapTokens) overlapTokens = config.overlapTokens;

  // Pick a boundary-detection pattern by extension, then by well-known basenames
  let langPattern = patterns[ext.slice(1)];
  if (!langPattern) {
    if (base === 'dockerfile') langPattern = patterns.dockerfile;
    else if (base === 'makefile') langPattern = patterns.makefile;
    else if (base.startsWith('.env')) langPattern = patterns.env;
  }
  if (!langPattern || typeof langPattern.test !== 'function') {
    langPattern = patterns.js; // Default fallback
  }
  let currentChunk = [];
  let chunkStartLine = 0;
  let lineTokenCounts = []; // Cache token counts for overlap calculation

  let currentTokenCount = 0;

  // Track bracket depth for better boundary detection
  let bracketDepth = 0;
  let braceDepth = 0;
  let parenDepth = 0;
  let inString = false;
  let inComment = false;
  let stringChar = null; // ' or " or `

  // Break a single line that exceeds maxTokens into ~targetTokens-sized
  // character segments, scaled by the line's observed chars-per-token ratio.
  const splitOversizedLine = (line, lineTokens) => {
    const charsPerToken = line.length / Math.max(1, lineTokens);
    const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
    const segments = [];

    for (let start = 0; start < line.length; start += segmentSize) {
      segments.push(line.slice(start, start + segmentSize));
    }

    return segments;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineTokens = estimateTokens(line, { includeSpecialTokens: false });

    let j = 0;

    // Simple state tracking for heuristics (not a full parser)
    if (inComment) {
      // Look for end of block comment
      const endIdx = line.indexOf('*/');
      if (endIdx !== -1) {
        inComment = false;
        j = endIdx + 2;
      } else {
        // Skip whole line
        j = line.length;
      }
    }

    const scanLine = j < line.length ? line.slice(j) : '';
    const trimmed = scanLine.trim();

    for (; j < line.length; j++) {
      const char = line[j];
      const nextChar = line[j + 1];

      if (inString) {
        if (char === '\\') {
          j++; // Skip escaped char
        } else if (char === stringChar) {
          inString = false;
          stringChar = null;
        }
      } else {
        // Check for comment start
        if (char === '/' && nextChar === '*') {
          inComment = true;
          j++;
          // Check if it ends on same line
          const endIdx = line.indexOf('*/', j);
          if (endIdx !== -1) {
            inComment = false;
            j = endIdx + 1;
          } else {
            break; // Rest of line is comment
          }
        } else if (char === '/' && nextChar === '/') {
          break; // Skip rest of line (line comment)
        } else if (char === "'" || char === '"' || char === '`') {
          inString = true;
          stringChar = char;
        } else {
          // Only count brackets if not in string or comment
          if (char === '{') braceDepth++;
          else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
          else if (char === '[') bracketDepth++;
          else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
          else if (char === '(') parenDepth++;
          else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
        }
      }
    }

    // Split lines that are too large to ever fit in a single chunk
    if (lineTokens + SPECIAL_TOKENS > maxTokens) {
      if (currentChunk.length > 0) {
        const chunkText = currentChunk.join('\n');
        if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
          chunks.push({
            text: chunkText,
            startLine: chunkStartLine + 1,
            endLine: i,
            tokenCount: currentTokenCount + SPECIAL_TOKENS,
          });
        }
      }

      const parts = splitOversizedLine(line, lineTokens);
      for (const part of parts) {
        if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
        const partTokens = estimateTokens(part, { includeSpecialTokens: false });
        chunks.push({
          text: part,
          startLine: i + 1,
          endLine: i + 1,
          tokenCount: partTokens + SPECIAL_TOKENS,
        });
      }

      currentChunk = [];
      lineTokenCounts = [];
      currentTokenCount = 0;
      chunkStartLine = i + 1;
      continue;
    }

    // Check if adding this line would exceed token limit
    const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
    const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;

    // Check if this is a good split point using multiple heuristics
    const matchesPattern = langPattern.test(trimmed);
    const atTopLevel =
      braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
    const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
    const isEmptyLine = trimmed.length === 0;
    const prevWasEmpty =
      i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
    const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);

    const isGoodSplitPoint =
      currentChunk.length > 3 &&
      ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
        (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
        (prevWasEmpty && (matchesPattern || isCommentStart)));

    const shouldSplit =
      wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);

    // Avoid splitting in weird states if possible
    const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;

    if (shouldSplit && safeToSplit && currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n');
      if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
        chunks.push({
          text: chunkText,
          startLine: chunkStartLine + 1,
          endLine: i,
          // FIX: include the [CLS]/[SEP] allowance here, consistent with
          // every other chunk push in this function.
          tokenCount: currentTokenCount + SPECIAL_TOKENS,
        });
      }

      let overlapLines = [];
      let overlapTokensCount = 0;
      let overlapStartOffset = 0; // Track how many lines back we went
      const MAX_OVERLAP_ITERATIONS = 50; // Absolute limit to prevent unbounded loops
      let overlapIterations = 0;
      for (
        let k = currentChunk.length - 1;
        k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
        k--
      ) {
        overlapIterations++;
        // Use cached token count instead of re-estimating
        const lineT = lineTokenCounts[k] ?? 0;
        // Guard against infinite loops: if lineT is 0, count the line but don't loop forever
        if (lineT <= 0) {
          // Include zero-token lines (e.g., empty lines) but limit to prevent infinite spin
          // Also guard with overlapStartOffset < 20 to prevent excessive lines even if under 10 in overlapLines
          if (overlapLines.length < 10 && overlapStartOffset < 20) {
            overlapLines.unshift(currentChunk[k]);
            overlapStartOffset++;
          }
          continue;
        }
        if (overlapTokensCount + lineT <= overlapTokens) {
          overlapLines.unshift(currentChunk[k]);
          overlapTokensCount += lineT;
          overlapStartOffset++;
        } else {
          break;
        }
      }

      currentChunk = overlapLines;
      // Rebuild lineTokenCounts for the overlap lines
      lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
      currentTokenCount = overlapTokensCount;
      // The new chunk starts from where the overlap begins in the original file
      // i is the current line we're about to process, overlap lines are from before
      // Ensure non-negative to handle edge cases where overlapStartOffset > i
      chunkStartLine = Math.max(0, i - overlapStartOffset);
    }

    currentChunk.push(line);
    lineTokenCounts.push(lineTokens);
    currentTokenCount += lineTokens;

    if (chunks.length >= (config.maxChunksPerFile || 1000)) {
      // Hard limit to prevent memory explosion on minified/data files
      // NOTE(review): the trailing-chunk push below can still emit one chunk
      // past this cap — presumably acceptable; confirm before tightening.
      break;
    }
  }

  // Add remaining chunk
  const chunkText = currentChunk.join('\n');
  if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
    chunks.push({
      text: chunkText,
      startLine: chunkStartLine + 1,
      endLine: lines.length,
      tokenCount: currentTokenCount + SPECIAL_TOKENS,
    });
  }

  return chunks;
}