@softerist/heuristic-mcp 3.2.2 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +387 -376
  2. package/config.jsonc +800 -800
  3. package/features/ann-config.js +102 -110
  4. package/features/clear-cache.js +81 -84
  5. package/features/find-similar-code.js +265 -286
  6. package/features/hybrid-search.js +487 -536
  7. package/features/index-codebase.js +3139 -3270
  8. package/features/lifecycle.js +1041 -1063
  9. package/features/package-version.js +277 -291
  10. package/features/register.js +351 -370
  11. package/features/resources.js +115 -130
  12. package/features/set-workspace.js +214 -240
  13. package/index.js +742 -762
  14. package/lib/cache-ops.js +22 -22
  15. package/lib/cache-utils.js +465 -519
  16. package/lib/cache.js +1699 -1767
  17. package/lib/call-graph.js +396 -396
  18. package/lib/cli.js +232 -226
  19. package/lib/config.js +1483 -1495
  20. package/lib/constants.js +511 -492
  21. package/lib/embed-query-process.js +206 -212
  22. package/lib/embedding-process.js +434 -451
  23. package/lib/embedding-worker.js +862 -934
  24. package/lib/ignore-patterns.js +276 -316
  25. package/lib/json-worker.js +14 -14
  26. package/lib/json-writer.js +302 -310
  27. package/lib/logging.js +116 -127
  28. package/lib/memory-logger.js +13 -13
  29. package/lib/onnx-backend.js +188 -193
  30. package/lib/path-utils.js +18 -23
  31. package/lib/project-detector.js +82 -84
  32. package/lib/server-lifecycle.js +133 -145
  33. package/lib/settings-editor.js +738 -739
  34. package/lib/slice-normalize.js +25 -31
  35. package/lib/tokenizer.js +168 -203
  36. package/lib/utils.js +364 -409
  37. package/lib/vector-store-binary.js +811 -591
  38. package/lib/vector-store-sqlite.js +377 -414
  39. package/lib/workspace-env.js +32 -34
  40. package/mcp_config.json +9 -9
  41. package/package.json +86 -86
  42. package/scripts/clear-cache.js +20 -20
  43. package/scripts/download-model.js +43 -43
  44. package/scripts/mcp-launcher.js +49 -49
  45. package/scripts/postinstall.js +12 -12
  46. package/search-configs.js +36 -36
package/lib/utils.js CHANGED
@@ -1,409 +1,364 @@
1
- import crypto from 'crypto';
2
- import path from 'path';
3
- import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
-
5
-
6
- export {
7
- estimateTokens,
8
- getChunkingParams,
9
- getModelTokenLimit,
10
- MODEL_TOKEN_LIMITS,
11
- } from './tokenizer.js';
12
-
13
-
14
- import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
15
-
16
-
17
- export function dotSimilarity(a, b) {
18
- if (!a || !b) {
19
- throw new Error(
20
- 'dotSimilarity requires two non-null vectors. ' +
21
- 'This may indicate a missing embedding or corrupted cache entry.'
22
- );
23
- }
24
- if (a.length !== b.length) {
25
- throw new Error(
26
- `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
27
- 'This may indicate an embedding dimension configuration change. Consider reindexing.'
28
- );
29
- }
30
- let dot = 0;
31
- let i = 0;
32
- const len = a.length;
33
- const m = len % 4;
34
-
35
- while (i < m) {
36
- dot += a[i] * b[i];
37
- i++;
38
- }
39
-
40
- while (i < len) {
41
- dot += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
42
- i += 4;
43
- }
44
-
45
- return dot;
46
- }
47
-
48
-
49
- export function hashContent(content) {
50
- return crypto.createHash('md5').update(content).digest('hex');
51
- }
52
-
53
-
54
- const patterns = {
55
-
56
- js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
57
- jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
58
- ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
59
- tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
60
- mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
61
- cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
62
-
63
-
64
- py: /^(class|def|async\s+def)\s+\w+/,
65
- pyw: /^(class|def|async\s+def)\s+\w+/,
66
- pyx: /^(cdef|cpdef|def|class)\s+\w+/,
67
-
68
-
69
- java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
70
- kt: /^(class|interface|object|fun|val|var)\s+\w+/,
71
- kts: /^(class|interface|object|fun|val|var)\s+\w+/,
72
- scala: /^(class|object|trait|def|val|var)\s+\w+/,
73
-
74
-
75
- c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
76
- cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
77
- cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
78
- cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
79
- h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
80
- hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
81
- hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
82
-
83
-
84
- cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
85
- csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
86
-
87
-
88
- go: /^(func|type|const|var)\s+\w+/,
89
-
90
-
91
- rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
92
-
93
-
94
- php: /^(class|interface|trait|function|const)\s+\w+/,
95
- phtml: /^(<\?php|class|interface|trait|function)\s*/,
96
-
97
-
98
- rb: /^(class|module|def)\s+\w+/,
99
- rake: /^(class|module|def|task|namespace)\s+\w+/,
100
-
101
-
102
- swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
103
-
104
-
105
- r: /^(\w+)\s*(<-|=)\s*function/,
106
- R: /^(\w+)\s*(<-|=)\s*function/,
107
-
108
-
109
- lua: /^(function|local\s+function)\s+\w+/,
110
-
111
-
112
- sh: /^(\w+\s*\(\)|function\s+\w+)/,
113
- bash: /^(\w+\s*\(\)|function\s+\w+)/,
114
- zsh: /^(\w+\s*\(\)|function\s+\w+)/,
115
- fish: /^function\s+\w+/,
116
-
117
-
118
- css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
119
- scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
120
- sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
121
- less: /^(@\w+:|\.|#|@media)\s*/,
122
- styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,
123
-
124
-
125
- html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
126
- htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
127
- xml: /^(<\w+|\s*<!\[CDATA\[)/,
128
- svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
129
-
130
-
131
- json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
132
- yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
133
- yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
134
- toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
135
- ini: /^(\[\w+\]|\w+\s*=)/,
136
- env: /^[A-Z_][A-Z0-9_]*=/,
137
-
138
-
139
- makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
140
- mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
141
-
142
-
143
- dockerfile:
144
- /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,
145
-
146
-
147
- md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
148
- mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
149
- txt: /^.{50,}/,
150
- rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
151
-
152
-
153
- sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
154
-
155
-
156
- pl: /^(sub|package|use|require)\s+\w+/,
157
- pm: /^(sub|package|use|require)\s+\w+/,
158
-
159
-
160
- vim: /^(function|command|autocmd|let\s+g:)\s*/,
161
- };
162
-
163
-
164
- export function smartChunk(content, file, config) {
165
- const lines = content.split('\n');
166
- const chunks = [];
167
- const ext = path.extname(file).toLowerCase();
168
- const base = path.basename(file).toLowerCase();
169
- const SPECIAL_TOKENS = 2;
170
-
171
-
172
- let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
173
- if (config.maxTokens) maxTokens = config.maxTokens;
174
- if (config.targetTokens) targetTokens = config.targetTokens;
175
- if (config.overlapTokens) overlapTokens = config.overlapTokens;
176
-
177
- let langPattern = patterns[ext.slice(1)];
178
- if (!langPattern) {
179
- if (base === 'dockerfile') langPattern = patterns.dockerfile;
180
- else if (base === 'makefile') langPattern = patterns.makefile;
181
- else if (base.startsWith('.env')) langPattern = patterns.env;
182
- }
183
- if (!langPattern || typeof langPattern.test !== 'function') {
184
- langPattern = patterns.js;
185
- }
186
- let currentChunk = [];
187
- let chunkStartLine = 0;
188
- let lineTokenCounts = [];
189
-
190
- let currentTokenCount = 0;
191
-
192
-
193
- let bracketDepth = 0;
194
- let braceDepth = 0;
195
- let parenDepth = 0;
196
- let inString = false;
197
- let inComment = false;
198
- let stringChar = null;
199
-
200
- const splitOversizedLine = (line, lineTokens) => {
201
- const charsPerToken = line.length / Math.max(1, lineTokens);
202
- const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens));
203
- const segments = [];
204
-
205
- for (let start = 0; start < line.length; start += segmentSize) {
206
- segments.push(line.slice(start, start + segmentSize));
207
- }
208
-
209
- return segments;
210
- };
211
-
212
- for (let i = 0; i < lines.length; i++) {
213
- const line = lines[i];
214
- const lineTokens = estimateTokens(line, { includeSpecialTokens: false });
215
-
216
- let j = 0;
217
-
218
-
219
- if (inComment) {
220
-
221
- const endIdx = line.indexOf('*/');
222
- if (endIdx !== -1) {
223
- inComment = false;
224
- j = endIdx + 2;
225
- } else {
226
-
227
- j = line.length;
228
- }
229
- }
230
-
231
- const scanLine = j < line.length ? line.slice(j) : '';
232
- const trimmed = scanLine.trim();
233
-
234
- for (; j < line.length; j++) {
235
- const char = line[j];
236
- const nextChar = line[j + 1];
237
-
238
- if (inString) {
239
- if (char === '\\') {
240
- j++;
241
- } else if (char === stringChar) {
242
- inString = false;
243
- stringChar = null;
244
- }
245
- } else {
246
-
247
- if (char === '/' && nextChar === '*') {
248
- inComment = true;
249
- j++;
250
-
251
- const endIdx = line.indexOf('*/', j);
252
- if (endIdx !== -1) {
253
- inComment = false;
254
- j = endIdx + 1;
255
- } else {
256
- break;
257
- }
258
- } else if (char === '/' && nextChar === '/') {
259
- break;
260
- } else if (char === "'" || char === '"' || char === '`') {
261
- inString = true;
262
- stringChar = char;
263
- } else {
264
-
265
- if (char === '{') braceDepth++;
266
- else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
267
- else if (char === '[') bracketDepth++;
268
- else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
269
- else if (char === '(') parenDepth++;
270
- else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
271
- }
272
- }
273
- }
274
-
275
-
276
- if (lineTokens + SPECIAL_TOKENS > maxTokens) {
277
- if (currentChunk.length > 0) {
278
- const chunkText = currentChunk.join('\n');
279
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
280
- chunks.push({
281
- text: chunkText,
282
- startLine: chunkStartLine + 1,
283
- endLine: i,
284
- tokenCount: currentTokenCount + SPECIAL_TOKENS,
285
- });
286
- }
287
- }
288
-
289
- const parts = splitOversizedLine(line, lineTokens);
290
- for (const part of parts) {
291
- if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
292
- const partTokens = estimateTokens(part, { includeSpecialTokens: false });
293
- chunks.push({
294
- text: part,
295
- startLine: i + 1,
296
- endLine: i + 1,
297
- tokenCount: partTokens + SPECIAL_TOKENS,
298
- });
299
- }
300
-
301
- currentChunk = [];
302
- lineTokenCounts = [];
303
- currentTokenCount = 0;
304
- chunkStartLine = i + 1;
305
- continue;
306
- }
307
-
308
-
309
- const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
310
- const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;
311
-
312
-
313
- const matchesPattern = langPattern.test(trimmed);
314
- const atTopLevel =
315
- braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
316
- const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
317
- const isEmptyLine = trimmed.length === 0;
318
- const prevWasEmpty =
319
- i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
320
- const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);
321
-
322
- const isGoodSplitPoint =
323
- currentChunk.length > 3 &&
324
- ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
325
- (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
326
- (prevWasEmpty && (matchesPattern || isCommentStart)));
327
-
328
- const shouldSplit =
329
- wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);
330
-
331
-
332
- const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;
333
-
334
- if (shouldSplit && safeToSplit && currentChunk.length > 0) {
335
- const chunkText = currentChunk.join('\n');
336
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
337
- chunks.push({
338
- text: chunkText,
339
- startLine: chunkStartLine + 1,
340
- endLine: i,
341
- tokenCount: currentTokenCount,
342
- });
343
- }
344
-
345
- let overlapLines = [];
346
- let overlapTokensCount = 0;
347
- let overlapStartOffset = 0;
348
- const MAX_OVERLAP_ITERATIONS = 50;
349
- let overlapIterations = 0;
350
- for (
351
- let k = currentChunk.length - 1;
352
- k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
353
- k--
354
- ) {
355
- overlapIterations++;
356
-
357
- const lineT = lineTokenCounts[k] ?? 0;
358
-
359
- if (lineT <= 0) {
360
-
361
-
362
- if (overlapLines.length < 10 && overlapStartOffset < 20) {
363
- overlapLines.unshift(currentChunk[k]);
364
- overlapStartOffset++;
365
- }
366
- continue;
367
- }
368
- if (overlapTokensCount + lineT <= overlapTokens) {
369
- overlapLines.unshift(currentChunk[k]);
370
- overlapTokensCount += lineT;
371
- overlapStartOffset++;
372
- } else {
373
- break;
374
- }
375
- }
376
-
377
- currentChunk = overlapLines;
378
-
379
- lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
380
- currentTokenCount = overlapTokensCount;
381
-
382
-
383
-
384
- chunkStartLine = Math.max(0, i - overlapStartOffset);
385
- }
386
-
387
- currentChunk.push(line);
388
- lineTokenCounts.push(lineTokens);
389
- currentTokenCount += lineTokens;
390
-
391
- if (chunks.length >= (config.maxChunksPerFile || 1000)) {
392
-
393
- break;
394
- }
395
- }
396
-
397
-
398
- const chunkText = currentChunk.join('\n');
399
- if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
400
- chunks.push({
401
- text: chunkText,
402
- startLine: chunkStartLine + 1,
403
- endLine: lines.length,
404
- tokenCount: currentTokenCount + SPECIAL_TOKENS,
405
- });
406
- }
407
-
408
- return chunks;
409
- }
1
+ import crypto from 'crypto';
2
+ import path from 'path';
3
+ import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
+
5
+ export {
6
+ estimateTokens,
7
+ getChunkingParams,
8
+ getModelTokenLimit,
9
+ MODEL_TOKEN_LIMITS,
10
+ } from './tokenizer.js';
11
+
12
+ import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
13
+
14
+ export function dotSimilarity(a, b) {
15
+ if (!a || !b) {
16
+ throw new Error(
17
+ 'dotSimilarity requires two non-null vectors. ' +
18
+ 'This may indicate a missing embedding or corrupted cache entry.'
19
+ );
20
+ }
21
+ if (a.length !== b.length) {
22
+ throw new Error(
23
+ `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
24
+ 'This may indicate an embedding dimension configuration change. Consider reindexing.'
25
+ );
26
+ }
27
+ let dot = 0;
28
+ let i = 0;
29
+ const len = a.length;
30
+ const m = len % 4;
31
+
32
+ while (i < m) {
33
+ dot += a[i] * b[i];
34
+ i++;
35
+ }
36
+
37
+ while (i < len) {
38
+ dot += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
39
+ i += 4;
40
+ }
41
+
42
+ return dot;
43
+ }
44
+
45
/**
 * MD5 hex digest of a string or buffer, used as a cheap content
 * fingerprint for cache keys/invalidation (not a security hash).
 *
 * @param {string|Buffer} content - Raw file or chunk content.
 * @returns {string} 32-character lowercase hex MD5 digest.
 */
export function hashContent(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
48
+
49
// Per-extension "declaration start" heuristics used by smartChunk to pick
// good chunk boundaries. Keyed by lowercase file extension (without the dot);
// each regex is tested against a trimmed line and matches lines that begin a
// top-level construct (function/class/section/rule) for that language.
const patterns = {
  // JavaScript / TypeScript family
  js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,

  // Python (pyx covers Cython's cdef/cpdef)
  py: /^(class|def|async\s+def)\s+\w+/,
  pyw: /^(class|def|async\s+def)\s+\w+/,
  pyx: /^(cdef|cpdef|def|class)\s+\w+/,

  // JVM languages
  java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
  kt: /^(class|interface|object|fun|val|var)\s+\w+/,
  kts: /^(class|interface|object|fun|val|var)\s+\w+/,
  scala: /^(class|object|trait|def|val|var)\s+\w+/,

  // C / C++ (headers share the C++ pattern)
  c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
  cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

  // C#
  cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
  csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

  // Go
  go: /^(func|type|const|var)\s+\w+/,

  // Rust
  rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

  // PHP
  php: /^(class|interface|trait|function|const)\s+\w+/,
  phtml: /^(<\?php|class|interface|trait|function)\s*/,

  // Ruby (rake adds task/namespace DSL keywords)
  rb: /^(class|module|def)\s+\w+/,
  rake: /^(class|module|def|task|namespace)\s+\w+/,

  // Swift
  swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

  // R (function assignment via <- or =)
  r: /^(\w+)\s*(<-|=)\s*function/,
  R: /^(\w+)\s*(<-|=)\s*function/,

  // Lua
  lua: /^(function|local\s+function)\s+\w+/,

  // Shell scripts (POSIX-style `name()` or `function name`)
  sh: /^(\w+\s*\(\)|function\s+\w+)/,
  bash: /^(\w+\s*\(\)|function\s+\w+)/,
  zsh: /^(\w+\s*\(\)|function\s+\w+)/,
  fish: /^function\s+\w+/,

  // Stylesheets (selectors, at-rules, variables, mixins)
  css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
  scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
  sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
  less: /^(@\w+:|\.|#|@media)\s*/,
  styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,

  // Markup (structural/container elements only, case-insensitive for HTML)
  html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  xml: /^(<\w+|\s*<!\[CDATA\[)/,
  svg: /^(<svg|<g|<path|<defs|<symbol)\b/,

  // Data / config formats (top-level keys, sections, assignments)
  json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
  yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
  ini: /^(\[\w+\]|\w+\s*=)/,
  env: /^[A-Z_][A-Z0-9_]*=/,

  // Makefiles (rule targets; (?!=) excludes := style assignments)
  makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
  mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,

  // Dockerfile instructions (case-insensitive)
  dockerfile:
    /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,

  // Documentation formats (headings, fences, rules; txt splits on long lines)
  md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
  mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
  txt: /^.{50,}/,
  rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,

  // SQL statements (case-insensitive)
  sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,

  // Perl
  pl: /^(sub|package|use|require)\s+\w+/,
  pm: /^(sub|package|use|require)\s+\w+/,

  // Vimscript
  vim: /^(function|command|autocmd|let\s+g:)\s*/,
};
135
+
136
/**
 * Split file content into embedding-sized chunks, preferring language-aware
 * boundaries (declaration starts from `patterns`) over hard token cuts.
 *
 * Maintains a lightweight cross-line scanner (brace/bracket/paren depth,
 * string state, block-comment state — C-style syntax only) so it avoids
 * splitting mid-construct, and carries a token-bounded overlap from each
 * emitted chunk into the next for retrieval continuity.
 *
 * @param {string} content - Full file text.
 * @param {string} file - File path; extension/basename select the boundary pattern.
 * @param {object} config - Chunking config: `embeddingModel` plus optional
 *   `maxTokens`, `targetTokens`, `overlapTokens`, `maxChunksPerFile` overrides.
 * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
 *   Chunks with 1-based inclusive line ranges and estimated token counts.
 */
export function smartChunk(content, file, config) {
  const lines = content.split('\n');
  const chunks = [];
  const ext = path.extname(file).toLowerCase();
  const base = path.basename(file).toLowerCase();
  // Budget reserved for the model's special tokens (e.g. [CLS]/[SEP]).
  const SPECIAL_TOKENS = 2;

  // Model defaults, individually overridable via config.
  let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
  if (config.maxTokens) maxTokens = config.maxTokens;
  if (config.targetTokens) targetTokens = config.targetTokens;
  if (config.overlapTokens) overlapTokens = config.overlapTokens;

  // Pick boundary pattern by extension, then by well-known basenames,
  // finally falling back to the JS pattern.
  let langPattern = patterns[ext.slice(1)];
  if (!langPattern) {
    if (base === 'dockerfile') langPattern = patterns.dockerfile;
    else if (base === 'makefile') langPattern = patterns.makefile;
    else if (base.startsWith('.env')) langPattern = patterns.env;
  }
  if (!langPattern || typeof langPattern.test !== 'function') {
    langPattern = patterns.js;
  }
  let currentChunk = [];
  let chunkStartLine = 0;
  // Per-line token counts for currentChunk, used to size the overlap.
  let lineTokenCounts = [];

  let currentTokenCount = 0;

  // Scanner state persists across lines (e.g. an unclosed /* comment or
  // an open backtick template string spans into following lines).
  let bracketDepth = 0;
  let braceDepth = 0;
  let parenDepth = 0;
  let inString = false;
  let inComment = false;
  let stringChar = null;

  // Hard-split a single line whose token estimate alone exceeds maxTokens
  // into roughly targetTokens-sized character segments.
  const splitOversizedLine = (line, lineTokens) => {
    const charsPerToken = line.length / Math.max(1, lineTokens);
    const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens));
    const segments = [];

    for (let start = 0; start < line.length; start += segmentSize) {
      segments.push(line.slice(start, start + segmentSize));
    }

    return segments;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineTokens = estimateTokens(line, { includeSpecialTokens: false });

    let j = 0;

    // If a block comment carried over from a previous line, skip to its
    // terminator (or the whole line if it doesn't close here).
    if (inComment) {
      const endIdx = line.indexOf('*/');
      if (endIdx !== -1) {
        inComment = false;
        j = endIdx + 2;
      } else {
        j = line.length;
      }
    }

    // Only the non-comment remainder participates in boundary matching.
    const scanLine = j < line.length ? line.slice(j) : '';
    const trimmed = scanLine.trim();

    // Character scan: track string/comment state and bracket depths.
    for (; j < line.length; j++) {
      const char = line[j];
      const nextChar = line[j + 1];

      if (inString) {
        if (char === '\\') {
          // Skip the escaped character.
          j++;
        } else if (char === stringChar) {
          inString = false;
          stringChar = null;
        }
      } else {
        if (char === '/' && nextChar === '*') {
          inComment = true;
          j++;

          const endIdx = line.indexOf('*/', j);
          if (endIdx !== -1) {
            inComment = false;
            // +1 only: the for-loop's j++ lands us just past the '*/'.
            j = endIdx + 1;
          } else {
            // Comment continues onto following lines.
            break;
          }
        } else if (char === '/' && nextChar === '/') {
          // Line comment: rest of line is irrelevant to the scanner.
          break;
        } else if (char === "'" || char === '"' || char === '`') {
          inString = true;
          stringChar = char;
        } else {
          // Depths are clamped at 0 so stray closers can't go negative.
          if (char === '{') braceDepth++;
          else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
          else if (char === '[') bracketDepth++;
          else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
          else if (char === '(') parenDepth++;
          else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
        }
      }
    }

    // Oversized single line: flush the current chunk, then emit the line
    // itself as one or more hard-split segment chunks.
    if (lineTokens + SPECIAL_TOKENS > maxTokens) {
      if (currentChunk.length > 0) {
        const chunkText = currentChunk.join('\n');
        if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
          chunks.push({
            text: chunkText,
            startLine: chunkStartLine + 1,
            endLine: i,
            tokenCount: currentTokenCount + SPECIAL_TOKENS,
          });
        }
      }

      const parts = splitOversizedLine(line, lineTokens);
      for (const part of parts) {
        if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
        const partTokens = estimateTokens(part, { includeSpecialTokens: false });
        chunks.push({
          text: part,
          startLine: i + 1,
          endLine: i + 1,
          tokenCount: partTokens + SPECIAL_TOKENS,
        });
      }

      // Reset accumulation; no overlap is carried across a hard split.
      currentChunk = [];
      lineTokenCounts = [];
      currentTokenCount = 0;
      chunkStartLine = i + 1;
      continue;
    }

    const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
    const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;

    // Heuristic boundary signals for this line.
    const matchesPattern = langPattern.test(trimmed);
    const atTopLevel =
      braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
    const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
    const isEmptyLine = trimmed.length === 0;
    const prevWasEmpty =
      i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
    // JSDoc openers or "// ===" / "# ---" style separator comments.
    const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);

    const isGoodSplitPoint =
      currentChunk.length > 3 &&
      ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
        (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
        (prevWasEmpty && (matchesPattern || isCommentStart)));

    // Split when forced by the token budget, or opportunistically at a good
    // boundary once the chunk is at least 60% of target size.
    const shouldSplit =
      wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);

    // Avoid splitting deep inside nesting unless the budget forces it.
    const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;

    if (shouldSplit && safeToSplit && currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n');
      if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
        chunks.push({
          text: chunkText,
          startLine: chunkStartLine + 1,
          endLine: i,
          // NOTE(review): unlike every other push, SPECIAL_TOKENS is not
          // added here — confirm whether that asymmetry is intentional.
          tokenCount: currentTokenCount,
        });
      }

      // Build the overlap: walk backwards from the end of the emitted chunk,
      // collecting trailing lines until overlapTokens is reached (bounded by
      // MAX_OVERLAP_ITERATIONS as a safety valve).
      let overlapLines = [];
      let overlapTokensCount = 0;
      let overlapStartOffset = 0;
      const MAX_OVERLAP_ITERATIONS = 50;
      let overlapIterations = 0;
      for (
        let k = currentChunk.length - 1;
        k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
        k--
      ) {
        overlapIterations++;

        const lineT = lineTokenCounts[k] ?? 0;

        // Zero-token (blank/whitespace) lines are carried for context but
        // capped so they can't inflate the overlap indefinitely.
        if (lineT <= 0) {
          if (overlapLines.length < 10 && overlapStartOffset < 20) {
            overlapLines.unshift(currentChunk[k]);
            overlapStartOffset++;
          }
          continue;
        }
        if (overlapTokensCount + lineT <= overlapTokens) {
          overlapLines.unshift(currentChunk[k]);
          overlapTokensCount += lineT;
          overlapStartOffset++;
        } else {
          break;
        }
      }

      // Seed the next chunk with the overlap lines.
      currentChunk = overlapLines;

      lineTokenCounts = overlapLines.map((l) => estimateTokens(l, { includeSpecialTokens: false }));
      currentTokenCount = overlapTokensCount;

      chunkStartLine = Math.max(0, i - overlapStartOffset);
    }

    currentChunk.push(line);
    lineTokenCounts.push(lineTokens);
    currentTokenCount += lineTokens;

    // Hard cap on chunks per file; remaining lines are dropped.
    if (chunks.length >= (config.maxChunksPerFile || 1000)) {
      break;
    }
  }

  // Flush whatever remains after the last boundary.
  const chunkText = currentChunk.join('\n');
  if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
    chunks.push({
      text: chunkText,
      startLine: chunkStartLine + 1,
      endLine: lines.length,
      tokenCount: currentTokenCount + SPECIAL_TOKENS,
    });
  }

  return chunks;
}