@softerist/heuristic-mcp 3.0.15 → 3.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -104
- package/config.jsonc +173 -173
- package/features/ann-config.js +131 -0
- package/features/clear-cache.js +84 -0
- package/features/find-similar-code.js +291 -0
- package/features/hybrid-search.js +544 -0
- package/features/index-codebase.js +3268 -0
- package/features/lifecycle.js +1189 -0
- package/features/package-version.js +302 -0
- package/features/register.js +408 -0
- package/features/resources.js +156 -0
- package/features/set-workspace.js +265 -0
- package/index.js +96 -96
- package/lib/cache-ops.js +22 -22
- package/lib/cache-utils.js +565 -565
- package/lib/cache.js +1870 -1870
- package/lib/call-graph.js +396 -396
- package/lib/cli.js +1 -1
- package/lib/config.js +517 -517
- package/lib/constants.js +39 -39
- package/lib/embed-query-process.js +7 -7
- package/lib/embedding-process.js +7 -7
- package/lib/embedding-worker.js +299 -299
- package/lib/ignore-patterns.js +316 -316
- package/lib/json-worker.js +14 -14
- package/lib/json-writer.js +337 -337
- package/lib/logging.js +164 -164
- package/lib/memory-logger.js +13 -13
- package/lib/onnx-backend.js +193 -193
- package/lib/project-detector.js +84 -84
- package/lib/server-lifecycle.js +165 -165
- package/lib/settings-editor.js +754 -754
- package/lib/tokenizer.js +256 -256
- package/lib/utils.js +428 -428
- package/lib/vector-store-binary.js +627 -627
- package/lib/vector-store-sqlite.js +95 -95
- package/lib/workspace-env.js +28 -28
- package/mcp_config.json +9 -9
- package/package.json +86 -75
- package/scripts/clear-cache.js +20 -0
- package/scripts/download-model.js +43 -0
- package/scripts/mcp-launcher.js +49 -0
- package/scripts/postinstall.js +12 -0
- package/search-configs.js +36 -36
- package/.prettierrc +0 -7
- package/debug-pids.js +0 -30
- package/eslint.config.js +0 -36
- package/specs/plan.md +0 -23
- package/vitest.config.js +0 -39
package/lib/utils.js
CHANGED
|
@@ -1,428 +1,428 @@
|
|
|
1
|
-
import crypto from 'crypto';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { estimateTokens, getChunkingParams } from './tokenizer.js';
|
|
4
|
-
|
|
5
|
-
// Re-export tokenizer utilities
|
|
6
|
-
export {
|
|
7
|
-
estimateTokens,
|
|
8
|
-
getChunkingParams,
|
|
9
|
-
getModelTokenLimit,
|
|
10
|
-
MODEL_TOKEN_LIMITS,
|
|
11
|
-
} from './tokenizer.js';
|
|
12
|
-
|
|
13
|
-
// Minimum text length for a chunk to be considered valid (avoids tiny fragments)
|
|
14
|
-
import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
|
|
15
|
-
|
|
16
|
-
/**
 * Fast similarity for normalized vectors (dot product).
 * Uses loop unrolling for performance on large vectors.
 * NOTE: For very large codebases (10k+ chunks), consider WebAssembly SIMD
 * for ~2-4x speedup on 768-dim vectors.
 * @param {Float32Array} a - First normalized vector
 * @param {Float32Array} b - Second normalized vector
 * @returns {number} Dot product similarity score (-1 to 1 for normalized vectors)
 * @throws {Error} If vectors are null/undefined or have different dimensions
 */
export function dotSimilarity(a, b) {
  if (!a || !b) {
    throw new Error(
      'dotSimilarity requires two non-null vectors. ' +
        'This may indicate a missing embedding or corrupted cache entry.'
    );
  }
  if (a.length !== b.length) {
    throw new Error(
      `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
        'This may indicate an embedding dimension configuration change. Consider reindexing.'
    );
  }

  const length = a.length;
  const remainder = length % 4;
  let sum = 0;
  let idx = 0;

  // Consume the leading remainder one element at a time so the unrolled
  // loop below always sees a multiple of four elements.
  for (; idx < remainder; idx++) {
    sum += a[idx] * b[idx];
  }

  // Main loop, unrolled 4x. The grouped addition order matches the
  // element-by-element order closely enough for stable float results.
  for (; idx < length; idx += 4) {
    sum +=
      a[idx] * b[idx] +
      a[idx + 1] * b[idx + 1] +
      a[idx + 2] * b[idx + 2] +
      a[idx + 3] * b[idx + 3];
  }

  return sum;
}
|
|
56
|
-
|
|
57
|
-
/**
 * Generate an MD5 hex digest of file content, used to detect changes
 * between indexing runs. MD5 is used as a fast fingerprint here, not
 * for any security purpose.
 * @param {string|Buffer} content - Raw file content
 * @returns {string} 32-character lowercase hex digest
 */
export function hashContent(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
|
|
63
|
-
|
|
64
|
-
// Language-specific patterns for function/class detection
// Keyed by file extension (lowercased, without the leading dot).
// Each regex matches the START of a line that likely begins a new logical
// unit (function, class, CSS rule, config section, ...). These are
// heuristics for chunk-boundary detection, not full parsers.
const patterns = {
  // JavaScript/TypeScript
  js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,

  // Python
  py: /^(class|def|async\s+def)\s+\w+/,
  pyw: /^(class|def|async\s+def)\s+\w+/,
  pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
  // Java/Kotlin/Scala
  java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
  kt: /^(class|interface|object|fun|val|var)\s+\w+/,
  kts: /^(class|interface|object|fun|val|var)\s+\w+/,
  scala: /^(class|object|trait|def|val|var)\s+\w+/,

  // C/C++
  c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
  cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

  // C#
  cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
  csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

  // Go
  go: /^(func|type|const|var)\s+\w+/,

  // Rust
  rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

  // PHP
  php: /^(class|interface|trait|function|const)\s+\w+/,
  phtml: /^(<\?php|class|interface|trait|function)\s*/,

  // Ruby
  rb: /^(class|module|def)\s+\w+/,
  rake: /^(class|module|def|task|namespace)\s+\w+/,

  // Swift
  swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

  // R
  r: /^(\w+)\s*(<-|=)\s*function/,
  // NOTE(review): smartChunk lowercases the extension before lookup, so this
  // uppercase 'R' key appears unreachable — confirm before removing.
  R: /^(\w+)\s*(<-|=)\s*function/,

  // Lua
  lua: /^(function|local\s+function)\s+\w+/,

  // Shell scripts
  sh: /^(\w+\s*\(\)|function\s+\w+)/,
  bash: /^(\w+\s*\(\)|function\s+\w+)/,
  zsh: /^(\w+\s*\(\)|function\s+\w+)/,
  fish: /^function\s+\w+/,

  // CSS/Styles
  css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
  scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
  sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
  less: /^(@\w+:|\.|#|@media)\s*/,
  styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,

  // Markup/HTML
  html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  xml: /^(<\w+|\s*<!\[CDATA\[)/,
  svg: /^(<svg|<g|<path|<defs|<symbol)\b/,

  // Config files
  json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
  yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
  ini: /^(\[\w+\]|\w+\s*=)/,
  env: /^[A-Z_][A-Z0-9_]*=/,

  // Makefile (used via basename fallback in smartChunk as well as the .mk extension)
  makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
  mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,

  // Docker (used via basename fallback in smartChunk — 'Dockerfile' has no extension)
  dockerfile:
    /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,

  // Documentation
  md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
  mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
  txt: /^.{50,}/, // Split on long paragraphs
  rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,

  // Database
  sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,

  // Perl
  pl: /^(sub|package|use|require)\s+\w+/,
  pm: /^(sub|package|use|require)\s+\w+/,

  // Vim
  vim: /^(function|command|autocmd|let\s+g:)\s*/,
};
|
|
173
|
-
|
|
174
|
-
/**
 * Intelligent chunking with token limit awareness.
 * Tries to split by function/class boundaries while respecting token limits.
 *
 * @param {string} content - File content to chunk
 * @param {string} file - File path (for language detection)
 * @param {object} config - Configuration object with embeddingModel and
 *   optional maxTokens/targetTokens/overlapTokens/maxChunksPerFile overrides
 * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
 */
export function smartChunk(content, file, config) {
  const lines = content.split('\n');
  const chunks = [];
  const ext = path.extname(file).toLowerCase();
  const base = path.basename(file).toLowerCase();
  const SPECIAL_TOKENS = 2; // [CLS] + [SEP] accounted once per chunk

  // Get model-specific chunking parameters with optional user overrides
  let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
  if (config.maxTokens) maxTokens = config.maxTokens;
  if (config.targetTokens) targetTokens = config.targetTokens;
  if (config.overlapTokens) overlapTokens = config.overlapTokens;

  // Pick a boundary-detection regex by extension, falling back to
  // well-known extensionless filenames, then to the JS pattern.
  let langPattern = patterns[ext.slice(1)];
  if (!langPattern) {
    if (base === 'dockerfile') langPattern = patterns.dockerfile;
    else if (base === 'makefile') langPattern = patterns.makefile;
    else if (base.startsWith('.env')) langPattern = patterns.env;
  }
  if (!langPattern || typeof langPattern.test !== 'function') {
    langPattern = patterns.js; // Default fallback
  }
  let currentChunk = [];
  let chunkStartLine = 0;
  let lineTokenCounts = []; // Cache token counts for overlap calculation

  let currentTokenCount = 0;

  // Track bracket depth for better boundary detection
  let bracketDepth = 0;
  let braceDepth = 0;
  let parenDepth = 0;
  let inString = false;
  let inComment = false;
  let stringChar = null; // ' or " or `

  // Break a single line that exceeds maxTokens into character segments,
  // sized from the line's observed chars-per-token ratio.
  const splitOversizedLine = (line, lineTokens) => {
    const charsPerToken = line.length / Math.max(1, lineTokens);
    const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
    const segments = [];

    for (let start = 0; start < line.length; start += segmentSize) {
      segments.push(line.slice(start, start + segmentSize));
    }

    return segments;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineTokens = estimateTokens(line, { includeSpecialTokens: false });

    let j = 0;

    // Simple state tracking for heuristics (not a full parser)
    if (inComment) {
      // Look for end of block comment
      const endIdx = line.indexOf('*/');
      if (endIdx !== -1) {
        inComment = false;
        j = endIdx + 2;
      } else {
        // Skip whole line
        j = line.length;
      }
    }

    const scanLine = j < line.length ? line.slice(j) : '';
    const trimmed = scanLine.trim();

    for (; j < line.length; j++) {
      const char = line[j];
      const nextChar = line[j + 1];

      if (inString) {
        if (char === '\\') {
          j++; // Skip escaped char
        } else if (char === stringChar) {
          inString = false;
          stringChar = null;
        }
      } else {
        // Check for comment start
        if (char === '/' && nextChar === '*') {
          inComment = true;
          j++;
          // Check if it ends on same line
          const endIdx = line.indexOf('*/', j);
          if (endIdx !== -1) {
            inComment = false;
            j = endIdx + 1;
          } else {
            break; // Rest of line is comment
          }
        } else if (char === '/' && nextChar === '/') {
          break; // Skip rest of line (line comment)
        } else if (char === "'" || char === '"' || char === '`') {
          inString = true;
          stringChar = char;
        } else {
          // Only count brackets if not in string or comment
          if (char === '{') braceDepth++;
          else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
          else if (char === '[') bracketDepth++;
          else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
          else if (char === '(') parenDepth++;
          else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
        }
      }
    }

    // Split lines that are too large to ever fit in a single chunk
    if (lineTokens + SPECIAL_TOKENS > maxTokens) {
      if (currentChunk.length > 0) {
        const chunkText = currentChunk.join('\n');
        if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
          chunks.push({
            text: chunkText,
            startLine: chunkStartLine + 1,
            endLine: i,
            tokenCount: currentTokenCount + SPECIAL_TOKENS,
          });
        }
      }

      const parts = splitOversizedLine(line, lineTokens);
      for (const part of parts) {
        if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
        const partTokens = estimateTokens(part, { includeSpecialTokens: false });
        chunks.push({
          text: part,
          startLine: i + 1,
          endLine: i + 1,
          tokenCount: partTokens + SPECIAL_TOKENS,
        });
      }

      currentChunk = [];
      lineTokenCounts = [];
      currentTokenCount = 0;
      chunkStartLine = i + 1;
      continue;
    }

    // Check if adding this line would exceed token limit
    const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
    const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;

    // Check if this is a good split point using multiple heuristics
    const matchesPattern = langPattern.test(trimmed);
    const atTopLevel =
      braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
    const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
    const isEmptyLine = trimmed.length === 0;
    const prevWasEmpty =
      i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
    const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);

    const isGoodSplitPoint =
      currentChunk.length > 3 &&
      ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
        (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
        (prevWasEmpty && (matchesPattern || isCommentStart)));

    const shouldSplit =
      wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);

    // Avoid splitting in weird states if possible
    const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;

    if (shouldSplit && safeToSplit && currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n');
      if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
        chunks.push({
          text: chunkText,
          startLine: chunkStartLine + 1,
          endLine: i,
          // BUGFIX: include SPECIAL_TOKENS here so every push site reports
          // the same accounting ([CLS] + [SEP] once per chunk); previously
          // this site alone omitted them.
          tokenCount: currentTokenCount + SPECIAL_TOKENS,
        });
      }

      let overlapLines = [];
      let overlapTokensCount = 0;
      let overlapStartOffset = 0; // Track how many lines back we went
      const MAX_OVERLAP_ITERATIONS = 50; // Absolute limit to prevent unbounded loops
      let overlapIterations = 0;
      for (
        let k = currentChunk.length - 1;
        k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
        k--
      ) {
        overlapIterations++;
        // Use cached token count instead of re-estimating
        const lineT = lineTokenCounts[k] ?? 0;
        // Guard against infinite loops: if lineT is 0, count the line but don't loop forever
        if (lineT <= 0) {
          // Include zero-token lines (e.g., empty lines) but limit to prevent infinite spin
          // Also guard with overlapStartOffset < 20 to prevent excessive lines even if under 10 in overlapLines
          if (overlapLines.length < 10 && overlapStartOffset < 20) {
            overlapLines.unshift(currentChunk[k]);
            overlapStartOffset++;
          }
          continue;
        }
        if (overlapTokensCount + lineT <= overlapTokens) {
          overlapLines.unshift(currentChunk[k]);
          overlapTokensCount += lineT;
          overlapStartOffset++;
        } else {
          break;
        }
      }

      currentChunk = overlapLines;
      // Rebuild lineTokenCounts for the overlap lines
      lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
      currentTokenCount = overlapTokensCount;
      // The new chunk starts from where the overlap begins in the original file
      // i is the current line we're about to process, overlap lines are from before
      // Ensure non-negative to handle edge cases where overlapStartOffset > i
      chunkStartLine = Math.max(0, i - overlapStartOffset);
    }

    currentChunk.push(line);
    lineTokenCounts.push(lineTokens);
    currentTokenCount += lineTokens;

    if (chunks.length >= (config.maxChunksPerFile || 1000)) {
      // Hard limit to prevent memory explosion on minified/data files
      break;
    }
  }

  // Add remaining chunk
  const chunkText = currentChunk.join('\n');
  if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
    chunks.push({
      text: chunkText,
      startLine: chunkStartLine + 1,
      endLine: lines.length,
      tokenCount: currentTokenCount + SPECIAL_TOKENS,
    });
  }

  return chunks;
}
|
|
1
|
+
import crypto from 'crypto';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { estimateTokens, getChunkingParams } from './tokenizer.js';
|
|
4
|
+
|
|
5
|
+
// Re-export tokenizer utilities
|
|
6
|
+
export {
|
|
7
|
+
estimateTokens,
|
|
8
|
+
getChunkingParams,
|
|
9
|
+
getModelTokenLimit,
|
|
10
|
+
MODEL_TOKEN_LIMITS,
|
|
11
|
+
} from './tokenizer.js';
|
|
12
|
+
|
|
13
|
+
// Minimum text length for a chunk to be considered valid (avoids tiny fragments)
|
|
14
|
+
import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Fast similarity for normalized vectors (dot product).
 * Uses loop unrolling for performance on large vectors.
 * NOTE: For very large codebases (10k+ chunks), consider WebAssembly SIMD
 * for ~2-4x speedup on 768-dim vectors.
 * @param {Float32Array} a - First normalized vector
 * @param {Float32Array} b - Second normalized vector
 * @returns {number} Dot product similarity score (-1 to 1 for normalized vectors)
 * @throws {Error} If vectors are null/undefined or have different dimensions
 */
export function dotSimilarity(a, b) {
  if (!a || !b) {
    throw new Error(
      'dotSimilarity requires two non-null vectors. ' +
        'This may indicate a missing embedding or corrupted cache entry.'
    );
  }
  if (a.length !== b.length) {
    throw new Error(
      `Vector dimension mismatch in dotSimilarity: ${a.length} vs ${b.length}. ` +
        'This may indicate an embedding dimension configuration change. Consider reindexing.'
    );
  }

  const length = a.length;
  const remainder = length % 4;
  let sum = 0;
  let idx = 0;

  // Consume the leading remainder one element at a time so the unrolled
  // loop below always sees a multiple of four elements.
  for (; idx < remainder; idx++) {
    sum += a[idx] * b[idx];
  }

  // Main loop, unrolled 4x.
  for (; idx < length; idx += 4) {
    sum +=
      a[idx] * b[idx] +
      a[idx + 1] * b[idx + 1] +
      a[idx + 2] * b[idx + 2] +
      a[idx + 3] * b[idx + 3];
  }

  return sum;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Generate an MD5 hex digest of file content, used to detect changes
 * between indexing runs. MD5 is a fast fingerprint here, not a
 * security-grade hash.
 * @param {string|Buffer} content - Raw file content
 * @returns {string} 32-character lowercase hex digest
 */
export function hashContent(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
|
|
63
|
+
|
|
64
|
+
// Language-specific patterns for function/class detection
// Keyed by file extension (lowercased, without the leading dot).
// Each regex matches the START of a line that likely begins a new logical
// unit (function, class, CSS rule, config section, ...). These are
// heuristics for chunk-boundary detection, not full parsers.
const patterns = {
  // JavaScript/TypeScript
  js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
  mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,

  // Python
  py: /^(class|def|async\s+def)\s+\w+/,
  pyw: /^(class|def|async\s+def)\s+\w+/,
  pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
  // Java/Kotlin/Scala
  java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
  kt: /^(class|interface|object|fun|val|var)\s+\w+/,
  kts: /^(class|interface|object|fun|val|var)\s+\w+/,
  scala: /^(class|object|trait|def|val|var)\s+\w+/,

  // C/C++
  c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
  cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
  hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

  // C#
  cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
  csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

  // Go
  go: /^(func|type|const|var)\s+\w+/,

  // Rust
  rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

  // PHP
  php: /^(class|interface|trait|function|const)\s+\w+/,
  phtml: /^(<\?php|class|interface|trait|function)\s*/,

  // Ruby
  rb: /^(class|module|def)\s+\w+/,
  rake: /^(class|module|def|task|namespace)\s+\w+/,

  // Swift
  swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

  // R
  r: /^(\w+)\s*(<-|=)\s*function/,
  // NOTE(review): smartChunk lowercases the extension before lookup, so this
  // uppercase 'R' key appears unreachable — confirm before removing.
  R: /^(\w+)\s*(<-|=)\s*function/,

  // Lua
  lua: /^(function|local\s+function)\s+\w+/,

  // Shell scripts
  sh: /^(\w+\s*\(\)|function\s+\w+)/,
  bash: /^(\w+\s*\(\)|function\s+\w+)/,
  zsh: /^(\w+\s*\(\)|function\s+\w+)/,
  fish: /^function\s+\w+/,

  // CSS/Styles
  css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
  scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
  sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
  less: /^(@\w+:|\.|#|@media)\s*/,
  styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,

  // Markup/HTML
  html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
  xml: /^(<\w+|\s*<!\[CDATA\[)/,
  svg: /^(<svg|<g|<path|<defs|<symbol)\b/,

  // Config files
  json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
  yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
  toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
  ini: /^(\[\w+\]|\w+\s*=)/,
  env: /^[A-Z_][A-Z0-9_]*=/,

  // Makefile (used via basename fallback in smartChunk as well as the .mk extension)
  makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
  mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,

  // Docker (used via basename fallback in smartChunk — 'Dockerfile' has no extension)
  dockerfile:
    /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,

  // Documentation
  md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
  mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
  txt: /^.{50,}/, // Split on long paragraphs
  rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,

  // Database
  sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,

  // Perl
  pl: /^(sub|package|use|require)\s+\w+/,
  pm: /^(sub|package|use|require)\s+\w+/,

  // Vim
  vim: /^(function|command|autocmd|let\s+g:)\s*/,
};
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* Intelligent chunking with token limit awareness
|
|
176
|
+
* Tries to split by function/class boundaries while respecting token limits
|
|
177
|
+
*
|
|
178
|
+
* @param {string} content - File content to chunk
|
|
179
|
+
* @param {string} file - File path (for language detection)
|
|
180
|
+
* @param {object} config - Configuration object with embeddingModel
|
|
181
|
+
* @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
|
|
182
|
+
*/
|
|
183
|
+
export function smartChunk(content, file, config) {
|
|
184
|
+
const lines = content.split('\n');
|
|
185
|
+
const chunks = [];
|
|
186
|
+
const ext = path.extname(file).toLowerCase();
|
|
187
|
+
const base = path.basename(file).toLowerCase();
|
|
188
|
+
const SPECIAL_TOKENS = 2; // [CLS] + [SEP] accounted once per chunk
|
|
189
|
+
|
|
190
|
+
// Get model-specific chunking parameters with optional user overrides
|
|
191
|
+
let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
|
|
192
|
+
if (config.maxTokens) maxTokens = config.maxTokens;
|
|
193
|
+
if (config.targetTokens) targetTokens = config.targetTokens;
|
|
194
|
+
if (config.overlapTokens) overlapTokens = config.overlapTokens;
|
|
195
|
+
|
|
196
|
+
let langPattern = patterns[ext.slice(1)];
|
|
197
|
+
if (!langPattern) {
|
|
198
|
+
if (base === 'dockerfile') langPattern = patterns.dockerfile;
|
|
199
|
+
else if (base === 'makefile') langPattern = patterns.makefile;
|
|
200
|
+
else if (base.startsWith('.env')) langPattern = patterns.env;
|
|
201
|
+
}
|
|
202
|
+
if (!langPattern || typeof langPattern.test !== 'function') {
|
|
203
|
+
langPattern = patterns.js; // Default fallback
|
|
204
|
+
}
|
|
205
|
+
let currentChunk = [];
|
|
206
|
+
let chunkStartLine = 0;
|
|
207
|
+
let lineTokenCounts = []; // Cache token counts for overlap calculation
|
|
208
|
+
|
|
209
|
+
let currentTokenCount = 0;
|
|
210
|
+
|
|
211
|
+
// Track bracket depth for better boundary detection
|
|
212
|
+
let bracketDepth = 0;
|
|
213
|
+
let braceDepth = 0;
|
|
214
|
+
let parenDepth = 0;
|
|
215
|
+
let inString = false;
|
|
216
|
+
let inComment = false;
|
|
217
|
+
let stringChar = null; // ' or " or `
|
|
218
|
+
|
|
219
|
+
const splitOversizedLine = (line, lineTokens) => {
|
|
220
|
+
const charsPerToken = line.length / Math.max(1, lineTokens);
|
|
221
|
+
const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
|
|
222
|
+
const segments = [];
|
|
223
|
+
|
|
224
|
+
for (let start = 0; start < line.length; start += segmentSize) {
|
|
225
|
+
segments.push(line.slice(start, start + segmentSize));
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
return segments;
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
// Walk every source line, accumulating lines into currentChunk and emitting a
// chunk whenever the token budget is exceeded or a heuristic split point is
// found. NOTE(review): lines / chunks / currentChunk / chunkStartLine /
// lineTokenCounts and the budget constants (targetTokens, maxTokens,
// overlapTokens, SPECIAL_TOKENS, MIN_CHUNK_TEXT_LENGTH) are declared above
// this fragment — confirm against the enclosing function.
for (let i = 0; i < lines.length; i++) {
  const line = lines[i];
  const lineTokens = estimateTokens(line, { includeSpecialTokens: false });

  // j is the scan cursor within this line; advanced past comment regions.
  let j = 0;

  // Simple state tracking for heuristics (not a full parser)
  if (inComment) {
    // Look for end of block comment
    const endIdx = line.indexOf('*/');
    if (endIdx !== -1) {
      inComment = false;
      j = endIdx + 2; // Resume scanning just past the closing */
    } else {
      // Skip whole line
      j = line.length;
    }
  }

  // The portion of the line outside a carried-over block comment; used by the
  // split-point heuristics below.
  const scanLine = j < line.length ? line.slice(j) : '';
  const trimmed = scanLine.trim();

  // Character scan: maintain string/comment state and bracket depths across
  // lines so split points can prefer top-level boundaries.
  for (; j < line.length; j++) {
    const char = line[j];
    const nextChar = line[j + 1];

    if (inString) {
      if (char === '\\') {
        j++; // Skip escaped char
      } else if (char === stringChar) {
        inString = false;
        stringChar = null;
      }
    } else {
      // Check for comment start
      if (char === '/' && nextChar === '*') {
        inComment = true;
        j++;
        // Check if it ends on same line
        const endIdx = line.indexOf('*/', j);
        if (endIdx !== -1) {
          inComment = false;
          j = endIdx + 1; // Points at the '/'; loop increment moves past it
        } else {
          break; // Rest of line is comment
        }
      } else if (char === '/' && nextChar === '/') {
        break; // Skip rest of line (line comment)
      } else if (char === "'" || char === '"' || char === '`') {
        inString = true;
        stringChar = char;
      } else {
        // Only count brackets if not in string or comment
        if (char === '{') braceDepth++;
        else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
        else if (char === '[') bracketDepth++;
        else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
        else if (char === '(') parenDepth++;
        else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
      }
    }
  }

  // Split lines that are too large to ever fit in a single chunk
  if (lineTokens + SPECIAL_TOKENS > maxTokens) {
    // Flush the pending chunk first so the oversized line stands alone.
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n');
      if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
        chunks.push({
          text: chunkText,
          startLine: chunkStartLine + 1, // stored line numbers are 1-based
          endLine: i,
          tokenCount: currentTokenCount + SPECIAL_TOKENS,
        });
      }
    }

    // Each segment of the oversized line becomes its own single-line chunk.
    const parts = splitOversizedLine(line, lineTokens);
    for (const part of parts) {
      if (part.trim().length <= MIN_CHUNK_TEXT_LENGTH) continue;
      const partTokens = estimateTokens(part, { includeSpecialTokens: false });
      chunks.push({
        text: part,
        startLine: i + 1,
        endLine: i + 1,
        tokenCount: partTokens + SPECIAL_TOKENS,
      });
    }

    // Reset the accumulator; the oversized line is never carried forward.
    currentChunk = [];
    lineTokenCounts = [];
    currentTokenCount = 0;
    chunkStartLine = i + 1;
    continue;
  }

  // Check if adding this line would exceed token limit
  const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
  const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;

  // Check if this is a good split point using multiple heuristics
  const matchesPattern = langPattern.test(trimmed);
  const atTopLevel =
    braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
  const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
  const isEmptyLine = trimmed.length === 0;
  const prevWasEmpty =
    i > 0 && currentChunk.length > 0 && currentChunk.at(-1).trim().length === 0;
  // Doc-comment openers or ===/--- divider comments also make good boundaries.
  const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);

  const isGoodSplitPoint =
    currentChunk.length > 3 &&
    ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
      (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
      (prevWasEmpty && (matchesPattern || isCommentStart)));

  // Split early at a good boundary once the chunk is 60% full, or always when
  // the budget would be exceeded.
  const shouldSplit =
    wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);

  // Avoid splitting in weird states if possible
  const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;

  if (shouldSplit && safeToSplit && currentChunk.length > 0) {
    const chunkText = currentChunk.join('\n');
    if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
      chunks.push({
        text: chunkText,
        startLine: chunkStartLine + 1,
        endLine: i,
        tokenCount: currentTokenCount,
      });
    }

    // Carry trailing lines of the emitted chunk into the next one so context
    // overlaps across chunk boundaries, up to roughly overlapTokens tokens.
    let overlapLines = [];
    let overlapTokensCount = 0;
    let overlapStartOffset = 0; // Track how many lines back we went
    const MAX_OVERLAP_ITERATIONS = 50; // Absolute limit to prevent unbounded loops
    let overlapIterations = 0;
    for (
      let k = currentChunk.length - 1;
      k >= 0 && overlapTokensCount < overlapTokens && overlapIterations < MAX_OVERLAP_ITERATIONS;
      k--
    ) {
      overlapIterations++;
      // Use cached token count instead of re-estimating
      const lineT = lineTokenCounts[k] ?? 0;
      // Guard against infinite loops: if lineT is 0, count the line but don't loop forever
      if (lineT <= 0) {
        // Include zero-token lines (e.g., empty lines) but limit to prevent infinite spin
        // Also guard with overlapStartOffset < 20 to prevent excessive lines even if under 10 in overlapLines
        if (overlapLines.length < 10 && overlapStartOffset < 20) {
          overlapLines.unshift(currentChunk[k]);
          overlapStartOffset++;
        }
        continue;
      }
      if (overlapTokensCount + lineT <= overlapTokens) {
        overlapLines.unshift(currentChunk[k]);
        overlapTokensCount += lineT;
        overlapStartOffset++;
      } else {
        break;
      }
    }

    currentChunk = overlapLines;
    // Rebuild lineTokenCounts for the overlap lines
    lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
    currentTokenCount = overlapTokensCount;
    // The new chunk starts from where the overlap begins in the original file
    // i is the current line we're about to process, overlap lines are from before
    // Ensure non-negative to handle edge cases where overlapStartOffset > i
    chunkStartLine = Math.max(0, i - overlapStartOffset);
  }

  // Append the current line to the (possibly freshly overlapped) accumulator.
  currentChunk.push(line);
  lineTokenCounts.push(lineTokens);
  currentTokenCount += lineTokens;

  if (chunks.length >= (config.maxChunksPerFile || 1000)) {
    // Hard limit to prevent memory explosion on minified/data files
    break;
  }
}
|
|
415
|
+
|
|
416
|
+
// Add remaining chunk
// Flush whatever is still in the accumulator as the file's final chunk,
// skipping trailing fragments too short to be worth indexing.
const chunkText = currentChunk.join('\n');
if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
  chunks.push({
    text: chunkText,
    startLine: chunkStartLine + 1, // chunkStartLine is 0-based; output is 1-based
    endLine: lines.length,
    // SPECIAL_TOKENS presumably accounts for per-chunk model overhead
    // (e.g. BOS/EOS) — confirm against its declaration above this fragment.
    tokenCount: currentTokenCount + SPECIAL_TOKENS,
  });
}

return chunks;
|
|
428
|
+
}
|