@rce-mcp/retrieval-core 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +7 -0
- package/dist/.tsbuildinfo +1 -1
- package/dist/chunking.d.ts +13 -0
- package/dist/chunking.js +488 -77
- package/dist/index.d.ts +61 -0
- package/dist/index.js +993 -20
- package/dist/remote-sync.js +2 -1
- package/package.json +2 -2
- package/scripts/poc-parser-availability-benchmark.ts +2 -0
- package/src/chunking.ts +573 -80
- package/src/index.ts +1247 -20
- package/src/remote-sync.ts +3 -1
- package/test/benchmark.thresholds.test.ts +8 -0
- package/test/chunking.config.test.ts +47 -1
- package/test/chunking.language-aware.test.ts +227 -0
- package/test/embedding-context-prefix.test.ts +101 -0
- package/test/enhance-confidence.test.ts +4 -4
- package/test/mcp-search-quality.regression.test.ts +691 -4
- package/test/remote-sync.integration.test.ts +5 -1
- package/test/smart-cutoff.config.test.ts +86 -0
- package/test/snippet-integrity.config.test.ts +59 -0
package/dist/chunking.js
CHANGED
|
@@ -3,7 +3,7 @@ import Go from "tree-sitter-go";
|
|
|
3
3
|
import JavaScriptV023 from "tree-sitter-javascript-v023";
|
|
4
4
|
import PythonV023 from "tree-sitter-python-v023";
|
|
5
5
|
import TypeScript from "tree-sitter-typescript";
|
|
6
|
-
const
|
|
6
|
+
const DEFAULT_BOUNDARY_NODE_TYPES_LEGACY = {
|
|
7
7
|
typescript: new Set([
|
|
8
8
|
"function_declaration",
|
|
9
9
|
"generator_function_declaration",
|
|
@@ -20,11 +20,35 @@ const DEFAULT_BOUNDARY_NODE_TYPES = {
|
|
|
20
20
|
"enum_declaration",
|
|
21
21
|
"type_alias_declaration"
|
|
22
22
|
]),
|
|
23
|
-
javascript: new Set([
|
|
24
|
-
|
|
23
|
+
javascript: new Set([
|
|
24
|
+
"function_declaration",
|
|
25
|
+
"generator_function_declaration",
|
|
26
|
+
"class_declaration",
|
|
27
|
+
"function_expression",
|
|
28
|
+
"arrow_function"
|
|
29
|
+
]),
|
|
30
|
+
jsx: new Set([
|
|
31
|
+
"function_declaration",
|
|
32
|
+
"generator_function_declaration",
|
|
33
|
+
"class_declaration",
|
|
34
|
+
"function_expression",
|
|
35
|
+
"arrow_function"
|
|
36
|
+
]),
|
|
25
37
|
python: new Set(["function_definition", "class_definition"]),
|
|
26
38
|
go: new Set(["function_declaration", "method_declaration", "type_declaration"])
|
|
27
39
|
};
|
|
40
|
+
const DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS = {
|
|
41
|
+
...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY,
|
|
42
|
+
typescript: new Set([
|
|
43
|
+
...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.typescript,
|
|
44
|
+
"function_expression",
|
|
45
|
+
"arrow_function",
|
|
46
|
+
"method_definition"
|
|
47
|
+
]),
|
|
48
|
+
tsx: new Set([...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.tsx, "function_expression", "arrow_function", "method_definition"]),
|
|
49
|
+
javascript: new Set([...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.javascript, "method_definition"]),
|
|
50
|
+
jsx: new Set([...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.jsx, "method_definition"])
|
|
51
|
+
};
|
|
28
52
|
const parserAvailabilityCache = new Map();
|
|
29
53
|
const parserInitAttempts = new Map();
|
|
30
54
|
const parserLanguageLoaderOverrides = new Map();
|
|
@@ -34,6 +58,15 @@ const CANONICAL_TO_PARSER_LANGUAGE = {
|
|
|
34
58
|
python: "python",
|
|
35
59
|
go: "go"
|
|
36
60
|
};
|
|
61
|
+
const JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES = new Set([
|
|
62
|
+
"assignment_expression",
|
|
63
|
+
"variable_declarator",
|
|
64
|
+
"pair",
|
|
65
|
+
"export_statement",
|
|
66
|
+
"public_field_definition",
|
|
67
|
+
"property_definition"
|
|
68
|
+
]);
|
|
69
|
+
const SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER = 1.35;
|
|
37
70
|
function parserLanguageToCanonical(language) {
|
|
38
71
|
if (language === "tsx") {
|
|
39
72
|
return "typescript";
|
|
@@ -70,7 +103,7 @@ function parserLanguageFromPath(path) {
|
|
|
70
103
|
if (normalized.endsWith(".tsx")) {
|
|
71
104
|
return "tsx";
|
|
72
105
|
}
|
|
73
|
-
if (normalized.endsWith(".ts")) {
|
|
106
|
+
if (normalized.endsWith(".ts") || normalized.endsWith(".mts") || normalized.endsWith(".cts")) {
|
|
74
107
|
return "typescript";
|
|
75
108
|
}
|
|
76
109
|
if (normalized.endsWith(".jsx")) {
|
|
@@ -238,36 +271,82 @@ function trimLineRange(lines, startRow, endRow) {
|
|
|
238
271
|
return { start, end };
|
|
239
272
|
}
|
|
240
273
|
function splitRangeWithBudget(input) {
|
|
274
|
+
const rangeTokenCount = (startRow, endRow) => {
|
|
275
|
+
let total = 0;
|
|
276
|
+
for (let row = startRow; row <= endRow; row += 1) {
|
|
277
|
+
total += input.lineTokenCounts[row] ?? 0;
|
|
278
|
+
}
|
|
279
|
+
return total;
|
|
280
|
+
};
|
|
281
|
+
const isSafeSplitBoundaryLine = (line) => {
|
|
282
|
+
const trimmed = line.trim();
|
|
283
|
+
if (trimmed.length === 0) {
|
|
284
|
+
return true;
|
|
285
|
+
}
|
|
286
|
+
return trimmed.endsWith(";") || trimmed.endsWith("}") || trimmed.endsWith("{");
|
|
287
|
+
};
|
|
241
288
|
const segments = [];
|
|
242
289
|
let start = input.startRow;
|
|
243
290
|
while (start <= input.endRow && segments.length < input.maxChunks) {
|
|
244
291
|
let tokens = 0;
|
|
245
|
-
let end = start;
|
|
246
|
-
while (end
|
|
247
|
-
|
|
248
|
-
|
|
292
|
+
let end = start - 1;
|
|
293
|
+
while (end < input.endRow) {
|
|
294
|
+
const nextEnd = end + 1;
|
|
295
|
+
tokens += input.lineTokenCounts[nextEnd] ?? 0;
|
|
296
|
+
end = nextEnd;
|
|
297
|
+
if (tokens >= input.targetChunkTokens && end >= start) {
|
|
249
298
|
break;
|
|
250
299
|
}
|
|
251
|
-
end += 1;
|
|
252
300
|
}
|
|
253
|
-
|
|
301
|
+
let safeEnd = Math.min(Math.max(start, end), input.endRow);
|
|
302
|
+
if (input.preferSafeBoundarySplit && safeEnd > start) {
|
|
303
|
+
let adjusted = safeEnd;
|
|
304
|
+
for (let row = safeEnd; row > start; row -= 1) {
|
|
305
|
+
if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
|
|
306
|
+
adjusted = row;
|
|
307
|
+
break;
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
if (adjusted === safeEnd && typeof input.softMaxChunkTokens === "number" && input.softMaxChunkTokens > input.targetChunkTokens) {
|
|
311
|
+
for (let row = safeEnd + 1; row <= input.endRow; row += 1) {
|
|
312
|
+
if (rangeTokenCount(start, row) > input.softMaxChunkTokens) {
|
|
313
|
+
break;
|
|
314
|
+
}
|
|
315
|
+
if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
|
|
316
|
+
adjusted = row;
|
|
317
|
+
break;
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
safeEnd = Math.max(start, adjusted);
|
|
322
|
+
}
|
|
254
323
|
if (safeEnd >= start) {
|
|
255
324
|
segments.push({ startRow: start, endRow: safeEnd });
|
|
256
325
|
}
|
|
257
326
|
if (safeEnd >= input.endRow) {
|
|
258
327
|
break;
|
|
259
328
|
}
|
|
260
|
-
|
|
261
|
-
|
|
329
|
+
let nextStart = safeEnd + 1;
|
|
330
|
+
if (input.overlapTokens > 0) {
|
|
331
|
+
let overlap = 0;
|
|
332
|
+
let cursor = safeEnd;
|
|
333
|
+
while (cursor >= start && overlap < input.overlapTokens) {
|
|
334
|
+
overlap += input.lineTokenCounts[cursor] ?? 0;
|
|
335
|
+
cursor -= 1;
|
|
336
|
+
}
|
|
337
|
+
nextStart = Math.max(start + 1, cursor + 1);
|
|
338
|
+
}
|
|
339
|
+
start = Math.max(start + 1, nextStart);
|
|
262
340
|
}
|
|
263
341
|
return segments;
|
|
264
342
|
}
|
|
265
343
|
function buildSlidingChunks(input) {
|
|
344
|
+
const lineTokenCounts = input.lineTokenCounts ?? computeLineTokenCounts(input.lines, input.tokenize);
|
|
266
345
|
const rawSegments = splitRangeWithBudget({
|
|
267
346
|
lines: input.lines,
|
|
347
|
+
lineTokenCounts,
|
|
268
348
|
startRow: 0,
|
|
269
349
|
endRow: Math.max(0, input.lines.length - 1),
|
|
270
|
-
tokenize: input.tokenize,
|
|
271
350
|
targetChunkTokens: input.targetChunkTokens,
|
|
272
351
|
overlapTokens: input.overlapTokens,
|
|
273
352
|
maxChunks: input.maxChunks
|
|
@@ -296,8 +375,281 @@ function hasBoundaryAncestor(node, boundaryTypes) {
|
|
|
296
375
|
}
|
|
297
376
|
return false;
|
|
298
377
|
}
|
|
378
|
+
function getBoundaryTypes(parserLanguage, boundaryStrictness) {
|
|
379
|
+
if (boundaryStrictness === "semantic_js_ts") {
|
|
380
|
+
return DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS[parserLanguage];
|
|
381
|
+
}
|
|
382
|
+
return DEFAULT_BOUNDARY_NODE_TYPES_LEGACY[parserLanguage];
|
|
383
|
+
}
|
|
384
|
+
function isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness) {
|
|
385
|
+
if (boundaryStrictness === "semantic_js_ts") {
|
|
386
|
+
return (parserLanguage === "javascript" ||
|
|
387
|
+
parserLanguage === "jsx" ||
|
|
388
|
+
parserLanguage === "typescript" ||
|
|
389
|
+
parserLanguage === "tsx");
|
|
390
|
+
}
|
|
391
|
+
return parserLanguage === "javascript" || parserLanguage === "jsx";
|
|
392
|
+
}
|
|
393
|
+
function isLanguageBoundaryCandidate(parserLanguage, node, boundaryStrictness) {
|
|
394
|
+
if (!isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness)) {
|
|
395
|
+
return true;
|
|
396
|
+
}
|
|
397
|
+
if (node.type !== "function_expression" && node.type !== "arrow_function") {
|
|
398
|
+
return true;
|
|
399
|
+
}
|
|
400
|
+
const parentType = node.parent?.type;
|
|
401
|
+
if (!parentType) {
|
|
402
|
+
return false;
|
|
403
|
+
}
|
|
404
|
+
return JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(parentType);
|
|
405
|
+
}
|
|
406
|
+
export function __isChunkingBoundaryCandidateForTests(input) {
|
|
407
|
+
const strictness = input.boundaryStrictness ?? "legacy";
|
|
408
|
+
if (!isExpressionBoundaryLanguage(input.parserLanguage, strictness)) {
|
|
409
|
+
return true;
|
|
410
|
+
}
|
|
411
|
+
if (input.nodeType !== "function_expression" && input.nodeType !== "arrow_function") {
|
|
412
|
+
return true;
|
|
413
|
+
}
|
|
414
|
+
if (!input.parentType) {
|
|
415
|
+
return false;
|
|
416
|
+
}
|
|
417
|
+
return JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(input.parentType);
|
|
418
|
+
}
|
|
419
|
+
function computeLineTokenCounts(lines, tokenize) {
|
|
420
|
+
return lines.map((line) => tokenize(line ?? "").length);
|
|
421
|
+
}
|
|
422
|
+
function rangeTokenCount(lineTokenCounts, startRow, endRow) {
|
|
423
|
+
let total = 0;
|
|
424
|
+
for (let row = startRow; row <= endRow; row += 1) {
|
|
425
|
+
total += lineTokenCounts[row] ?? 0;
|
|
426
|
+
}
|
|
427
|
+
return total;
|
|
428
|
+
}
|
|
429
|
+
function listNamedChildren(node) {
|
|
430
|
+
const children = [];
|
|
431
|
+
for (let index = 0; index < node.namedChildCount; index += 1) {
|
|
432
|
+
const child = node.namedChild(index);
|
|
433
|
+
if (child) {
|
|
434
|
+
children.push(child);
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
return children;
|
|
438
|
+
}
|
|
439
|
+
function normalizeNodeWindow(input) {
|
|
440
|
+
const startRow = Math.max(0, Math.min(input.lastRow, input.node.startPosition.row));
|
|
441
|
+
const endRow = Math.max(startRow, Math.min(input.lastRow, toInclusiveEndRow(input.node)));
|
|
442
|
+
const trimmed = trimLineRange(input.lines, startRow, endRow);
|
|
443
|
+
if (!trimmed) {
|
|
444
|
+
return undefined;
|
|
445
|
+
}
|
|
446
|
+
return {
|
|
447
|
+
startRow: trimmed.start,
|
|
448
|
+
endRow: trimmed.end
|
|
449
|
+
};
|
|
450
|
+
}
|
|
451
|
+
function buildRecursiveSemanticWindows(input) {
|
|
452
|
+
const lastRow = Math.max(0, input.lines.length - 1);
|
|
453
|
+
const windows = [];
|
|
454
|
+
const softMaxChunkTokens = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
|
|
455
|
+
const seen = new Set();
|
|
456
|
+
const pushSplitWindows = (startRow, endRow) => {
|
|
457
|
+
if (startRow > endRow || windows.length >= input.maxChunks) {
|
|
458
|
+
return;
|
|
459
|
+
}
|
|
460
|
+
const segments = splitRangeWithBudget({
|
|
461
|
+
lines: input.lines,
|
|
462
|
+
lineTokenCounts: input.lineTokenCounts,
|
|
463
|
+
startRow,
|
|
464
|
+
endRow,
|
|
465
|
+
targetChunkTokens: input.targetChunkTokens,
|
|
466
|
+
overlapTokens: 0,
|
|
467
|
+
maxChunks: input.maxChunks - windows.length,
|
|
468
|
+
preferSafeBoundarySplit: input.boundaryStrictness === "semantic_js_ts",
|
|
469
|
+
softMaxChunkTokens
|
|
470
|
+
});
|
|
471
|
+
for (const segment of segments) {
|
|
472
|
+
const trimmed = trimLineRange(input.lines, segment.startRow, segment.endRow);
|
|
473
|
+
if (!trimmed) {
|
|
474
|
+
continue;
|
|
475
|
+
}
|
|
476
|
+
const key = `${trimmed.start}:${trimmed.end}`;
|
|
477
|
+
if (seen.has(key)) {
|
|
478
|
+
continue;
|
|
479
|
+
}
|
|
480
|
+
seen.add(key);
|
|
481
|
+
windows.push({ startRow: trimmed.start, endRow: trimmed.end });
|
|
482
|
+
if (windows.length >= input.maxChunks) {
|
|
483
|
+
return;
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
};
|
|
487
|
+
const visitNode = (node) => {
|
|
488
|
+
if (windows.length >= input.maxChunks) {
|
|
489
|
+
return;
|
|
490
|
+
}
|
|
491
|
+
const range = normalizeNodeWindow({
|
|
492
|
+
node,
|
|
493
|
+
lines: input.lines,
|
|
494
|
+
lastRow
|
|
495
|
+
});
|
|
496
|
+
if (!range) {
|
|
497
|
+
return;
|
|
498
|
+
}
|
|
499
|
+
const tokenCount = rangeTokenCount(input.lineTokenCounts, range.startRow, range.endRow);
|
|
500
|
+
if (tokenCount <= input.targetChunkTokens) {
|
|
501
|
+
const key = `${range.startRow}:${range.endRow}`;
|
|
502
|
+
if (!seen.has(key)) {
|
|
503
|
+
seen.add(key);
|
|
504
|
+
windows.push(range);
|
|
505
|
+
}
|
|
506
|
+
return;
|
|
507
|
+
}
|
|
508
|
+
const children = listNamedChildren(node)
|
|
509
|
+
.map((child) => ({
|
|
510
|
+
node: child,
|
|
511
|
+
range: normalizeNodeWindow({
|
|
512
|
+
node: child,
|
|
513
|
+
lines: input.lines,
|
|
514
|
+
lastRow
|
|
515
|
+
})
|
|
516
|
+
}))
|
|
517
|
+
.filter((child) => Boolean(child.range))
|
|
518
|
+
.sort((a, b) => a.range.startRow - b.range.startRow || a.range.endRow - b.range.endRow);
|
|
519
|
+
if (children.length === 0) {
|
|
520
|
+
pushSplitWindows(range.startRow, range.endRow);
|
|
521
|
+
return;
|
|
522
|
+
}
|
|
523
|
+
let cursor = range.startRow;
|
|
524
|
+
for (const child of children) {
|
|
525
|
+
if (windows.length >= input.maxChunks) {
|
|
526
|
+
return;
|
|
527
|
+
}
|
|
528
|
+
if (child.range.endRow < cursor) {
|
|
529
|
+
continue;
|
|
530
|
+
}
|
|
531
|
+
if (child.range.startRow > cursor) {
|
|
532
|
+
pushSplitWindows(cursor, child.range.startRow - 1);
|
|
533
|
+
}
|
|
534
|
+
visitNode(child.node);
|
|
535
|
+
cursor = Math.max(cursor, child.range.endRow + 1);
|
|
536
|
+
if (cursor > range.endRow) {
|
|
537
|
+
return;
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
if (cursor <= range.endRow) {
|
|
541
|
+
pushSplitWindows(cursor, range.endRow);
|
|
542
|
+
}
|
|
543
|
+
};
|
|
544
|
+
visitNode(input.root);
|
|
545
|
+
return windows.sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
|
|
546
|
+
}
|
|
547
|
+
function mergeSemanticWindows(input) {
|
|
548
|
+
if (input.windows.length <= 1) {
|
|
549
|
+
return [...input.windows];
|
|
550
|
+
}
|
|
551
|
+
const ordered = [...input.windows].sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
|
|
552
|
+
const merged = [];
|
|
553
|
+
const mergeTokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
|
|
554
|
+
for (const window of ordered) {
|
|
555
|
+
const last = merged[merged.length - 1];
|
|
556
|
+
if (!last) {
|
|
557
|
+
merged.push({ ...window });
|
|
558
|
+
continue;
|
|
559
|
+
}
|
|
560
|
+
const gapLines = Math.max(0, window.startRow - last.endRow - 1);
|
|
561
|
+
const nextStartRow = Math.min(last.startRow, window.startRow);
|
|
562
|
+
const nextEndRow = Math.max(last.endRow, window.endRow);
|
|
563
|
+
const nextSpanLines = nextEndRow - nextStartRow + 1;
|
|
564
|
+
const mergedTokenCount = rangeTokenCount(input.lineTokenCounts, nextStartRow, nextEndRow);
|
|
565
|
+
const canMerge = gapLines <= input.semanticMergeGapLines &&
|
|
566
|
+
nextSpanLines <= input.semanticMergeMaxSpanLines &&
|
|
567
|
+
mergedTokenCount <= mergeTokenBudget;
|
|
568
|
+
if (!canMerge) {
|
|
569
|
+
merged.push({ ...window });
|
|
570
|
+
continue;
|
|
571
|
+
}
|
|
572
|
+
last.startRow = nextStartRow;
|
|
573
|
+
last.endRow = nextEndRow;
|
|
574
|
+
}
|
|
575
|
+
return merged;
|
|
576
|
+
}
|
|
577
|
+
function isCommentOnlyLine(line) {
|
|
578
|
+
const trimmed = line.trim();
|
|
579
|
+
if (trimmed.length === 0) {
|
|
580
|
+
return true;
|
|
581
|
+
}
|
|
582
|
+
return (trimmed.startsWith("//") ||
|
|
583
|
+
trimmed.startsWith("/*") ||
|
|
584
|
+
trimmed.startsWith("*") ||
|
|
585
|
+
trimmed.startsWith("*/") ||
|
|
586
|
+
trimmed.startsWith("#"));
|
|
587
|
+
}
|
|
588
|
+
function windowLooksCommentOnly(input) {
|
|
589
|
+
for (let row = input.startRow; row <= input.endRow; row += 1) {
|
|
590
|
+
if (!isCommentOnlyLine(input.lines[row] ?? "")) {
|
|
591
|
+
return false;
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
return true;
|
|
595
|
+
}
|
|
596
|
+
function absorbForwardCommentWindows(input) {
|
|
597
|
+
if (input.windows.length <= 1) {
|
|
598
|
+
return [...input.windows];
|
|
599
|
+
}
|
|
600
|
+
const output = [];
|
|
601
|
+
const mergeTokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
|
|
602
|
+
for (let index = 0; index < input.windows.length; index += 1) {
|
|
603
|
+
const current = input.windows[index];
|
|
604
|
+
const next = input.windows[index + 1];
|
|
605
|
+
if (!current) {
|
|
606
|
+
continue;
|
|
607
|
+
}
|
|
608
|
+
if (!next) {
|
|
609
|
+
output.push({ ...current });
|
|
610
|
+
continue;
|
|
611
|
+
}
|
|
612
|
+
if (!windowLooksCommentOnly({ lines: input.lines, startRow: current.startRow, endRow: current.endRow })) {
|
|
613
|
+
output.push({ ...current });
|
|
614
|
+
continue;
|
|
615
|
+
}
|
|
616
|
+
const gapLines = Math.max(0, next.startRow - current.endRow - 1);
|
|
617
|
+
const nextSpanLines = next.endRow - current.startRow + 1;
|
|
618
|
+
const mergedTokenCount = rangeTokenCount(input.lineTokenCounts, current.startRow, next.endRow);
|
|
619
|
+
const canAbsorb = gapLines <= 1 && nextSpanLines <= input.semanticMergeMaxSpanLines && mergedTokenCount <= mergeTokenBudget;
|
|
620
|
+
if (!canAbsorb) {
|
|
621
|
+
output.push({ ...current });
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
624
|
+
output.push({
|
|
625
|
+
startRow: current.startRow,
|
|
626
|
+
endRow: next.endRow
|
|
627
|
+
});
|
|
628
|
+
index += 1;
|
|
629
|
+
}
|
|
630
|
+
return output;
|
|
631
|
+
}
|
|
632
|
+
function windowsToChunks(input) {
|
|
633
|
+
const chunks = [];
|
|
634
|
+
for (const window of input.windows) {
|
|
635
|
+
if (chunks.length >= input.maxChunks) {
|
|
636
|
+
break;
|
|
637
|
+
}
|
|
638
|
+
const trimmed = trimLineRange(input.lines, window.startRow, window.endRow);
|
|
639
|
+
if (!trimmed) {
|
|
640
|
+
continue;
|
|
641
|
+
}
|
|
642
|
+
chunks.push({
|
|
643
|
+
start_line: trimmed.start + 1,
|
|
644
|
+
end_line: trimmed.end + 1,
|
|
645
|
+
snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
|
|
646
|
+
});
|
|
647
|
+
}
|
|
648
|
+
return chunks;
|
|
649
|
+
}
|
|
299
650
|
function buildLanguageAwareChunks(input) {
|
|
300
651
|
const languageAwareAttemptStart = Date.now();
|
|
652
|
+
const lineTokenCounts = computeLineTokenCounts(input.lines, input.tokenize);
|
|
301
653
|
const parser = getParser(input.parserLanguage);
|
|
302
654
|
if (!parser) {
|
|
303
655
|
const fallbackStart = Date.now();
|
|
@@ -306,7 +658,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
306
658
|
tokenize: input.tokenize,
|
|
307
659
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
308
660
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
309
|
-
maxChunks: input.config.max_chunks_per_file
|
|
661
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
662
|
+
lineTokenCounts
|
|
310
663
|
});
|
|
311
664
|
return {
|
|
312
665
|
chunks,
|
|
@@ -329,7 +682,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
329
682
|
tokenize: input.tokenize,
|
|
330
683
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
331
684
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
332
|
-
maxChunks: input.config.max_chunks_per_file
|
|
685
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
686
|
+
lineTokenCounts
|
|
333
687
|
});
|
|
334
688
|
return {
|
|
335
689
|
chunks,
|
|
@@ -349,7 +703,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
349
703
|
tokenize: input.tokenize,
|
|
350
704
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
351
705
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
352
|
-
maxChunks: input.config.max_chunks_per_file
|
|
706
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
707
|
+
lineTokenCounts
|
|
353
708
|
});
|
|
354
709
|
return {
|
|
355
710
|
chunks,
|
|
@@ -361,74 +716,127 @@ function buildLanguageAwareChunks(input) {
|
|
|
361
716
|
language: parserLanguageToCanonical(input.parserLanguage)
|
|
362
717
|
};
|
|
363
718
|
}
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
.
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
const chunks = buildSlidingChunks({
|
|
719
|
+
let chunks = [];
|
|
720
|
+
let recursiveSemanticChunkingUsed = false;
|
|
721
|
+
if (input.config.recursive_semantic_chunking_enabled) {
|
|
722
|
+
const semanticMergeGapLines = input.config.semantic_merge_gap_lines ?? 6;
|
|
723
|
+
const semanticMergeMaxSpanLines = input.config.semantic_merge_max_span_lines ?? 220;
|
|
724
|
+
const recursiveWindows = buildRecursiveSemanticWindows({
|
|
725
|
+
root,
|
|
372
726
|
lines: input.lines,
|
|
373
|
-
|
|
727
|
+
lineTokenCounts,
|
|
374
728
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
375
|
-
|
|
729
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
730
|
+
boundaryStrictness: input.config.boundary_strictness
|
|
731
|
+
});
|
|
732
|
+
const mergedWindows = mergeSemanticWindows({
|
|
733
|
+
windows: recursiveWindows,
|
|
734
|
+
lineTokenCounts,
|
|
735
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
736
|
+
semanticMergeGapLines,
|
|
737
|
+
semanticMergeMaxSpanLines
|
|
738
|
+
});
|
|
739
|
+
const absorbedWindows = input.config.comment_forward_absorb_enabled === false
|
|
740
|
+
? mergedWindows
|
|
741
|
+
: absorbForwardCommentWindows({
|
|
742
|
+
windows: mergedWindows,
|
|
743
|
+
lines: input.lines,
|
|
744
|
+
lineTokenCounts,
|
|
745
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
746
|
+
semanticMergeMaxSpanLines
|
|
747
|
+
});
|
|
748
|
+
chunks = windowsToChunks({
|
|
749
|
+
windows: absorbedWindows,
|
|
750
|
+
lines: input.lines,
|
|
376
751
|
maxChunks: input.config.max_chunks_per_file
|
|
377
752
|
});
|
|
378
|
-
|
|
379
|
-
chunks,
|
|
380
|
-
strategy: "sliding",
|
|
381
|
-
fallback_reason: "empty_language_boundaries",
|
|
382
|
-
parse_latency_ms: parseLatencyMs,
|
|
383
|
-
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
384
|
-
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
385
|
-
language: parserLanguageToCanonical(input.parserLanguage)
|
|
386
|
-
};
|
|
753
|
+
recursiveSemanticChunkingUsed = chunks.length > 0;
|
|
387
754
|
}
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
755
|
+
else {
|
|
756
|
+
const boundaryTypes = getBoundaryTypes(input.parserLanguage, input.config.boundary_strictness);
|
|
757
|
+
const candidates = root.descendantsOfType([...boundaryTypes]);
|
|
758
|
+
const boundaryNodes = candidates
|
|
759
|
+
.filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
|
|
760
|
+
.filter((node) => isLanguageBoundaryCandidate(input.parserLanguage, node, input.config.boundary_strictness))
|
|
761
|
+
.sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
|
|
762
|
+
if (boundaryNodes.length === 0) {
|
|
763
|
+
const fallbackStart = Date.now();
|
|
764
|
+
const fallbackChunks = buildSlidingChunks({
|
|
765
|
+
lines: input.lines,
|
|
766
|
+
tokenize: input.tokenize,
|
|
767
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
768
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
769
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
770
|
+
lineTokenCounts
|
|
771
|
+
});
|
|
772
|
+
return {
|
|
773
|
+
chunks: fallbackChunks,
|
|
774
|
+
strategy: "sliding",
|
|
775
|
+
fallback_reason: "empty_language_boundaries",
|
|
776
|
+
parse_latency_ms: parseLatencyMs,
|
|
777
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
778
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
779
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
780
|
+
};
|
|
396
781
|
}
|
|
397
|
-
segments
|
|
398
|
-
cursor =
|
|
399
|
-
|
|
400
|
-
|
|
782
|
+
const segments = [];
|
|
783
|
+
let cursor = 0;
|
|
784
|
+
const lastRow = Math.max(0, input.lines.length - 1);
|
|
785
|
+
for (const node of boundaryNodes) {
|
|
786
|
+
const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
|
|
787
|
+
const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
|
|
788
|
+
if (startRow > cursor) {
|
|
789
|
+
segments.push({ startRow: cursor, endRow: startRow - 1, boundary: false });
|
|
790
|
+
}
|
|
791
|
+
segments.push({ startRow, endRow, boundary: true });
|
|
792
|
+
cursor = endRow + 1;
|
|
793
|
+
if (cursor > lastRow) {
|
|
794
|
+
break;
|
|
795
|
+
}
|
|
401
796
|
}
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
segments.push({ startRow: cursor, endRow: lastRow });
|
|
405
|
-
}
|
|
406
|
-
const chunks = [];
|
|
407
|
-
for (const segment of segments) {
|
|
408
|
-
if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
|
|
409
|
-
continue;
|
|
797
|
+
if (cursor <= lastRow) {
|
|
798
|
+
segments.push({ startRow: cursor, endRow: lastRow, boundary: false });
|
|
410
799
|
}
|
|
411
|
-
const
|
|
412
|
-
|
|
413
|
-
startRow: segment.startRow,
|
|
414
|
-
endRow: segment.endRow,
|
|
415
|
-
tokenize: input.tokenize,
|
|
416
|
-
targetChunkTokens: input.config.target_chunk_tokens,
|
|
417
|
-
overlapTokens: input.config.chunk_overlap_tokens,
|
|
418
|
-
maxChunks: input.config.max_chunks_per_file - chunks.length
|
|
419
|
-
});
|
|
420
|
-
for (const piece of pieces) {
|
|
421
|
-
const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
|
|
422
|
-
if (!trimmed) {
|
|
800
|
+
for (const segment of segments) {
|
|
801
|
+
if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
|
|
423
802
|
continue;
|
|
424
803
|
}
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
804
|
+
const segmentTokenCount = lineTokenCounts
|
|
805
|
+
.slice(segment.startRow, segment.endRow + 1)
|
|
806
|
+
.reduce((sum, value) => sum + value, 0);
|
|
807
|
+
const enableSemanticBoundarySplits = input.config.boundary_strictness === "semantic_js_ts" &&
|
|
808
|
+
(input.parserLanguage === "javascript" ||
|
|
809
|
+
input.parserLanguage === "jsx" ||
|
|
810
|
+
input.parserLanguage === "typescript" ||
|
|
811
|
+
input.parserLanguage === "tsx") &&
|
|
812
|
+
segment.boundary;
|
|
813
|
+
const softMaxChunkTokens = Math.floor(input.config.target_chunk_tokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
|
|
814
|
+
const pieces = enableSemanticBoundarySplits && segmentTokenCount <= softMaxChunkTokens
|
|
815
|
+
? [{ startRow: segment.startRow, endRow: segment.endRow }]
|
|
816
|
+
: splitRangeWithBudget({
|
|
817
|
+
lines: input.lines,
|
|
818
|
+
lineTokenCounts,
|
|
819
|
+
startRow: segment.startRow,
|
|
820
|
+
endRow: segment.endRow,
|
|
821
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
822
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
823
|
+
maxChunks: input.config.max_chunks_per_file - chunks.length,
|
|
824
|
+
preferSafeBoundarySplit: enableSemanticBoundarySplits,
|
|
825
|
+
softMaxChunkTokens
|
|
826
|
+
});
|
|
827
|
+
for (const piece of pieces) {
|
|
828
|
+
const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
|
|
829
|
+
if (!trimmed) {
|
|
830
|
+
continue;
|
|
831
|
+
}
|
|
832
|
+
chunks.push({
|
|
833
|
+
start_line: trimmed.start + 1,
|
|
834
|
+
end_line: trimmed.end + 1,
|
|
835
|
+
snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
|
|
836
|
+
});
|
|
837
|
+
if (chunks.length >= input.config.max_chunks_per_file) {
|
|
838
|
+
break;
|
|
839
|
+
}
|
|
432
840
|
}
|
|
433
841
|
}
|
|
434
842
|
}
|
|
@@ -439,7 +847,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
439
847
|
tokenize: input.tokenize,
|
|
440
848
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
441
849
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
442
|
-
maxChunks: input.config.max_chunks_per_file
|
|
850
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
851
|
+
lineTokenCounts
|
|
443
852
|
});
|
|
444
853
|
return {
|
|
445
854
|
chunks: slidingChunks,
|
|
@@ -456,7 +865,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
456
865
|
strategy: "language_aware",
|
|
457
866
|
parse_latency_ms: parseLatencyMs,
|
|
458
867
|
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
459
|
-
language: parserLanguageToCanonical(input.parserLanguage)
|
|
868
|
+
language: parserLanguageToCanonical(input.parserLanguage),
|
|
869
|
+
recursive_semantic_chunking_used: recursiveSemanticChunkingUsed
|
|
460
870
|
};
|
|
461
871
|
}
|
|
462
872
|
catch {
|
|
@@ -466,7 +876,8 @@ function buildLanguageAwareChunks(input) {
|
|
|
466
876
|
tokenize: input.tokenize,
|
|
467
877
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
468
878
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
469
|
-
maxChunks: input.config.max_chunks_per_file
|
|
879
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
880
|
+
lineTokenCounts
|
|
470
881
|
});
|
|
471
882
|
return {
|
|
472
883
|
chunks,
|