@rce-mcp/retrieval-core 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +8 -0
- package/dist/.tsbuildinfo +1 -1
- package/dist/chunking.d.ts +13 -0
- package/dist/chunking.js +493 -81
- package/dist/index.d.ts +280 -4
- package/dist/index.js +2960 -235
- package/dist/remote-sync.js +4 -2
- package/package.json +8 -6
- package/scripts/poc-parser-availability-benchmark.ts +2 -0
- package/src/chunking.ts +578 -84
- package/src/index.ts +3818 -401
- package/src/remote-sync.ts +6 -2
- package/test/benchmark.thresholds.test.ts +63 -0
- package/test/chunking.config.test.ts +74 -0
- package/test/chunking.language-aware.test.ts +250 -4
- package/test/chunking.parser-availability.poc.test.ts +3 -3
- package/test/claude-agent-provider.test.ts +209 -0
- package/test/embedding-context-prefix.test.ts +101 -0
- package/test/embedding-provider.test.ts +450 -1
- package/test/enhance-confidence.test.ts +275 -3
- package/test/integration.test.ts +185 -1
- package/test/mcp-search-quality.regression.test.ts +1009 -0
- package/test/remote-sync.integration.test.ts +15 -0
- package/test/smart-cutoff.config.test.ts +86 -0
- package/test/snippet-integrity.config.test.ts +59 -0
package/src/chunking.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import Parser from "tree-sitter";
|
|
2
2
|
import Go from "tree-sitter-go";
|
|
3
|
-
import
|
|
4
|
-
import
|
|
3
|
+
import JavaScriptV023 from "tree-sitter-javascript-v023";
|
|
4
|
+
import PythonV023 from "tree-sitter-python-v023";
|
|
5
5
|
import TypeScript from "tree-sitter-typescript";
|
|
6
6
|
|
|
7
7
|
export type ChunkingStrategy = "language_aware" | "sliding";
|
|
@@ -17,9 +17,15 @@ export interface ChunkingConfig {
|
|
|
17
17
|
fallback_strategy: "sliding";
|
|
18
18
|
target_chunk_tokens: number;
|
|
19
19
|
chunk_overlap_tokens: number;
|
|
20
|
+
budget_tokenizer: "ranking" | "lightweight";
|
|
21
|
+
boundary_strictness: "legacy" | "semantic_js_ts";
|
|
20
22
|
max_chunks_per_file: number;
|
|
21
23
|
parse_timeout_ms: number;
|
|
22
24
|
enabled_languages: string[];
|
|
25
|
+
recursive_semantic_chunking_enabled?: boolean;
|
|
26
|
+
semantic_merge_gap_lines?: number;
|
|
27
|
+
semantic_merge_max_span_lines?: number;
|
|
28
|
+
comment_forward_absorb_enabled?: boolean;
|
|
23
29
|
}
|
|
24
30
|
|
|
25
31
|
export interface ChunkingRawFile {
|
|
@@ -42,6 +48,7 @@ export interface ChunkingResult {
|
|
|
42
48
|
language_aware_attempt_latency_ms?: number;
|
|
43
49
|
fallback_path_latency_ms?: number;
|
|
44
50
|
language?: string;
|
|
51
|
+
recursive_semantic_chunking_used?: boolean;
|
|
45
52
|
}
|
|
46
53
|
|
|
47
54
|
export type ParserLanguage = "typescript" | "tsx" | "javascript" | "jsx" | "python" | "go";
|
|
@@ -60,7 +67,7 @@ export interface ChunkingParserAvailabilitySnapshotEntry {
|
|
|
60
67
|
error?: string;
|
|
61
68
|
}
|
|
62
69
|
|
|
63
|
-
const
|
|
70
|
+
const DEFAULT_BOUNDARY_NODE_TYPES_LEGACY: Record<ParserLanguage, Set<string>> = {
|
|
64
71
|
typescript: new Set([
|
|
65
72
|
"function_declaration",
|
|
66
73
|
"generator_function_declaration",
|
|
@@ -77,11 +84,35 @@ const DEFAULT_BOUNDARY_NODE_TYPES: Record<ParserLanguage, Set<string>> = {
|
|
|
77
84
|
"enum_declaration",
|
|
78
85
|
"type_alias_declaration"
|
|
79
86
|
]),
|
|
80
|
-
javascript: new Set([
|
|
81
|
-
|
|
87
|
+
javascript: new Set([
|
|
88
|
+
"function_declaration",
|
|
89
|
+
"generator_function_declaration",
|
|
90
|
+
"class_declaration",
|
|
91
|
+
"function_expression",
|
|
92
|
+
"arrow_function"
|
|
93
|
+
]),
|
|
94
|
+
jsx: new Set([
|
|
95
|
+
"function_declaration",
|
|
96
|
+
"generator_function_declaration",
|
|
97
|
+
"class_declaration",
|
|
98
|
+
"function_expression",
|
|
99
|
+
"arrow_function"
|
|
100
|
+
]),
|
|
82
101
|
python: new Set(["function_definition", "class_definition"]),
|
|
83
102
|
go: new Set(["function_declaration", "method_declaration", "type_declaration"])
|
|
84
103
|
};
|
|
104
|
+
/**
 * Boundary node types used when `boundary_strictness` is "semantic_js_ts".
 *
 * Starts from the legacy table and widens only the four JS/TS grammars:
 * TypeScript/TSX gain function expressions, arrow functions and method
 * definitions; JavaScript/JSX gain method definitions only.
 */
const DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS: Record<ParserLanguage, Set<string>> = {
  // Languages not overridden below (python, go) reuse the legacy Set instances as-is.
  ...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY,
  typescript: new Set([...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.typescript, "function_expression", "arrow_function", "method_definition"]),
  tsx: new Set([
    ...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.tsx,
    "function_expression",
    "arrow_function",
    "method_definition"
  ]),
  javascript: new Set([
    ...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.javascript,
    "method_definition"
  ]),
  jsx: new Set([
    ...DEFAULT_BOUNDARY_NODE_TYPES_LEGACY.jsx,
    "method_definition"
  ])
};
|
|
85
116
|
|
|
86
117
|
const parserAvailabilityCache = new Map<ParserLanguage, ParserAvailability>();
|
|
87
118
|
const parserInitAttempts = new Map<ParserLanguage, number>();
|
|
@@ -92,6 +123,15 @@ const CANONICAL_TO_PARSER_LANGUAGE: Record<string, ParserLanguage> = {
|
|
|
92
123
|
python: "python",
|
|
93
124
|
go: "go"
|
|
94
125
|
};
|
|
126
|
+
/**
 * Parent node types under which a `function_expression` or `arrow_function`
 * is accepted as a chunk boundary candidate. Function values with any other
 * parent are rejected by the boundary-candidate checks in this file.
 */
const JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES = new Set<string>([
  "assignment_expression",
  "variable_declarator",
  "pair",
  "export_statement",
  "public_field_definition",
  "property_definition"
]);

/**
 * Factor applied to the target chunk token count (and floored) to obtain the
 * soft upper token limit used when splitting and merging windows.
 */
const SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER = 1.35;
|
|
95
135
|
|
|
96
136
|
function parserLanguageToCanonical(language: ParserLanguage): string {
|
|
97
137
|
if (language === "tsx") {
|
|
@@ -131,7 +171,7 @@ function parserLanguageFromPath(path: string): ParserLanguage | undefined {
|
|
|
131
171
|
if (normalized.endsWith(".tsx")) {
|
|
132
172
|
return "tsx";
|
|
133
173
|
}
|
|
134
|
-
if (normalized.endsWith(".ts")) {
|
|
174
|
+
if (normalized.endsWith(".ts") || normalized.endsWith(".mts") || normalized.endsWith(".cts")) {
|
|
135
175
|
return "typescript";
|
|
136
176
|
}
|
|
137
177
|
if (normalized.endsWith(".jsx")) {
|
|
@@ -183,10 +223,11 @@ function loadParserLanguage(language: ParserLanguage): Parser.Language {
|
|
|
183
223
|
return (TypeScript as unknown as { tsx: unknown }).tsx as Parser.Language;
|
|
184
224
|
}
|
|
185
225
|
if (language === "javascript" || language === "jsx") {
|
|
186
|
-
|
|
226
|
+
// Bun is currently most reliable with tree-sitter 0.23-compatible JS/Python grammars.
|
|
227
|
+
return resolveTreeSitterLanguageHandle(JavaScriptV023);
|
|
187
228
|
}
|
|
188
229
|
if (language === "python") {
|
|
189
|
-
return resolveTreeSitterLanguageHandle(
|
|
230
|
+
return resolveTreeSitterLanguageHandle(PythonV023);
|
|
190
231
|
}
|
|
191
232
|
return resolveTreeSitterLanguageHandle(Go);
|
|
192
233
|
}
|
|
@@ -317,28 +358,66 @@ function trimLineRange(lines: string[], startRow: number, endRow: number): { sta
|
|
|
317
358
|
|
|
318
359
|
function splitRangeWithBudget(input: {
|
|
319
360
|
lines: string[];
|
|
361
|
+
lineTokenCounts: number[];
|
|
320
362
|
startRow: number;
|
|
321
363
|
endRow: number;
|
|
322
|
-
tokenize: (text: string) => string[];
|
|
323
364
|
targetChunkTokens: number;
|
|
324
365
|
overlapTokens: number;
|
|
325
366
|
maxChunks: number;
|
|
367
|
+
preferSafeBoundarySplit?: boolean;
|
|
368
|
+
softMaxChunkTokens?: number;
|
|
326
369
|
}): Array<{ startRow: number; endRow: number }> {
|
|
370
|
+
const rangeTokenCount = (startRow: number, endRow: number): number => {
|
|
371
|
+
let total = 0;
|
|
372
|
+
for (let row = startRow; row <= endRow; row += 1) {
|
|
373
|
+
total += input.lineTokenCounts[row] ?? 0;
|
|
374
|
+
}
|
|
375
|
+
return total;
|
|
376
|
+
};
|
|
377
|
+
const isSafeSplitBoundaryLine = (line: string): boolean => {
|
|
378
|
+
const trimmed = line.trim();
|
|
379
|
+
if (trimmed.length === 0) {
|
|
380
|
+
return true;
|
|
381
|
+
}
|
|
382
|
+
return trimmed.endsWith(";") || trimmed.endsWith("}") || trimmed.endsWith("{");
|
|
383
|
+
};
|
|
327
384
|
const segments: Array<{ startRow: number; endRow: number }> = [];
|
|
328
385
|
let start = input.startRow;
|
|
329
386
|
|
|
330
387
|
while (start <= input.endRow && segments.length < input.maxChunks) {
|
|
331
388
|
let tokens = 0;
|
|
332
|
-
let end = start;
|
|
333
|
-
while (end
|
|
334
|
-
|
|
335
|
-
|
|
389
|
+
let end = start - 1;
|
|
390
|
+
while (end < input.endRow) {
|
|
391
|
+
const nextEnd = end + 1;
|
|
392
|
+
tokens += input.lineTokenCounts[nextEnd] ?? 0;
|
|
393
|
+
end = nextEnd;
|
|
394
|
+
if (tokens >= input.targetChunkTokens && end >= start) {
|
|
336
395
|
break;
|
|
337
396
|
}
|
|
338
|
-
end += 1;
|
|
339
397
|
}
|
|
340
398
|
|
|
341
|
-
|
|
399
|
+
let safeEnd = Math.min(Math.max(start, end), input.endRow);
|
|
400
|
+
if (input.preferSafeBoundarySplit && safeEnd > start) {
|
|
401
|
+
let adjusted = safeEnd;
|
|
402
|
+
for (let row = safeEnd; row > start; row -= 1) {
|
|
403
|
+
if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
|
|
404
|
+
adjusted = row;
|
|
405
|
+
break;
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
if (adjusted === safeEnd && typeof input.softMaxChunkTokens === "number" && input.softMaxChunkTokens > input.targetChunkTokens) {
|
|
409
|
+
for (let row = safeEnd + 1; row <= input.endRow; row += 1) {
|
|
410
|
+
if (rangeTokenCount(start, row) > input.softMaxChunkTokens) {
|
|
411
|
+
break;
|
|
412
|
+
}
|
|
413
|
+
if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
|
|
414
|
+
adjusted = row;
|
|
415
|
+
break;
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
safeEnd = Math.max(start, adjusted);
|
|
420
|
+
}
|
|
342
421
|
if (safeEnd >= start) {
|
|
343
422
|
segments.push({ startRow: start, endRow: safeEnd });
|
|
344
423
|
}
|
|
@@ -346,8 +425,18 @@ function splitRangeWithBudget(input: {
|
|
|
346
425
|
if (safeEnd >= input.endRow) {
|
|
347
426
|
break;
|
|
348
427
|
}
|
|
349
|
-
|
|
350
|
-
|
|
428
|
+
|
|
429
|
+
let nextStart = safeEnd + 1;
|
|
430
|
+
if (input.overlapTokens > 0) {
|
|
431
|
+
let overlap = 0;
|
|
432
|
+
let cursor = safeEnd;
|
|
433
|
+
while (cursor >= start && overlap < input.overlapTokens) {
|
|
434
|
+
overlap += input.lineTokenCounts[cursor] ?? 0;
|
|
435
|
+
cursor -= 1;
|
|
436
|
+
}
|
|
437
|
+
nextStart = Math.max(start + 1, cursor + 1);
|
|
438
|
+
}
|
|
439
|
+
start = Math.max(start + 1, nextStart);
|
|
351
440
|
}
|
|
352
441
|
|
|
353
442
|
return segments;
|
|
@@ -359,12 +448,14 @@ function buildSlidingChunks(input: {
|
|
|
359
448
|
targetChunkTokens: number;
|
|
360
449
|
overlapTokens: number;
|
|
361
450
|
maxChunks: number;
|
|
451
|
+
lineTokenCounts?: number[];
|
|
362
452
|
}): ChunkingOutput[] {
|
|
453
|
+
const lineTokenCounts = input.lineTokenCounts ?? computeLineTokenCounts(input.lines, input.tokenize);
|
|
363
454
|
const rawSegments = splitRangeWithBudget({
|
|
364
455
|
lines: input.lines,
|
|
456
|
+
lineTokenCounts,
|
|
365
457
|
startRow: 0,
|
|
366
458
|
endRow: Math.max(0, input.lines.length - 1),
|
|
367
|
-
tokenize: input.tokenize,
|
|
368
459
|
targetChunkTokens: input.targetChunkTokens,
|
|
369
460
|
overlapTokens: input.overlapTokens,
|
|
370
461
|
maxChunks: input.maxChunks
|
|
@@ -395,6 +486,346 @@ function hasBoundaryAncestor(node: Parser.SyntaxNode, boundaryTypes: Set<string>
|
|
|
395
486
|
return false;
|
|
396
487
|
}
|
|
397
488
|
|
|
489
|
+
/**
 * Looks up the set of boundary node types for a parser language.
 *
 * @param parserLanguage - Grammar whose boundary node types are requested.
 * @param boundaryStrictness - "semantic_js_ts" selects the widened JS/TS table;
 *   "legacy" selects the legacy table.
 * @returns The Set stored in the selected table (the table's own instance, not a copy).
 */
function getBoundaryTypes(
  parserLanguage: ParserLanguage,
  boundaryStrictness: "legacy" | "semantic_js_ts"
): Set<string> {
  const table =
    boundaryStrictness === "semantic_js_ts"
      ? DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS
      : DEFAULT_BOUNDARY_NODE_TYPES_LEGACY;
  return table[parserLanguage];
}
|
|
498
|
+
|
|
499
|
+
/**
 * Tells whether function-valued expressions need the extra parent-type check
 * for the given language.
 *
 * JavaScript and JSX always qualify; TypeScript and TSX qualify only when the
 * strictness is "semantic_js_ts". Every other language returns false.
 */
function isExpressionBoundaryLanguage(parserLanguage: ParserLanguage, boundaryStrictness: "legacy" | "semantic_js_ts"): boolean {
  switch (parserLanguage) {
    case "javascript":
    case "jsx":
      return true;
    case "typescript":
    case "tsx":
      return boundaryStrictness === "semantic_js_ts";
    default:
      return false;
  }
}
|
|
510
|
+
|
|
511
|
+
/**
 * Decides whether a syntax node that matched a boundary type should actually
 * be used as a chunk boundary.
 *
 * Only `function_expression` / `arrow_function` nodes in an
 * expression-boundary language are filtered: they are kept when their parent
 * type is listed in JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES and rejected
 * otherwise (including when the node has no parent). All other nodes pass.
 */
function isLanguageBoundaryCandidate(
  parserLanguage: ParserLanguage,
  node: Parser.SyntaxNode,
  boundaryStrictness: "legacy" | "semantic_js_ts"
): boolean {
  const needsParentCheck = isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness);
  const isFunctionValue = node.type === "function_expression" || node.type === "arrow_function";
  if (!needsParentCheck || !isFunctionValue) {
    return true;
  }
  const parentType = node.parent?.type;
  return parentType ? JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(parentType) : false;
}
|
|
528
|
+
|
|
529
|
+
/**
 * Test hook that applies the boundary-candidate rule to plain node/parent type
 * names instead of a real syntax node.
 *
 * @param input.parserLanguage - Grammar the node belongs to.
 * @param input.nodeType - Type name of the node under test.
 * @param input.parentType - Type name of the node's parent, if any.
 * @param input.boundaryStrictness - Defaults to "legacy" when omitted.
 * @returns true unless the node is a `function_expression` / `arrow_function`
 *   in an expression-boundary language whose parent type is missing or not in
 *   JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.
 */
export function __isChunkingBoundaryCandidateForTests(input: {
  parserLanguage: ParserLanguage;
  nodeType: string;
  parentType?: string;
  boundaryStrictness?: "legacy" | "semantic_js_ts";
}): boolean {
  const { parserLanguage, nodeType, parentType } = input;
  const needsParentCheck = isExpressionBoundaryLanguage(parserLanguage, input.boundaryStrictness ?? "legacy");
  const isFunctionValue = nodeType === "function_expression" || nodeType === "arrow_function";
  if (!needsParentCheck || !isFunctionValue) {
    return true;
  }
  return parentType ? JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(parentType) : false;
}
|
|
547
|
+
|
|
548
|
+
/**
 * Tokenizes every line once and returns the per-line token counts, index-aligned
 * with `lines`.
 *
 * @param lines - Source lines to measure.
 * @param tokenize - Tokenizer whose result length is used as the count.
 */
function computeLineTokenCounts(lines: string[], tokenize: (text: string) => string[]): number[] {
  const countTokens = (line: string): number => {
    const text = line ?? "";
    return tokenize(text).length;
  };
  return lines.map((line) => countTokens(line));
}
|
|
551
|
+
|
|
552
|
+
/**
 * Sums the token counts of rows `startRow..endRow` (both inclusive).
 *
 * Rows without an entry in `lineTokenCounts` contribute 0, and an empty range
 * (`startRow > endRow`) yields 0.
 */
function rangeTokenCount(lineTokenCounts: number[], startRow: number, endRow: number): number {
  let sum = 0;
  let row = startRow;
  while (row <= endRow) {
    const count = lineTokenCounts[row];
    sum += count ?? 0;
    row += 1;
  }
  return sum;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Collects a node's named children in index order, dropping any index for
 * which `namedChild` returns a falsy value.
 */
function listNamedChildren(node: Parser.SyntaxNode): Parser.SyntaxNode[] {
  const slots = Array.from({ length: node.namedChildCount }, (_unused, index) => node.namedChild(index));
  return slots.filter((child): child is Parser.SyntaxNode => Boolean(child));
}
|
|
570
|
+
|
|
571
|
+
/**
 * Converts a syntax node into a row window clamped to `[0, lastRow]` and then
 * narrowed by `trimLineRange`.
 *
 * @returns The trimmed window, or undefined when `trimLineRange` yields nothing.
 */
function normalizeNodeWindow(input: {
  node: Parser.SyntaxNode;
  lines: string[];
  lastRow: number;
}): { startRow: number; endRow: number } | undefined {
  const { node, lines, lastRow } = input;
  const firstRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
  // The end row is never allowed to fall before the start row.
  const finalRow = Math.max(firstRow, Math.min(lastRow, toInclusiveEndRow(node)));
  const trimmed = trimLineRange(lines, firstRow, finalRow);
  return trimmed ? { startRow: trimmed.start, endRow: trimmed.end } : undefined;
}
|
|
587
|
+
|
|
588
|
+
/**
 * Walks the syntax tree top-down and produces row windows that respect the
 * token budget.
 *
 * A node whose window fits within `targetChunkTokens` is emitted whole.
 * Otherwise its named children are visited in row order, and the rows between
 * or around them are split with `splitRangeWithBudget`. A node with no usable
 * children is split directly. Identical windows are emitted once, and the walk
 * stops as soon as `maxChunks` windows exist.
 *
 * @returns The collected windows sorted by start row, then end row.
 */
function buildRecursiveSemanticWindows(input: {
  root: Parser.SyntaxNode;
  lines: string[];
  lineTokenCounts: number[];
  targetChunkTokens: number;
  maxChunks: number;
  boundaryStrictness: "legacy" | "semantic_js_ts";
}): Array<{ startRow: number; endRow: number }> {
  type RowWindow = { startRow: number; endRow: number };

  const { lines, lineTokenCounts, targetChunkTokens, maxChunks } = input;
  const finalRow = Math.max(0, lines.length - 1);
  const softMaxChunkTokens = Math.floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  const collected: RowWindow[] = [];
  const emittedKeys = new Set<string>();

  const isFull = (): boolean => collected.length >= maxChunks;

  // Stores a window unless one with the same start/end rows was stored before.
  const record = (startRow: number, endRow: number): void => {
    const key = `${startRow}:${endRow}`;
    if (emittedKeys.has(key)) {
      return;
    }
    emittedKeys.add(key);
    collected.push({ startRow, endRow });
  };

  // Splits a raw row range by token budget and stores each trimmed piece.
  const splitAndRecord = (startRow: number, endRow: number): void => {
    if (startRow > endRow || isFull()) {
      return;
    }
    const pieces = splitRangeWithBudget({
      lines,
      lineTokenCounts,
      startRow,
      endRow,
      targetChunkTokens,
      overlapTokens: 0,
      maxChunks: maxChunks - collected.length,
      preferSafeBoundarySplit: input.boundaryStrictness === "semantic_js_ts",
      softMaxChunkTokens
    });
    for (const piece of pieces) {
      const trimmed = trimLineRange(lines, piece.startRow, piece.endRow);
      if (!trimmed) {
        continue;
      }
      record(trimmed.start, trimmed.end);
      if (isFull()) {
        return;
      }
    }
  };

  const visit = (node: Parser.SyntaxNode): void => {
    if (isFull()) {
      return;
    }
    const span = normalizeNodeWindow({ node, lines, lastRow: finalRow });
    if (!span) {
      return;
    }
    if (rangeTokenCount(lineTokenCounts, span.startRow, span.endRow) <= targetChunkTokens) {
      record(span.startRow, span.endRow);
      return;
    }

    // Over budget: descend into the named children that have a usable window.
    const children: Array<{ node: Parser.SyntaxNode; range: RowWindow }> = [];
    for (const child of listNamedChildren(node)) {
      const range = normalizeNodeWindow({ node: child, lines, lastRow: finalRow });
      if (range) {
        children.push({ node: child, range });
      }
    }
    children.sort((a, b) => a.range.startRow - b.range.startRow || a.range.endRow - b.range.endRow);

    if (children.length === 0) {
      splitAndRecord(span.startRow, span.endRow);
      return;
    }

    // First row of this node not yet handed to a child or a gap split.
    let nextUncoveredRow = span.startRow;
    for (const child of children) {
      if (isFull()) {
        return;
      }
      if (child.range.endRow < nextUncoveredRow) {
        continue;
      }
      if (child.range.startRow > nextUncoveredRow) {
        splitAndRecord(nextUncoveredRow, child.range.startRow - 1);
      }

      visit(child.node);
      nextUncoveredRow = Math.max(nextUncoveredRow, child.range.endRow + 1);
      if (nextUncoveredRow > span.endRow) {
        return;
      }
    }
    if (nextUncoveredRow <= span.endRow) {
      splitAndRecord(nextUncoveredRow, span.endRow);
    }
  };

  visit(input.root);
  return collected.sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
}
|
|
700
|
+
|
|
701
|
+
/**
 * Greedily merges neighbouring windows, in row order, into larger ones.
 *
 * A window is folded into the previously kept window when all three hold:
 * the gap between them is at most `semanticMergeGapLines`, the union spans at
 * most `semanticMergeMaxSpanLines` rows, and the union's token count stays
 * within `floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER)`.
 *
 * @returns A new array. With zero or one input windows it is a shallow copy of
 *   the input; otherwise it holds fresh window objects.
 */
function mergeSemanticWindows(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lineTokenCounts: number[];
  targetChunkTokens: number;
  semanticMergeGapLines: number;
  semanticMergeMaxSpanLines: number;
}): Array<{ startRow: number; endRow: number }> {
  const { windows, lineTokenCounts, semanticMergeGapLines, semanticMergeMaxSpanLines } = input;
  if (windows.length <= 1) {
    return Array.from(windows);
  }

  const tokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  const sorted = Array.from(windows).sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
  const result: Array<{ startRow: number; endRow: number }> = [];

  for (const candidate of sorted) {
    const tail = result[result.length - 1];
    if (tail) {
      const unionStartRow = Math.min(tail.startRow, candidate.startRow);
      const unionEndRow = Math.max(tail.endRow, candidate.endRow);
      const gapLines = Math.max(0, candidate.startRow - tail.endRow - 1);
      const fits =
        gapLines <= semanticMergeGapLines &&
        unionEndRow - unionStartRow + 1 <= semanticMergeMaxSpanLines &&
        rangeTokenCount(lineTokenCounts, unionStartRow, unionEndRow) <= tokenBudget;
      if (fits) {
        // Grow the kept window in place instead of appending a new one.
        tail.startRow = unionStartRow;
        tail.endRow = unionEndRow;
        continue;
      }
    }
    result.push({ ...candidate });
  }
  return result;
}
|
|
738
|
+
|
|
739
|
+
/**
 * Prefix-based heuristic for "this line holds no code".
 *
 * Returns true for blank lines and for lines whose trimmed text starts with
 * `//`, `/*`, `*` (which also covers `*​/`) or `#`. The check looks at the
 * prefix only, so any line beginning with one of these markers matches.
 */
function isCommentOnlyLine(line: string): boolean {
  const text = line.trim();
  if (text === "") {
    return true;
  }
  const firstChar = text.charAt(0);
  if (firstChar === "*" || firstChar === "#") {
    return true;
  }
  return text.startsWith("//") || text.startsWith("/*");
}
|
|
752
|
+
|
|
753
|
+
/**
 * Reports whether every row in `startRow..endRow` (inclusive) passes
 * `isCommentOnlyLine`. Rows missing from `lines` are treated as empty strings.
 */
function windowLooksCommentOnly(input: { lines: string[]; startRow: number; endRow: number }): boolean {
  let row = input.startRow;
  while (row <= input.endRow) {
    const text = input.lines[row] ?? "";
    if (!isCommentOnlyLine(text)) {
      return false;
    }
    row += 1;
  }
  return true;
}
|
|
761
|
+
|
|
762
|
+
/**
 * Attaches comment-only windows to the window that follows them.
 *
 * For each window that `windowLooksCommentOnly` accepts, the next window is
 * absorbed when the gap between them is at most one line, the combined span is
 * at most `semanticMergeMaxSpanLines` rows, and the combined token count stays
 * within `floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER)`.
 * An absorbed pair becomes a single output window and the consumed window is
 * not examined again.
 *
 * The combined window is the true union of both windows (min start, max end),
 * matching `mergeSemanticWindows`. This keeps the result correct even when the
 * next window is nested inside or overlaps the comment window, where using
 * `next.endRow` alone would cut rows off the comment window.
 *
 * @returns A new array. With zero or one input windows it is a shallow copy of
 *   the input; otherwise it holds fresh window objects.
 */
function absorbForwardCommentWindows(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lines: string[];
  lineTokenCounts: number[];
  targetChunkTokens: number;
  semanticMergeMaxSpanLines: number;
}): Array<{ startRow: number; endRow: number }> {
  if (input.windows.length <= 1) {
    return [...input.windows];
  }
  const output: Array<{ startRow: number; endRow: number }> = [];
  const mergeTokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  for (let index = 0; index < input.windows.length; index += 1) {
    const current = input.windows[index];
    if (!current) {
      continue;
    }
    const next = input.windows[index + 1];
    const currentIsCommentOnly =
      next !== undefined &&
      windowLooksCommentOnly({ lines: input.lines, startRow: current.startRow, endRow: current.endRow });
    if (!next || !currentIsCommentOnly) {
      output.push({ ...current });
      continue;
    }

    // Measure the real union so nested/overlapping windows are neither truncated nor under-counted.
    const unionStartRow = Math.min(current.startRow, next.startRow);
    const unionEndRow = Math.max(current.endRow, next.endRow);
    const gapLines = Math.max(0, next.startRow - current.endRow - 1);
    const unionSpanLines = unionEndRow - unionStartRow + 1;
    const canAbsorb =
      gapLines <= 1 &&
      unionSpanLines <= input.semanticMergeMaxSpanLines &&
      rangeTokenCount(input.lineTokenCounts, unionStartRow, unionEndRow) <= mergeTokenBudget;
    if (!canAbsorb) {
      output.push({ ...current });
      continue;
    }
    output.push({ startRow: unionStartRow, endRow: unionEndRow });
    // `next` has been consumed by the merge; skip it.
    index += 1;
  }
  return output;
}
|
|
805
|
+
|
|
806
|
+
/**
 * Turns row windows into chunk records, stopping once `maxChunks` chunks exist.
 *
 * Each window is narrowed with `trimLineRange`; windows it rejects are skipped.
 * `start_line` / `end_line` are the trimmed rows plus one, and `snippet` is the
 * covered lines joined with "\n".
 */
function windowsToChunks(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lines: string[];
  maxChunks: number;
}): ChunkingOutput[] {
  const { windows, lines, maxChunks } = input;
  const result: ChunkingOutput[] = [];
  for (let index = 0; index < windows.length && result.length < maxChunks; index += 1) {
    const { startRow, endRow } = windows[index];
    const trimmed = trimLineRange(lines, startRow, endRow);
    if (trimmed) {
      const snippetLines = lines.slice(trimmed.start, trimmed.end + 1);
      result.push({
        start_line: trimmed.start + 1,
        end_line: trimmed.end + 1,
        snippet: snippetLines.join("\n")
      });
    }
  }
  return result;
}
|
|
828
|
+
|
|
398
829
|
function buildLanguageAwareChunks(input: {
|
|
399
830
|
file: ChunkingRawFile;
|
|
400
831
|
lines: string[];
|
|
@@ -403,6 +834,7 @@ function buildLanguageAwareChunks(input: {
|
|
|
403
834
|
tokenize: (text: string) => string[];
|
|
404
835
|
}): ChunkingResult {
|
|
405
836
|
const languageAwareAttemptStart = Date.now();
|
|
837
|
+
const lineTokenCounts = computeLineTokenCounts(input.lines, input.tokenize);
|
|
406
838
|
const parser = getParser(input.parserLanguage);
|
|
407
839
|
if (!parser) {
|
|
408
840
|
const fallbackStart = Date.now();
|
|
@@ -411,7 +843,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
411
843
|
tokenize: input.tokenize,
|
|
412
844
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
413
845
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
414
|
-
maxChunks: input.config.max_chunks_per_file
|
|
846
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
847
|
+
lineTokenCounts
|
|
415
848
|
});
|
|
416
849
|
return {
|
|
417
850
|
chunks,
|
|
@@ -435,7 +868,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
435
868
|
tokenize: input.tokenize,
|
|
436
869
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
437
870
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
438
|
-
maxChunks: input.config.max_chunks_per_file
|
|
871
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
872
|
+
lineTokenCounts
|
|
439
873
|
});
|
|
440
874
|
return {
|
|
441
875
|
chunks,
|
|
@@ -456,7 +890,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
456
890
|
tokenize: input.tokenize,
|
|
457
891
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
458
892
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
459
|
-
maxChunks: input.config.max_chunks_per_file
|
|
893
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
894
|
+
lineTokenCounts
|
|
460
895
|
});
|
|
461
896
|
return {
|
|
462
897
|
chunks,
|
|
@@ -469,77 +904,133 @@ function buildLanguageAwareChunks(input: {
|
|
|
469
904
|
};
|
|
470
905
|
}
|
|
471
906
|
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
const boundaryNodes = candidates
|
|
475
|
-
.filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
|
|
476
|
-
.sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
|
|
907
|
+
let chunks: ChunkingOutput[] = [];
|
|
908
|
+
let recursiveSemanticChunkingUsed = false;
|
|
477
909
|
|
|
478
|
-
if (
|
|
479
|
-
const
|
|
480
|
-
const
|
|
910
|
+
if (input.config.recursive_semantic_chunking_enabled) {
|
|
911
|
+
const semanticMergeGapLines = input.config.semantic_merge_gap_lines ?? 6;
|
|
912
|
+
const semanticMergeMaxSpanLines = input.config.semantic_merge_max_span_lines ?? 220;
|
|
913
|
+
const recursiveWindows = buildRecursiveSemanticWindows({
|
|
914
|
+
root,
|
|
481
915
|
lines: input.lines,
|
|
482
|
-
|
|
916
|
+
lineTokenCounts,
|
|
483
917
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
484
|
-
|
|
918
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
919
|
+
boundaryStrictness: input.config.boundary_strictness
|
|
920
|
+
});
|
|
921
|
+
const mergedWindows = mergeSemanticWindows({
|
|
922
|
+
windows: recursiveWindows,
|
|
923
|
+
lineTokenCounts,
|
|
924
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
925
|
+
semanticMergeGapLines,
|
|
926
|
+
semanticMergeMaxSpanLines
|
|
927
|
+
});
|
|
928
|
+
const absorbedWindows =
|
|
929
|
+
input.config.comment_forward_absorb_enabled === false
|
|
930
|
+
? mergedWindows
|
|
931
|
+
: absorbForwardCommentWindows({
|
|
932
|
+
windows: mergedWindows,
|
|
933
|
+
lines: input.lines,
|
|
934
|
+
lineTokenCounts,
|
|
935
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
936
|
+
semanticMergeMaxSpanLines
|
|
937
|
+
});
|
|
938
|
+
chunks = windowsToChunks({
|
|
939
|
+
windows: absorbedWindows,
|
|
940
|
+
lines: input.lines,
|
|
485
941
|
maxChunks: input.config.max_chunks_per_file
|
|
486
942
|
});
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
};
|
|
496
|
-
}
|
|
943
|
+
recursiveSemanticChunkingUsed = chunks.length > 0;
|
|
944
|
+
} else {
|
|
945
|
+
const boundaryTypes = getBoundaryTypes(input.parserLanguage, input.config.boundary_strictness);
|
|
946
|
+
const candidates = root.descendantsOfType([...boundaryTypes]);
|
|
947
|
+
const boundaryNodes = candidates
|
|
948
|
+
.filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
|
|
949
|
+
.filter((node) => isLanguageBoundaryCandidate(input.parserLanguage, node, input.config.boundary_strictness))
|
|
950
|
+
.sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
|
|
497
951
|
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
952
|
+
if (boundaryNodes.length === 0) {
|
|
953
|
+
const fallbackStart = Date.now();
|
|
954
|
+
const fallbackChunks = buildSlidingChunks({
|
|
955
|
+
lines: input.lines,
|
|
956
|
+
tokenize: input.tokenize,
|
|
957
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
958
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
959
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
960
|
+
lineTokenCounts
|
|
961
|
+
});
|
|
962
|
+
return {
|
|
963
|
+
chunks: fallbackChunks,
|
|
964
|
+
strategy: "sliding",
|
|
965
|
+
fallback_reason: "empty_language_boundaries",
|
|
966
|
+
parse_latency_ms: parseLatencyMs,
|
|
967
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
968
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
969
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
970
|
+
};
|
|
511
971
|
}
|
|
512
|
-
}
|
|
513
|
-
if (cursor <= lastRow) {
|
|
514
|
-
segments.push({ startRow: cursor, endRow: lastRow });
|
|
515
|
-
}
|
|
516
972
|
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
973
|
+
const segments: Array<{ startRow: number; endRow: number; boundary: boolean }> = [];
|
|
974
|
+
let cursor = 0;
|
|
975
|
+
const lastRow = Math.max(0, input.lines.length - 1);
|
|
976
|
+
for (const node of boundaryNodes) {
|
|
977
|
+
const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
|
|
978
|
+
const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
|
|
979
|
+
if (startRow > cursor) {
|
|
980
|
+
segments.push({ startRow: cursor, endRow: startRow - 1, boundary: false });
|
|
981
|
+
}
|
|
982
|
+
segments.push({ startRow, endRow, boundary: true });
|
|
983
|
+
cursor = endRow + 1;
|
|
984
|
+
if (cursor > lastRow) {
|
|
985
|
+
break;
|
|
986
|
+
}
|
|
521
987
|
}
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
overlapTokens: input.config.chunk_overlap_tokens,
|
|
529
|
-
maxChunks: input.config.max_chunks_per_file - chunks.length
|
|
530
|
-
});
|
|
531
|
-
for (const piece of pieces) {
|
|
532
|
-
const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
|
|
533
|
-
if (!trimmed) {
|
|
988
|
+
if (cursor <= lastRow) {
|
|
989
|
+
segments.push({ startRow: cursor, endRow: lastRow, boundary: false });
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
for (const segment of segments) {
|
|
993
|
+
if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
|
|
534
994
|
continue;
|
|
535
995
|
}
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
996
|
+
const segmentTokenCount = lineTokenCounts
|
|
997
|
+
.slice(segment.startRow, segment.endRow + 1)
|
|
998
|
+
.reduce((sum, value) => sum + value, 0);
|
|
999
|
+
const enableSemanticBoundarySplits =
|
|
1000
|
+
input.config.boundary_strictness === "semantic_js_ts" &&
|
|
1001
|
+
(input.parserLanguage === "javascript" ||
|
|
1002
|
+
input.parserLanguage === "jsx" ||
|
|
1003
|
+
input.parserLanguage === "typescript" ||
|
|
1004
|
+
input.parserLanguage === "tsx") &&
|
|
1005
|
+
segment.boundary;
|
|
1006
|
+
const softMaxChunkTokens = Math.floor(input.config.target_chunk_tokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
|
|
1007
|
+
const pieces =
|
|
1008
|
+
enableSemanticBoundarySplits && segmentTokenCount <= softMaxChunkTokens
|
|
1009
|
+
? [{ startRow: segment.startRow, endRow: segment.endRow }]
|
|
1010
|
+
: splitRangeWithBudget({
|
|
1011
|
+
lines: input.lines,
|
|
1012
|
+
lineTokenCounts,
|
|
1013
|
+
startRow: segment.startRow,
|
|
1014
|
+
endRow: segment.endRow,
|
|
1015
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
1016
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
1017
|
+
maxChunks: input.config.max_chunks_per_file - chunks.length,
|
|
1018
|
+
preferSafeBoundarySplit: enableSemanticBoundarySplits,
|
|
1019
|
+
softMaxChunkTokens
|
|
1020
|
+
});
|
|
1021
|
+
for (const piece of pieces) {
|
|
1022
|
+
const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
|
|
1023
|
+
if (!trimmed) {
|
|
1024
|
+
continue;
|
|
1025
|
+
}
|
|
1026
|
+
chunks.push({
|
|
1027
|
+
start_line: trimmed.start + 1,
|
|
1028
|
+
end_line: trimmed.end + 1,
|
|
1029
|
+
snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
|
|
1030
|
+
});
|
|
1031
|
+
if (chunks.length >= input.config.max_chunks_per_file) {
|
|
1032
|
+
break;
|
|
1033
|
+
}
|
|
543
1034
|
}
|
|
544
1035
|
}
|
|
545
1036
|
}
|
|
@@ -551,7 +1042,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
551
1042
|
tokenize: input.tokenize,
|
|
552
1043
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
553
1044
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
554
|
-
maxChunks: input.config.max_chunks_per_file
|
|
1045
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
1046
|
+
lineTokenCounts
|
|
555
1047
|
});
|
|
556
1048
|
return {
|
|
557
1049
|
chunks: slidingChunks,
|
|
@@ -569,7 +1061,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
569
1061
|
strategy: "language_aware",
|
|
570
1062
|
parse_latency_ms: parseLatencyMs,
|
|
571
1063
|
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
572
|
-
language: parserLanguageToCanonical(input.parserLanguage)
|
|
1064
|
+
language: parserLanguageToCanonical(input.parserLanguage),
|
|
1065
|
+
recursive_semantic_chunking_used: recursiveSemanticChunkingUsed
|
|
573
1066
|
};
|
|
574
1067
|
} catch {
|
|
575
1068
|
const fallbackStart = Date.now();
|
|
@@ -578,7 +1071,8 @@ function buildLanguageAwareChunks(input: {
|
|
|
578
1071
|
tokenize: input.tokenize,
|
|
579
1072
|
targetChunkTokens: input.config.target_chunk_tokens,
|
|
580
1073
|
overlapTokens: input.config.chunk_overlap_tokens,
|
|
581
|
-
maxChunks: input.config.max_chunks_per_file
|
|
1074
|
+
maxChunks: input.config.max_chunks_per_file,
|
|
1075
|
+
lineTokenCounts
|
|
582
1076
|
});
|
|
583
1077
|
return {
|
|
584
1078
|
chunks,
|