@rce-mcp/retrieval-core 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/chunking.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import Parser from "tree-sitter";
2
2
  import Go from "tree-sitter-go";
3
- import JavaScript from "tree-sitter-javascript";
4
- import Python from "tree-sitter-python";
3
+ import JavaScriptV023 from "tree-sitter-javascript-v023";
4
+ import PythonV023 from "tree-sitter-python-v023";
5
5
  import TypeScript from "tree-sitter-typescript";
6
6
 
7
7
  export type ChunkingStrategy = "language_aware" | "sliding";
@@ -17,9 +17,15 @@ export interface ChunkingConfig {
17
17
  fallback_strategy: "sliding";
18
18
  target_chunk_tokens: number;
19
19
  chunk_overlap_tokens: number;
20
+ budget_tokenizer: "ranking" | "lightweight";
21
+ boundary_strictness: "legacy" | "semantic_js_ts";
20
22
  max_chunks_per_file: number;
21
23
  parse_timeout_ms: number;
22
24
  enabled_languages: string[];
25
+ recursive_semantic_chunking_enabled?: boolean;
26
+ semantic_merge_gap_lines?: number;
27
+ semantic_merge_max_span_lines?: number;
28
+ comment_forward_absorb_enabled?: boolean;
23
29
  }
24
30
 
25
31
  export interface ChunkingRawFile {
@@ -42,6 +48,7 @@ export interface ChunkingResult {
42
48
  language_aware_attempt_latency_ms?: number;
43
49
  fallback_path_latency_ms?: number;
44
50
  language?: string;
51
+ recursive_semantic_chunking_used?: boolean;
45
52
  }
46
53
 
47
54
  export type ParserLanguage = "typescript" | "tsx" | "javascript" | "jsx" | "python" | "go";
@@ -60,7 +67,7 @@ export interface ChunkingParserAvailabilitySnapshotEntry {
60
67
  error?: string;
61
68
  }
62
69
 
63
- const DEFAULT_BOUNDARY_NODE_TYPES: Record<ParserLanguage, Set<string>> = {
70
+ const DEFAULT_BOUNDARY_NODE_TYPES_LEGACY: Record<ParserLanguage, Set<string>> = {
64
71
  typescript: new Set([
65
72
  "function_declaration",
66
73
  "generator_function_declaration",
@@ -77,11 +84,35 @@ const DEFAULT_BOUNDARY_NODE_TYPES: Record<ParserLanguage, Set<string>> = {
77
84
  "enum_declaration",
78
85
  "type_alias_declaration"
79
86
  ]),
80
- javascript: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
81
- jsx: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
87
+ javascript: new Set([
88
+ "function_declaration",
89
+ "generator_function_declaration",
90
+ "class_declaration",
91
+ "function_expression",
92
+ "arrow_function"
93
+ ]),
94
+ jsx: new Set([
95
+ "function_declaration",
96
+ "generator_function_declaration",
97
+ "class_declaration",
98
+ "function_expression",
99
+ "arrow_function"
100
+ ]),
82
101
  python: new Set(["function_definition", "class_definition"]),
83
102
  go: new Set(["function_declaration", "method_declaration", "type_declaration"])
84
103
  };
104
/**
 * Boundary node types used when `boundary_strictness` is "semantic_js_ts".
 *
 * The table starts as a shallow copy of the legacy table, so languages that are
 * not overridden below reuse the very same Set instances as the legacy table.
 * The four JS/TS dialects get a widened copy of their legacy set:
 *  - typescript / tsx: function expressions, arrow functions and method definitions;
 *  - javascript / jsx: method definitions.
 */
const DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS: Record<ParserLanguage, Set<string>> = (() => {
  const legacy = DEFAULT_BOUNDARY_NODE_TYPES_LEGACY;
  // Copies `base` and appends `extras`, keeping the legacy entries first.
  const widen = (base: Set<string>, extras: string[]): Set<string> => new Set([...base, ...extras]);
  const typeScriptExtras = ["function_expression", "arrow_function", "method_definition"];
  const javaScriptExtras = ["method_definition"];
  return {
    ...legacy,
    typescript: widen(legacy.typescript, typeScriptExtras),
    tsx: widen(legacy.tsx, typeScriptExtras),
    javascript: widen(legacy.javascript, javaScriptExtras),
    jsx: widen(legacy.jsx, javaScriptExtras)
  };
})();
85
116
 
86
117
  const parserAvailabilityCache = new Map<ParserLanguage, ParserAvailability>();
87
118
  const parserInitAttempts = new Map<ParserLanguage, number>();
@@ -92,6 +123,15 @@ const CANONICAL_TO_PARSER_LANGUAGE: Record<string, ParserLanguage> = {
92
123
  python: "python",
93
124
  go: "go"
94
125
  };
126
/**
 * Parent node types under which a `function_expression` / `arrow_function` is
 * accepted as a chunk boundary. Function-valued expressions whose parent is not
 * listed here (or that have no parent) are rejected by the boundary-candidate check.
 */
const JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES = new Set<string>([
  // e.g. `foo = function () {}` / `const foo = () => {}`
  "assignment_expression",
  "variable_declarator",
  // e.g. `{ key: () => {} }`
  "pair",
  "export_statement",
  // NOTE(review): presumably the class-field node names of the TS/JS grammars - confirm.
  "public_field_definition",
  "property_definition"
]);

/**
 * Factor applied to the target chunk token count (result floored) to obtain the
 * soft upper bound used when splitting and merging semantic windows.
 */
const SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER = 1.35;
95
135
 
96
136
  function parserLanguageToCanonical(language: ParserLanguage): string {
97
137
  if (language === "tsx") {
@@ -131,7 +171,7 @@ function parserLanguageFromPath(path: string): ParserLanguage | undefined {
131
171
  if (normalized.endsWith(".tsx")) {
132
172
  return "tsx";
133
173
  }
134
- if (normalized.endsWith(".ts")) {
174
+ if (normalized.endsWith(".ts") || normalized.endsWith(".mts") || normalized.endsWith(".cts")) {
135
175
  return "typescript";
136
176
  }
137
177
  if (normalized.endsWith(".jsx")) {
@@ -183,10 +223,11 @@ function loadParserLanguage(language: ParserLanguage): Parser.Language {
183
223
  return (TypeScript as unknown as { tsx: unknown }).tsx as Parser.Language;
184
224
  }
185
225
  if (language === "javascript" || language === "jsx") {
186
- return resolveTreeSitterLanguageHandle(JavaScript);
226
+ // Bun is currently most reliable with tree-sitter 0.23-compatible JS/Python grammars.
227
+ return resolveTreeSitterLanguageHandle(JavaScriptV023);
187
228
  }
188
229
  if (language === "python") {
189
- return resolveTreeSitterLanguageHandle(Python);
230
+ return resolveTreeSitterLanguageHandle(PythonV023);
190
231
  }
191
232
  return resolveTreeSitterLanguageHandle(Go);
192
233
  }
@@ -317,28 +358,66 @@ function trimLineRange(lines: string[], startRow: number, endRow: number): { sta
317
358
 
318
359
  function splitRangeWithBudget(input: {
319
360
  lines: string[];
361
+ lineTokenCounts: number[];
320
362
  startRow: number;
321
363
  endRow: number;
322
- tokenize: (text: string) => string[];
323
364
  targetChunkTokens: number;
324
365
  overlapTokens: number;
325
366
  maxChunks: number;
367
+ preferSafeBoundarySplit?: boolean;
368
+ softMaxChunkTokens?: number;
326
369
  }): Array<{ startRow: number; endRow: number }> {
370
+ const rangeTokenCount = (startRow: number, endRow: number): number => {
371
+ let total = 0;
372
+ for (let row = startRow; row <= endRow; row += 1) {
373
+ total += input.lineTokenCounts[row] ?? 0;
374
+ }
375
+ return total;
376
+ };
377
+ const isSafeSplitBoundaryLine = (line: string): boolean => {
378
+ const trimmed = line.trim();
379
+ if (trimmed.length === 0) {
380
+ return true;
381
+ }
382
+ return trimmed.endsWith(";") || trimmed.endsWith("}") || trimmed.endsWith("{");
383
+ };
327
384
  const segments: Array<{ startRow: number; endRow: number }> = [];
328
385
  let start = input.startRow;
329
386
 
330
387
  while (start <= input.endRow && segments.length < input.maxChunks) {
331
388
  let tokens = 0;
332
- let end = start;
333
- while (end <= input.endRow) {
334
- tokens += input.tokenize(input.lines[end] ?? "").length;
335
- if (tokens >= input.targetChunkTokens) {
389
+ let end = start - 1;
390
+ while (end < input.endRow) {
391
+ const nextEnd = end + 1;
392
+ tokens += input.lineTokenCounts[nextEnd] ?? 0;
393
+ end = nextEnd;
394
+ if (tokens >= input.targetChunkTokens && end >= start) {
336
395
  break;
337
396
  }
338
- end += 1;
339
397
  }
340
398
 
341
- const safeEnd = Math.min(end, input.endRow);
399
+ let safeEnd = Math.min(Math.max(start, end), input.endRow);
400
+ if (input.preferSafeBoundarySplit && safeEnd > start) {
401
+ let adjusted = safeEnd;
402
+ for (let row = safeEnd; row > start; row -= 1) {
403
+ if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
404
+ adjusted = row;
405
+ break;
406
+ }
407
+ }
408
+ if (adjusted === safeEnd && typeof input.softMaxChunkTokens === "number" && input.softMaxChunkTokens > input.targetChunkTokens) {
409
+ for (let row = safeEnd + 1; row <= input.endRow; row += 1) {
410
+ if (rangeTokenCount(start, row) > input.softMaxChunkTokens) {
411
+ break;
412
+ }
413
+ if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
414
+ adjusted = row;
415
+ break;
416
+ }
417
+ }
418
+ }
419
+ safeEnd = Math.max(start, adjusted);
420
+ }
342
421
  if (safeEnd >= start) {
343
422
  segments.push({ startRow: start, endRow: safeEnd });
344
423
  }
@@ -346,8 +425,18 @@ function splitRangeWithBudget(input: {
346
425
  if (safeEnd >= input.endRow) {
347
426
  break;
348
427
  }
349
- const rewind = Math.max(1, Math.floor(input.overlapTokens / 4));
350
- start = Math.max(start + 1, safeEnd - rewind + 1);
428
+
429
+ let nextStart = safeEnd + 1;
430
+ if (input.overlapTokens > 0) {
431
+ let overlap = 0;
432
+ let cursor = safeEnd;
433
+ while (cursor >= start && overlap < input.overlapTokens) {
434
+ overlap += input.lineTokenCounts[cursor] ?? 0;
435
+ cursor -= 1;
436
+ }
437
+ nextStart = Math.max(start + 1, cursor + 1);
438
+ }
439
+ start = Math.max(start + 1, nextStart);
351
440
  }
352
441
 
353
442
  return segments;
@@ -359,12 +448,14 @@ function buildSlidingChunks(input: {
359
448
  targetChunkTokens: number;
360
449
  overlapTokens: number;
361
450
  maxChunks: number;
451
+ lineTokenCounts?: number[];
362
452
  }): ChunkingOutput[] {
453
+ const lineTokenCounts = input.lineTokenCounts ?? computeLineTokenCounts(input.lines, input.tokenize);
363
454
  const rawSegments = splitRangeWithBudget({
364
455
  lines: input.lines,
456
+ lineTokenCounts,
365
457
  startRow: 0,
366
458
  endRow: Math.max(0, input.lines.length - 1),
367
- tokenize: input.tokenize,
368
459
  targetChunkTokens: input.targetChunkTokens,
369
460
  overlapTokens: input.overlapTokens,
370
461
  maxChunks: input.maxChunks
@@ -395,6 +486,346 @@ function hasBoundaryAncestor(node: Parser.SyntaxNode, boundaryTypes: Set<string>
395
486
  return false;
396
487
  }
397
488
 
489
/**
 * Looks up the boundary node types for a parser language.
 *
 * @param parserLanguage Language whose entry is read from the table.
 * @param boundaryStrictness "semantic_js_ts" selects the semantic table; any
 *   other value selects the legacy table.
 * @returns The Set stored in the selected table (not a copy).
 */
function getBoundaryTypes(
  parserLanguage: ParserLanguage,
  boundaryStrictness: "legacy" | "semantic_js_ts"
): Set<string> {
  const table =
    boundaryStrictness === "semantic_js_ts"
      ? DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS
      : DEFAULT_BOUNDARY_NODE_TYPES_LEGACY;
  return table[parserLanguage];
}
498
+
499
/**
 * Tells whether function-valued expressions in this language are subject to the
 * parent-type check performed by the boundary-candidate filter.
 *
 * - javascript / jsx: always.
 * - typescript / tsx: only under "semantic_js_ts" strictness.
 * - every other language: never.
 */
function isExpressionBoundaryLanguage(parserLanguage: ParserLanguage, boundaryStrictness: "legacy" | "semantic_js_ts"): boolean {
  switch (parserLanguage) {
    case "javascript":
    case "jsx":
      return true;
    case "typescript":
    case "tsx":
      return boundaryStrictness === "semantic_js_ts";
    default:
      return false;
  }
}
510
+
511
/**
 * Single source of truth for the boundary-candidate rule, expressed on plain
 * type names so the production path and the test hook cannot drift apart.
 *
 * Rule:
 *  1. Languages that are not "expression boundary" languages accept every node.
 *  2. Nodes other than `function_expression` / `arrow_function` are accepted.
 *  3. A function-valued expression is accepted only when its parent type is in
 *     JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES; a missing/empty parent type
 *     rejects it.
 *
 * @param resolveParentType Invoked lazily, only when step 3 is reached, so the
 *   parent of a syntax node is not read for nodes that steps 1-2 already accept.
 */
function evaluateBoundaryCandidate(
  parserLanguage: ParserLanguage,
  nodeType: string,
  resolveParentType: () => string | undefined,
  boundaryStrictness: "legacy" | "semantic_js_ts"
): boolean {
  if (!isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness)) {
    return true;
  }
  if (nodeType !== "function_expression" && nodeType !== "arrow_function") {
    return true;
  }
  const parentType = resolveParentType();
  if (!parentType) {
    return false;
  }
  return JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(parentType);
}

/**
 * Decides whether a syntax node that matched a boundary node type should really
 * be used as a chunk boundary (see `evaluateBoundaryCandidate` for the rule).
 *
 * @param parserLanguage Language the node was parsed with.
 * @param node Candidate node; its `type` and, if needed, its parent's `type` are read.
 * @param boundaryStrictness Active strictness mode.
 */
function isLanguageBoundaryCandidate(
  parserLanguage: ParserLanguage,
  node: Parser.SyntaxNode,
  boundaryStrictness: "legacy" | "semantic_js_ts"
): boolean {
  return evaluateBoundaryCandidate(parserLanguage, node.type, () => node.parent?.type, boundaryStrictness);
}

/**
 * Test hook exposing the boundary-candidate rule without requiring a parsed
 * tree. It delegates to the same helper as `isLanguageBoundaryCandidate`, so
 * tests exercise the production logic rather than a copy of it.
 *
 * @param input.boundaryStrictness Defaults to "legacy" when omitted.
 */
export function __isChunkingBoundaryCandidateForTests(input: {
  parserLanguage: ParserLanguage;
  nodeType: string;
  parentType?: string;
  boundaryStrictness?: "legacy" | "semantic_js_ts";
}): boolean {
  return evaluateBoundaryCandidate(
    input.parserLanguage,
    input.nodeType,
    () => input.parentType,
    input.boundaryStrictness ?? "legacy"
  );
}
547
+
548
/**
 * Tokenizes every line once and records how many tokens each line produced.
 *
 * @param lines Source lines; a nullish entry is tokenized as the empty string.
 * @param tokenize Tokenizer whose result length is used as the line's token count.
 * @returns One count per line, index-aligned with `lines`.
 */
function computeLineTokenCounts(lines: string[], tokenize: (text: string) => string[]): number[] {
  const countTokens = (line: string): number => {
    const tokens = tokenize(line ?? "");
    return tokens.length;
  };
  return lines.map(countTokens);
}
551
+
552
/**
 * Sums the per-line token counts over the inclusive row range
 * [`startRow`, `endRow`].
 *
 * Rows without an entry in `lineTokenCounts` contribute 0; an inverted range
 * (`startRow > endRow`) yields 0.
 */
function rangeTokenCount(lineTokenCounts: number[], startRow: number, endRow: number): number {
  let sum = 0;
  let row = startRow;
  while (row <= endRow) {
    const count = lineTokenCounts[row];
    sum += count == null ? 0 : count;
    row += 1;
  }
  return sum;
}
559
+
560
/**
 * Collects the named children of `node` in index order, dropping any slot for
 * which `namedChild(index)` returns a falsy value.
 */
function listNamedChildren(node: Parser.SyntaxNode): Parser.SyntaxNode[] {
  const collected: Parser.SyntaxNode[] = [];
  let index = 0;
  while (index < node.namedChildCount) {
    const candidate = node.namedChild(index);
    index += 1;
    if (!candidate) {
      continue;
    }
    collected.push(candidate);
  }
  return collected;
}
570
+
571
/**
 * Converts a syntax node into a row window that lies inside the file and has
 * been passed through `trimLineRange`.
 *
 * The node's start row is clamped to [0, lastRow]; its inclusive end row is
 * clamped to [startRow, lastRow], so the window is never inverted.
 *
 * @returns The trimmed window, or `undefined` when `trimLineRange` yields nothing.
 */
function normalizeNodeWindow(input: {
  node: Parser.SyntaxNode;
  lines: string[];
  lastRow: number;
}): { startRow: number; endRow: number } | undefined {
  const { node, lines, lastRow } = input;
  const clampedStart = Math.max(0, Math.min(lastRow, node.startPosition.row));
  const clampedEnd = Math.max(clampedStart, Math.min(lastRow, toInclusiveEndRow(node)));
  const kept = trimLineRange(lines, clampedStart, clampedEnd);
  return kept ? { startRow: kept.start, endRow: kept.end } : undefined;
}
587
+
588
/**
 * Walks the syntax tree top-down and produces row windows that follow the tree
 * structure while respecting the token budget.
 *
 * For each visited node:
 *  - if its (trimmed) row range fits within `targetChunkTokens`, the whole range
 *    becomes one window and the subtree is not descended;
 *  - otherwise its named children are visited in row order, and every run of
 *    rows not covered by a child (before, between and after children) is cut
 *    with `splitRangeWithBudget` (no overlap);
 *  - a node that is too large and has no usable named children is cut the same way.
 *
 * Windows are de-duplicated by their `start:end` rows, collection stops once
 * `maxChunks` windows exist, and the result is sorted by start row, then end row.
 */
function buildRecursiveSemanticWindows(input: {
  root: Parser.SyntaxNode;
  lines: string[];
  lineTokenCounts: number[];
  targetChunkTokens: number;
  maxChunks: number;
  boundaryStrictness: "legacy" | "semantic_js_ts";
}): Array<{ startRow: number; endRow: number }> {
  type RowWindow = { startRow: number; endRow: number };

  const { lines, lineTokenCounts, targetChunkTokens, maxChunks } = input;
  const finalRow = Math.max(0, lines.length - 1);
  const softMaxChunkTokens = Math.floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  const preferSafeBoundarySplit = input.boundaryStrictness === "semantic_js_ts";
  const collected: RowWindow[] = [];
  const emittedKeys = new Set<string>();

  const isFull = (): boolean => collected.length >= maxChunks;

  // Records a window unless an identical row range was already recorded.
  const emitOnce = (startRow: number, endRow: number): void => {
    const key = `${startRow}:${endRow}`;
    if (emittedKeys.has(key)) {
      return;
    }
    emittedKeys.add(key);
    collected.push({ startRow, endRow });
  };

  // Cuts a raw row range by token budget and records each trimmed piece.
  const emitBudgetedSplit = (startRow: number, endRow: number): void => {
    if (startRow > endRow || isFull()) {
      return;
    }
    const pieces = splitRangeWithBudget({
      lines,
      lineTokenCounts,
      startRow,
      endRow,
      targetChunkTokens,
      overlapTokens: 0,
      // Only the remaining capacity may be consumed by this range.
      maxChunks: maxChunks - collected.length,
      preferSafeBoundarySplit,
      softMaxChunkTokens
    });
    for (const piece of pieces) {
      const kept = trimLineRange(lines, piece.startRow, piece.endRow);
      if (!kept) {
        continue;
      }
      emitOnce(kept.start, kept.end);
      if (isFull()) {
        return;
      }
    }
  };

  const visit = (node: Parser.SyntaxNode): void => {
    if (isFull()) {
      return;
    }
    const own = normalizeNodeWindow({ node, lines, lastRow: finalRow });
    if (!own) {
      return;
    }
    if (rangeTokenCount(lineTokenCounts, own.startRow, own.endRow) <= targetChunkTokens) {
      // Small enough: keep the node intact as a single window.
      emitOnce(own.startRow, own.endRow);
      return;
    }

    const placedChildren: Array<{ node: Parser.SyntaxNode; range: RowWindow }> = [];
    for (const child of listNamedChildren(node)) {
      const range = normalizeNodeWindow({ node: child, lines, lastRow: finalRow });
      if (range) {
        placedChildren.push({ node: child, range });
      }
    }
    placedChildren.sort((a, b) => a.range.startRow - b.range.startRow || a.range.endRow - b.range.endRow);

    if (placedChildren.length === 0) {
      emitBudgetedSplit(own.startRow, own.endRow);
      return;
    }

    // First row of this node not yet handed to a child or a gap split.
    let nextUncoveredRow = own.startRow;
    for (const child of placedChildren) {
      if (isFull()) {
        return;
      }
      if (child.range.endRow < nextUncoveredRow) {
        // Child lies entirely inside rows that were already handled.
        continue;
      }
      if (child.range.startRow > nextUncoveredRow) {
        emitBudgetedSplit(nextUncoveredRow, child.range.startRow - 1);
      }

      visit(child.node);
      nextUncoveredRow = Math.max(nextUncoveredRow, child.range.endRow + 1);
      if (nextUncoveredRow > own.endRow) {
        return;
      }
    }
    if (nextUncoveredRow <= own.endRow) {
      emitBudgetedSplit(nextUncoveredRow, own.endRow);
    }
  };

  visit(input.root);
  return collected.sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
}
700
+
701
/**
 * Greedily merges neighbouring windows into larger ones.
 *
 * Windows are processed in (startRow, endRow) order. A window is folded into
 * the most recently emitted one when all of the following hold for the union:
 *  - the gap between them is at most `semanticMergeGapLines` rows
 *    (overlapping windows count as gap 0);
 *  - the union spans at most `semanticMergeMaxSpanLines` rows;
 *  - the union's token count is within the soft maximum
 *    (`targetChunkTokens` x SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER, floored).
 * Otherwise the window starts a new output entry.
 *
 * @returns A new array; with zero or one input window, a shallow copy of the input.
 */
function mergeSemanticWindows(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lineTokenCounts: number[];
  targetChunkTokens: number;
  semanticMergeGapLines: number;
  semanticMergeMaxSpanLines: number;
}): Array<{ startRow: number; endRow: number }> {
  const { windows } = input;
  if (windows.length <= 1) {
    return [...windows];
  }

  const tokenCeiling = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  const sorted = [...windows].sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
  const result: Array<{ startRow: number; endRow: number }> = [];

  for (const candidate of sorted) {
    const tail = result[result.length - 1];
    if (tail) {
      const unionStart = Math.min(tail.startRow, candidate.startRow);
      const unionEnd = Math.max(tail.endRow, candidate.endRow);
      const gap = Math.max(0, candidate.startRow - tail.endRow - 1);
      const fits =
        gap <= input.semanticMergeGapLines &&
        unionEnd - unionStart + 1 <= input.semanticMergeMaxSpanLines &&
        rangeTokenCount(input.lineTokenCounts, unionStart, unionEnd) <= tokenCeiling;
      if (fits) {
        // Grow the previous entry in place instead of emitting a new one.
        tail.startRow = unionStart;
        tail.endRow = unionEnd;
        continue;
      }
    }
    result.push({ ...candidate });
  }
  return result;
}
738
+
739
/**
 * Heuristic, language-agnostic check for "this line carries no code".
 *
 * A line qualifies when, after trimming, it is empty or starts with one of the
 * comment markers below. Only the line prefix is inspected, so code following a
 * comment opener on the same line is not detected.
 *
 * NOTE(review): the "#" and "*" prefixes also match non-comment lines that
 * happen to start with those characters (e.g. a `#private` class member or a
 * wrapped multiplication) - confirm this is acceptable for JS/TS input.
 */
function isCommentOnlyLine(line: string): boolean {
  const text = line.trim();
  if (text === "") {
    return true;
  }
  const commentPrefixes = ["//", "/*", "*", "*/", "#"];
  return commentPrefixes.some((prefix) => text.startsWith(prefix));
}
752
+
753
/**
 * Returns true when every row in the inclusive range [`startRow`, `endRow`]
 * passes `isCommentOnlyLine`. Rows without a line are treated as empty, and an
 * inverted range is vacuously comment-only.
 */
function windowLooksCommentOnly(input: { lines: string[]; startRow: number; endRow: number }): boolean {
  let row = input.startRow;
  while (row <= input.endRow) {
    const text = input.lines[row] ?? "";
    if (!isCommentOnlyLine(text)) {
      return false;
    }
    row += 1;
  }
  return true;
}
761
+
762
/**
 * Attaches comment-only windows to the window that follows them.
 *
 * Scanning left to right, a window is fused with its successor when:
 *  - the window consists solely of comment/blank lines,
 *  - at most one row separates the two windows,
 *  - the fused span is at most `semanticMergeMaxSpanLines` rows, and
 *  - the fused token count is within the soft maximum
 *    (`targetChunkTokens` x SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER, floored).
 * The successor is consumed by the fusion and is not examined again; the fused
 * window itself is emitted as-is. All other windows are copied through.
 *
 * @returns A new array; with zero or one input window, a shallow copy of the input.
 */
function absorbForwardCommentWindows(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lines: string[];
  lineTokenCounts: number[];
  targetChunkTokens: number;
  semanticMergeMaxSpanLines: number;
}): Array<{ startRow: number; endRow: number }> {
  const { windows, lines } = input;
  if (windows.length <= 1) {
    return [...windows];
  }

  const tokenCeiling = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
  const result: Array<{ startRow: number; endRow: number }> = [];
  let index = 0;
  while (index < windows.length) {
    const current = windows[index];
    const following = windows[index + 1];
    index += 1;
    if (!current) {
      continue;
    }
    if (
      following &&
      windowLooksCommentOnly({ lines, startRow: current.startRow, endRow: current.endRow }) &&
      Math.max(0, following.startRow - current.endRow - 1) <= 1 &&
      following.endRow - current.startRow + 1 <= input.semanticMergeMaxSpanLines &&
      rangeTokenCount(input.lineTokenCounts, current.startRow, following.endRow) <= tokenCeiling
    ) {
      result.push({ startRow: current.startRow, endRow: following.endRow });
      // The successor has been consumed; step over it.
      index += 1;
      continue;
    }
    result.push({ ...current });
  }
  return result;
}
805
+
806
/**
 * Materializes row windows into chunk records.
 *
 * Each window is passed through `trimLineRange`; windows for which it yields
 * nothing are skipped. Output line numbers are 1-based and inclusive, and the
 * snippet is the covered lines joined with "\n". At most `maxChunks` chunks
 * are produced.
 */
function windowsToChunks(input: {
  windows: Array<{ startRow: number; endRow: number }>;
  lines: string[];
  maxChunks: number;
}): ChunkingOutput[] {
  const produced: ChunkingOutput[] = [];
  for (const window of input.windows) {
    if (produced.length >= input.maxChunks) {
      break;
    }
    const kept = trimLineRange(input.lines, window.startRow, window.endRow);
    if (kept) {
      const { start, end } = kept;
      const snippet = input.lines.slice(start, end + 1).join("\n");
      produced.push({ start_line: start + 1, end_line: end + 1, snippet });
    }
  }
  return produced;
}
828
+
398
829
  function buildLanguageAwareChunks(input: {
399
830
  file: ChunkingRawFile;
400
831
  lines: string[];
@@ -403,6 +834,7 @@ function buildLanguageAwareChunks(input: {
403
834
  tokenize: (text: string) => string[];
404
835
  }): ChunkingResult {
405
836
  const languageAwareAttemptStart = Date.now();
837
+ const lineTokenCounts = computeLineTokenCounts(input.lines, input.tokenize);
406
838
  const parser = getParser(input.parserLanguage);
407
839
  if (!parser) {
408
840
  const fallbackStart = Date.now();
@@ -411,7 +843,8 @@ function buildLanguageAwareChunks(input: {
411
843
  tokenize: input.tokenize,
412
844
  targetChunkTokens: input.config.target_chunk_tokens,
413
845
  overlapTokens: input.config.chunk_overlap_tokens,
414
- maxChunks: input.config.max_chunks_per_file
846
+ maxChunks: input.config.max_chunks_per_file,
847
+ lineTokenCounts
415
848
  });
416
849
  return {
417
850
  chunks,
@@ -435,7 +868,8 @@ function buildLanguageAwareChunks(input: {
435
868
  tokenize: input.tokenize,
436
869
  targetChunkTokens: input.config.target_chunk_tokens,
437
870
  overlapTokens: input.config.chunk_overlap_tokens,
438
- maxChunks: input.config.max_chunks_per_file
871
+ maxChunks: input.config.max_chunks_per_file,
872
+ lineTokenCounts
439
873
  });
440
874
  return {
441
875
  chunks,
@@ -456,7 +890,8 @@ function buildLanguageAwareChunks(input: {
456
890
  tokenize: input.tokenize,
457
891
  targetChunkTokens: input.config.target_chunk_tokens,
458
892
  overlapTokens: input.config.chunk_overlap_tokens,
459
- maxChunks: input.config.max_chunks_per_file
893
+ maxChunks: input.config.max_chunks_per_file,
894
+ lineTokenCounts
460
895
  });
461
896
  return {
462
897
  chunks,
@@ -469,77 +904,133 @@ function buildLanguageAwareChunks(input: {
469
904
  };
470
905
  }
471
906
 
472
- const boundaryTypes = DEFAULT_BOUNDARY_NODE_TYPES[input.parserLanguage];
473
- const candidates = root.descendantsOfType([...boundaryTypes]);
474
- const boundaryNodes = candidates
475
- .filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
476
- .sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
907
+ let chunks: ChunkingOutput[] = [];
908
+ let recursiveSemanticChunkingUsed = false;
477
909
 
478
- if (boundaryNodes.length === 0) {
479
- const fallbackStart = Date.now();
480
- const chunks = buildSlidingChunks({
910
+ if (input.config.recursive_semantic_chunking_enabled) {
911
+ const semanticMergeGapLines = input.config.semantic_merge_gap_lines ?? 6;
912
+ const semanticMergeMaxSpanLines = input.config.semantic_merge_max_span_lines ?? 220;
913
+ const recursiveWindows = buildRecursiveSemanticWindows({
914
+ root,
481
915
  lines: input.lines,
482
- tokenize: input.tokenize,
916
+ lineTokenCounts,
483
917
  targetChunkTokens: input.config.target_chunk_tokens,
484
- overlapTokens: input.config.chunk_overlap_tokens,
918
+ maxChunks: input.config.max_chunks_per_file,
919
+ boundaryStrictness: input.config.boundary_strictness
920
+ });
921
+ const mergedWindows = mergeSemanticWindows({
922
+ windows: recursiveWindows,
923
+ lineTokenCounts,
924
+ targetChunkTokens: input.config.target_chunk_tokens,
925
+ semanticMergeGapLines,
926
+ semanticMergeMaxSpanLines
927
+ });
928
+ const absorbedWindows =
929
+ input.config.comment_forward_absorb_enabled === false
930
+ ? mergedWindows
931
+ : absorbForwardCommentWindows({
932
+ windows: mergedWindows,
933
+ lines: input.lines,
934
+ lineTokenCounts,
935
+ targetChunkTokens: input.config.target_chunk_tokens,
936
+ semanticMergeMaxSpanLines
937
+ });
938
+ chunks = windowsToChunks({
939
+ windows: absorbedWindows,
940
+ lines: input.lines,
485
941
  maxChunks: input.config.max_chunks_per_file
486
942
  });
487
- return {
488
- chunks,
489
- strategy: "sliding",
490
- fallback_reason: "empty_language_boundaries",
491
- parse_latency_ms: parseLatencyMs,
492
- language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
493
- fallback_path_latency_ms: Date.now() - fallbackStart,
494
- language: parserLanguageToCanonical(input.parserLanguage)
495
- };
496
- }
943
+ recursiveSemanticChunkingUsed = chunks.length > 0;
944
+ } else {
945
+ const boundaryTypes = getBoundaryTypes(input.parserLanguage, input.config.boundary_strictness);
946
+ const candidates = root.descendantsOfType([...boundaryTypes]);
947
+ const boundaryNodes = candidates
948
+ .filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
949
+ .filter((node) => isLanguageBoundaryCandidate(input.parserLanguage, node, input.config.boundary_strictness))
950
+ .sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
497
951
 
498
- const segments: Array<{ startRow: number; endRow: number }> = [];
499
- let cursor = 0;
500
- const lastRow = Math.max(0, input.lines.length - 1);
501
- for (const node of boundaryNodes) {
502
- const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
503
- const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
504
- if (startRow > cursor) {
505
- segments.push({ startRow: cursor, endRow: startRow - 1 });
506
- }
507
- segments.push({ startRow, endRow });
508
- cursor = endRow + 1;
509
- if (cursor > lastRow) {
510
- break;
952
+ if (boundaryNodes.length === 0) {
953
+ const fallbackStart = Date.now();
954
+ const fallbackChunks = buildSlidingChunks({
955
+ lines: input.lines,
956
+ tokenize: input.tokenize,
957
+ targetChunkTokens: input.config.target_chunk_tokens,
958
+ overlapTokens: input.config.chunk_overlap_tokens,
959
+ maxChunks: input.config.max_chunks_per_file,
960
+ lineTokenCounts
961
+ });
962
+ return {
963
+ chunks: fallbackChunks,
964
+ strategy: "sliding",
965
+ fallback_reason: "empty_language_boundaries",
966
+ parse_latency_ms: parseLatencyMs,
967
+ language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
968
+ fallback_path_latency_ms: Date.now() - fallbackStart,
969
+ language: parserLanguageToCanonical(input.parserLanguage)
970
+ };
511
971
  }
512
- }
513
- if (cursor <= lastRow) {
514
- segments.push({ startRow: cursor, endRow: lastRow });
515
- }
516
972
 
517
- const chunks: ChunkingOutput[] = [];
518
- for (const segment of segments) {
519
- if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
520
- continue;
973
+ const segments: Array<{ startRow: number; endRow: number; boundary: boolean }> = [];
974
+ let cursor = 0;
975
+ const lastRow = Math.max(0, input.lines.length - 1);
976
+ for (const node of boundaryNodes) {
977
+ const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
978
+ const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
979
+ if (startRow > cursor) {
980
+ segments.push({ startRow: cursor, endRow: startRow - 1, boundary: false });
981
+ }
982
+ segments.push({ startRow, endRow, boundary: true });
983
+ cursor = endRow + 1;
984
+ if (cursor > lastRow) {
985
+ break;
986
+ }
521
987
  }
522
- const pieces = splitRangeWithBudget({
523
- lines: input.lines,
524
- startRow: segment.startRow,
525
- endRow: segment.endRow,
526
- tokenize: input.tokenize,
527
- targetChunkTokens: input.config.target_chunk_tokens,
528
- overlapTokens: input.config.chunk_overlap_tokens,
529
- maxChunks: input.config.max_chunks_per_file - chunks.length
530
- });
531
- for (const piece of pieces) {
532
- const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
533
- if (!trimmed) {
988
+ if (cursor <= lastRow) {
989
+ segments.push({ startRow: cursor, endRow: lastRow, boundary: false });
990
+ }
991
+
992
+ for (const segment of segments) {
993
+ if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
534
994
  continue;
535
995
  }
536
- chunks.push({
537
- start_line: trimmed.start + 1,
538
- end_line: trimmed.end + 1,
539
- snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
540
- });
541
- if (chunks.length >= input.config.max_chunks_per_file) {
542
- break;
996
+ const segmentTokenCount = lineTokenCounts
997
+ .slice(segment.startRow, segment.endRow + 1)
998
+ .reduce((sum, value) => sum + value, 0);
999
+ const enableSemanticBoundarySplits =
1000
+ input.config.boundary_strictness === "semantic_js_ts" &&
1001
+ (input.parserLanguage === "javascript" ||
1002
+ input.parserLanguage === "jsx" ||
1003
+ input.parserLanguage === "typescript" ||
1004
+ input.parserLanguage === "tsx") &&
1005
+ segment.boundary;
1006
+ const softMaxChunkTokens = Math.floor(input.config.target_chunk_tokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
1007
+ const pieces =
1008
+ enableSemanticBoundarySplits && segmentTokenCount <= softMaxChunkTokens
1009
+ ? [{ startRow: segment.startRow, endRow: segment.endRow }]
1010
+ : splitRangeWithBudget({
1011
+ lines: input.lines,
1012
+ lineTokenCounts,
1013
+ startRow: segment.startRow,
1014
+ endRow: segment.endRow,
1015
+ targetChunkTokens: input.config.target_chunk_tokens,
1016
+ overlapTokens: input.config.chunk_overlap_tokens,
1017
+ maxChunks: input.config.max_chunks_per_file - chunks.length,
1018
+ preferSafeBoundarySplit: enableSemanticBoundarySplits,
1019
+ softMaxChunkTokens
1020
+ });
1021
+ for (const piece of pieces) {
1022
+ const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
1023
+ if (!trimmed) {
1024
+ continue;
1025
+ }
1026
+ chunks.push({
1027
+ start_line: trimmed.start + 1,
1028
+ end_line: trimmed.end + 1,
1029
+ snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
1030
+ });
1031
+ if (chunks.length >= input.config.max_chunks_per_file) {
1032
+ break;
1033
+ }
543
1034
  }
544
1035
  }
545
1036
  }
@@ -551,7 +1042,8 @@ function buildLanguageAwareChunks(input: {
551
1042
  tokenize: input.tokenize,
552
1043
  targetChunkTokens: input.config.target_chunk_tokens,
553
1044
  overlapTokens: input.config.chunk_overlap_tokens,
554
- maxChunks: input.config.max_chunks_per_file
1045
+ maxChunks: input.config.max_chunks_per_file,
1046
+ lineTokenCounts
555
1047
  });
556
1048
  return {
557
1049
  chunks: slidingChunks,
@@ -569,7 +1061,8 @@ function buildLanguageAwareChunks(input: {
569
1061
  strategy: "language_aware",
570
1062
  parse_latency_ms: parseLatencyMs,
571
1063
  language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
572
- language: parserLanguageToCanonical(input.parserLanguage)
1064
+ language: parserLanguageToCanonical(input.parserLanguage),
1065
+ recursive_semantic_chunking_used: recursiveSemanticChunkingUsed
573
1066
  };
574
1067
  } catch {
575
1068
  const fallbackStart = Date.now();
@@ -578,7 +1071,8 @@ function buildLanguageAwareChunks(input: {
578
1071
  tokenize: input.tokenize,
579
1072
  targetChunkTokens: input.config.target_chunk_tokens,
580
1073
  overlapTokens: input.config.chunk_overlap_tokens,
581
- maxChunks: input.config.max_chunks_per_file
1074
+ maxChunks: input.config.max_chunks_per_file,
1075
+ lineTokenCounts
582
1076
  });
583
1077
  return {
584
1078
  chunks,