@rce-mcp/retrieval-core 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/chunking.js CHANGED
@@ -3,7 +3,7 @@ import Go from "tree-sitter-go";
3
3
  import JavaScriptV023 from "tree-sitter-javascript-v023";
4
4
  import PythonV023 from "tree-sitter-python-v023";
5
5
  import TypeScript from "tree-sitter-typescript";
6
- const DEFAULT_BOUNDARY_NODE_TYPES = {
6
+ const DEFAULT_BOUNDARY_NODE_TYPES_LEGACY = {
7
7
  typescript: new Set([
8
8
  "function_declaration",
9
9
  "generator_function_declaration",
@@ -20,11 +20,35 @@ const DEFAULT_BOUNDARY_NODE_TYPES = {
20
20
  "enum_declaration",
21
21
  "type_alias_declaration"
22
22
  ]),
23
- javascript: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
24
- jsx: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
23
+ javascript: new Set([
24
+ "function_declaration",
25
+ "generator_function_declaration",
26
+ "class_declaration",
27
+ "function_expression",
28
+ "arrow_function"
29
+ ]),
30
+ jsx: new Set([
31
+ "function_declaration",
32
+ "generator_function_declaration",
33
+ "class_declaration",
34
+ "function_expression",
35
+ "arrow_function"
36
+ ]),
25
37
  python: new Set(["function_definition", "class_definition"]),
26
38
  go: new Set(["function_declaration", "method_declaration", "type_declaration"])
27
39
  };
40
/**
 * Boundary node types used when boundary strictness is "semantic_js_ts".
 * Starts from the legacy table and widens the four JS/TS parser languages:
 * typescript/tsx gain function expressions, arrow functions and method
 * definitions; javascript/jsx gain method definitions.
 */
const DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS = (() => {
    const legacy = DEFAULT_BOUNDARY_NODE_TYPES_LEGACY;
    const expressionAndMethodTypes = ["function_expression", "arrow_function", "method_definition"];
    const methodOnlyTypes = ["method_definition"];
    const widen = (baseTypes, extraTypes) => new Set([...baseTypes, ...extraTypes]);
    return {
        ...legacy,
        typescript: widen(legacy.typescript, expressionAndMethodTypes),
        tsx: widen(legacy.tsx, expressionAndMethodTypes),
        javascript: widen(legacy.javascript, methodOnlyTypes),
        jsx: widen(legacy.jsx, methodOnlyTypes)
    };
})();
28
52
  const parserAvailabilityCache = new Map();
29
53
  const parserInitAttempts = new Map();
30
54
  const parserLanguageLoaderOverrides = new Map();
@@ -34,6 +58,15 @@ const CANONICAL_TO_PARSER_LANGUAGE = {
34
58
  python: "python",
35
59
  go: "go"
36
60
  };
61
/**
 * Parent node types under which a `function_expression` / `arrow_function`
 * is accepted as a chunk boundary candidate.
 */
const JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES = new Set([
    "assignment_expression",
    "variable_declarator",
    "pair",
    "export_statement",
    "public_field_definition",
    "property_definition"
]);

/**
 * Factor applied to the target chunk token count to derive the soft maximum
 * token budget used by the semantic splitting and merging steps.
 */
const SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER = 1.35;
37
70
  function parserLanguageToCanonical(language) {
38
71
  if (language === "tsx") {
39
72
  return "typescript";
@@ -70,7 +103,7 @@ function parserLanguageFromPath(path) {
70
103
  if (normalized.endsWith(".tsx")) {
71
104
  return "tsx";
72
105
  }
73
- if (normalized.endsWith(".ts")) {
106
+ if (normalized.endsWith(".ts") || normalized.endsWith(".mts") || normalized.endsWith(".cts")) {
74
107
  return "typescript";
75
108
  }
76
109
  if (normalized.endsWith(".jsx")) {
@@ -238,36 +271,82 @@ function trimLineRange(lines, startRow, endRow) {
238
271
  return { start, end };
239
272
  }
240
273
  function splitRangeWithBudget(input) {
274
+ const rangeTokenCount = (startRow, endRow) => {
275
+ let total = 0;
276
+ for (let row = startRow; row <= endRow; row += 1) {
277
+ total += input.lineTokenCounts[row] ?? 0;
278
+ }
279
+ return total;
280
+ };
281
+ const isSafeSplitBoundaryLine = (line) => {
282
+ const trimmed = line.trim();
283
+ if (trimmed.length === 0) {
284
+ return true;
285
+ }
286
+ return trimmed.endsWith(";") || trimmed.endsWith("}") || trimmed.endsWith("{");
287
+ };
241
288
  const segments = [];
242
289
  let start = input.startRow;
243
290
  while (start <= input.endRow && segments.length < input.maxChunks) {
244
291
  let tokens = 0;
245
- let end = start;
246
- while (end <= input.endRow) {
247
- tokens += input.tokenize(input.lines[end] ?? "").length;
248
- if (tokens >= input.targetChunkTokens) {
292
+ let end = start - 1;
293
+ while (end < input.endRow) {
294
+ const nextEnd = end + 1;
295
+ tokens += input.lineTokenCounts[nextEnd] ?? 0;
296
+ end = nextEnd;
297
+ if (tokens >= input.targetChunkTokens && end >= start) {
249
298
  break;
250
299
  }
251
- end += 1;
252
300
  }
253
- const safeEnd = Math.min(end, input.endRow);
301
+ let safeEnd = Math.min(Math.max(start, end), input.endRow);
302
+ if (input.preferSafeBoundarySplit && safeEnd > start) {
303
+ let adjusted = safeEnd;
304
+ for (let row = safeEnd; row > start; row -= 1) {
305
+ if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
306
+ adjusted = row;
307
+ break;
308
+ }
309
+ }
310
+ if (adjusted === safeEnd && typeof input.softMaxChunkTokens === "number" && input.softMaxChunkTokens > input.targetChunkTokens) {
311
+ for (let row = safeEnd + 1; row <= input.endRow; row += 1) {
312
+ if (rangeTokenCount(start, row) > input.softMaxChunkTokens) {
313
+ break;
314
+ }
315
+ if (isSafeSplitBoundaryLine(input.lines[row] ?? "")) {
316
+ adjusted = row;
317
+ break;
318
+ }
319
+ }
320
+ }
321
+ safeEnd = Math.max(start, adjusted);
322
+ }
254
323
  if (safeEnd >= start) {
255
324
  segments.push({ startRow: start, endRow: safeEnd });
256
325
  }
257
326
  if (safeEnd >= input.endRow) {
258
327
  break;
259
328
  }
260
- const rewind = Math.max(1, Math.floor(input.overlapTokens / 4));
261
- start = Math.max(start + 1, safeEnd - rewind + 1);
329
+ let nextStart = safeEnd + 1;
330
+ if (input.overlapTokens > 0) {
331
+ let overlap = 0;
332
+ let cursor = safeEnd;
333
+ while (cursor >= start && overlap < input.overlapTokens) {
334
+ overlap += input.lineTokenCounts[cursor] ?? 0;
335
+ cursor -= 1;
336
+ }
337
+ nextStart = Math.max(start + 1, cursor + 1);
338
+ }
339
+ start = Math.max(start + 1, nextStart);
262
340
  }
263
341
  return segments;
264
342
  }
265
343
  function buildSlidingChunks(input) {
344
+ const lineTokenCounts = input.lineTokenCounts ?? computeLineTokenCounts(input.lines, input.tokenize);
266
345
  const rawSegments = splitRangeWithBudget({
267
346
  lines: input.lines,
347
+ lineTokenCounts,
268
348
  startRow: 0,
269
349
  endRow: Math.max(0, input.lines.length - 1),
270
- tokenize: input.tokenize,
271
350
  targetChunkTokens: input.targetChunkTokens,
272
351
  overlapTokens: input.overlapTokens,
273
352
  maxChunks: input.maxChunks
@@ -296,8 +375,281 @@ function hasBoundaryAncestor(node, boundaryTypes) {
296
375
  }
297
376
  return false;
298
377
  }
378
/**
 * Looks up the boundary node types for a parser language.
 *
 * @param {string} parserLanguage - Key into the boundary-type tables.
 * @param {string} boundaryStrictness - "semantic_js_ts" selects the semantic
 *   table; any other value selects the legacy table.
 * @returns {Set<string>|undefined} The table entry for `parserLanguage`.
 */
function getBoundaryTypes(parserLanguage, boundaryStrictness) {
    const table = boundaryStrictness === "semantic_js_ts"
        ? DEFAULT_BOUNDARY_NODE_TYPES_SEMANTIC_JS_TS
        : DEFAULT_BOUNDARY_NODE_TYPES_LEGACY;
    return table[parserLanguage];
}
384
/**
 * Tells whether function expressions / arrow functions get parent-type
 * filtering for the given parser language.
 *
 * javascript and jsx always qualify; typescript and tsx qualify only when
 * `boundaryStrictness` is "semantic_js_ts".
 *
 * @param {string} parserLanguage
 * @param {string} boundaryStrictness
 * @returns {boolean}
 */
function isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness) {
    switch (parserLanguage) {
        case "javascript":
        case "jsx":
            return true;
        case "typescript":
        case "tsx":
            return boundaryStrictness === "semantic_js_ts";
        default:
            return false;
    }
}
393
/**
 * Decides whether a syntax node may act as a chunk boundary.
 *
 * Every node is accepted except a `function_expression` / `arrow_function`
 * in an expression-boundary language: those are accepted only when their
 * parent's type is listed in JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.
 *
 * @param {string} parserLanguage
 * @param {{ type: string, parent?: { type?: string } | null }} node
 * @param {string} boundaryStrictness
 * @returns {boolean}
 */
function isLanguageBoundaryCandidate(parserLanguage, node, boundaryStrictness) {
    if (!isExpressionBoundaryLanguage(parserLanguage, boundaryStrictness)) {
        return true;
    }
    const isFunctionValue = node.type === "function_expression" || node.type === "arrow_function";
    if (!isFunctionValue) {
        return true;
    }
    const parentType = node.parent?.type;
    // A function value without a typed parent is never a boundary.
    return Boolean(parentType) && JAVASCRIPT_EXPRESSION_BOUNDARY_PARENT_TYPES.has(parentType);
}
406
/**
 * Test hook exposing the boundary-candidate decision for plain node/parent
 * type names instead of a real syntax node.
 *
 * @param {object} input
 * @param {string} input.parserLanguage
 * @param {string} input.nodeType
 * @param {string} [input.parentType]
 * @param {string} [input.boundaryStrictness] - Defaults to "legacy".
 * @returns {boolean}
 */
export function __isChunkingBoundaryCandidateForTests(input) {
    // Build the minimal node shape the production predicate reads
    // (`type` and `parent.type`) so the hook reuses its logic.
    const syntheticNode = {
        type: input.nodeType,
        parent: input.parentType ? { type: input.parentType } : undefined
    };
    return isLanguageBoundaryCandidate(input.parserLanguage, syntheticNode, input.boundaryStrictness ?? "legacy");
}
419
/**
 * Computes the token count of every line.
 *
 * @param {Array<string|null|undefined>} lines - Nullish entries are tokenized as "".
 * @param {(text: string) => ArrayLike<unknown>} tokenize - Its result's `length` is the count.
 * @returns {number[]} One count per line, in line order.
 */
function computeLineTokenCounts(lines, tokenize) {
    const countTokens = (text) => tokenize(text ?? "").length;
    return lines.map((text) => countTokens(text));
}
422
/**
 * Sums the token counts of the inclusive row range [startRow, endRow].
 * Rows without an entry contribute 0; an empty range (startRow > endRow) yields 0.
 *
 * @param {number[]} lineTokenCounts
 * @param {number} startRow
 * @param {number} endRow
 * @returns {number}
 */
function rangeTokenCount(lineTokenCounts, startRow, endRow) {
    let sum = 0;
    let row = startRow;
    while (row <= endRow) {
        sum += lineTokenCounts[row] ?? 0;
        row += 1;
    }
    return sum;
}
429
/**
 * Collects a node's named children into an array, skipping any index for
 * which `namedChild` returns a falsy value.
 *
 * @param {{ namedChildCount: number, namedChild: (index: number) => unknown }} node
 * @returns {unknown[]} Named children in index order.
 */
function listNamedChildren(node) {
    const found = [];
    let position = 0;
    while (position < node.namedChildCount) {
        const candidate = node.namedChild(position);
        if (candidate) {
            found.push(candidate);
        }
        position += 1;
    }
    return found;
}
439
/**
 * Converts a syntax node into a trimmed row window.
 *
 * The node's start row and inclusive end row are clamped into [0, lastRow]
 * (end never before start) and then passed through `trimLineRange`.
 *
 * @param {object} input
 * @param {object} input.node - Node exposing `startPosition.row`.
 * @param {string[]} input.lines
 * @param {number} input.lastRow - Highest valid row index.
 * @returns {{ startRow: number, endRow: number } | undefined} `undefined`
 *   when `trimLineRange` yields no range.
 */
function normalizeNodeWindow(input) {
    const { node, lines, lastRow } = input;
    const capToLastRow = (row) => Math.min(lastRow, row);
    const startRow = Math.max(0, capToLastRow(node.startPosition.row));
    const endRow = Math.max(startRow, capToLastRow(toInclusiveEndRow(node)));
    const trimmed = trimLineRange(lines, startRow, endRow);
    return trimmed ? { startRow: trimmed.start, endRow: trimmed.end } : undefined;
}
451
/**
 * Walks the syntax tree from `input.root` and produces row windows.
 *
 * A node whose window fits within `targetChunkTokens` is emitted whole.
 * Otherwise its named children are visited in row order, and the rows
 * between/around them are split with `splitRangeWithBudget`. A node that
 * is too large and has no usable children is split the same way. Windows
 * are de-duplicated by their start/end rows and capped at `maxChunks`.
 *
 * @param {object} input
 * @param {object} input.root - Root syntax node.
 * @param {string[]} input.lines
 * @param {number[]} input.lineTokenCounts
 * @param {number} input.targetChunkTokens
 * @param {number} input.maxChunks
 * @param {string} [input.boundaryStrictness] - "semantic_js_ts" enables
 *   safe-boundary splitting for the budget splits.
 * @returns {{ startRow: number, endRow: number }[]} Windows sorted by start row, then end row.
 */
function buildRecursiveSemanticWindows(input) {
    const { lines, lineTokenCounts, targetChunkTokens, maxChunks } = input;
    const lastRow = Math.max(0, lines.length - 1);
    const softMaxChunkTokens = Math.floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
    const byPosition = (a, b) => a.startRow - b.startRow || a.endRow - b.endRow;
    const collected = [];
    const emittedKeys = new Set();
    const isFull = () => collected.length >= maxChunks;
    // Records a window unless the same rows were already emitted; reports whether it was added.
    const emitOnce = (startRow, endRow) => {
        const key = `${startRow}:${endRow}`;
        if (emittedKeys.has(key)) {
            return false;
        }
        emittedKeys.add(key);
        collected.push({ startRow, endRow });
        return true;
    };
    // Budget-splits rows that are not covered by a child node of their own.
    const emitBudgetSplit = (fromRow, toRow) => {
        if (fromRow > toRow || isFull()) {
            return;
        }
        const pieces = splitRangeWithBudget({
            lines,
            lineTokenCounts,
            startRow: fromRow,
            endRow: toRow,
            targetChunkTokens,
            overlapTokens: 0,
            maxChunks: maxChunks - collected.length,
            preferSafeBoundarySplit: input.boundaryStrictness === "semantic_js_ts",
            softMaxChunkTokens
        });
        for (const piece of pieces) {
            const trimmed = trimLineRange(lines, piece.startRow, piece.endRow);
            if (trimmed && emitOnce(trimmed.start, trimmed.end) && isFull()) {
                return;
            }
        }
    };
    const visit = (node) => {
        if (isFull()) {
            return;
        }
        const span = normalizeNodeWindow({ node, lines, lastRow });
        if (!span) {
            return;
        }
        if (rangeTokenCount(lineTokenCounts, span.startRow, span.endRow) <= targetChunkTokens) {
            // The node fits the budget: keep it as a single window.
            emitOnce(span.startRow, span.endRow);
            return;
        }
        const children = [];
        for (const child of listNamedChildren(node)) {
            const childSpan = normalizeNodeWindow({ node: child, lines, lastRow });
            if (childSpan) {
                children.push({ node: child, range: childSpan });
            }
        }
        children.sort((a, b) => byPosition(a.range, b.range));
        if (children.length === 0) {
            emitBudgetSplit(span.startRow, span.endRow);
            return;
        }
        // First row of this node not yet handed to a child or a gap split.
        let nextUncovered = span.startRow;
        for (const { node: childNode, range: childSpan } of children) {
            if (isFull()) {
                return;
            }
            if (childSpan.endRow < nextUncovered) {
                continue;
            }
            if (childSpan.startRow > nextUncovered) {
                emitBudgetSplit(nextUncovered, childSpan.startRow - 1);
            }
            visit(childNode);
            nextUncovered = Math.max(nextUncovered, childSpan.endRow + 1);
            if (nextUncovered > span.endRow) {
                return;
            }
        }
        if (nextUncovered <= span.endRow) {
            emitBudgetSplit(nextUncovered, span.endRow);
        }
    };
    visit(input.root);
    return collected.sort(byPosition);
}
547
/**
 * Merges neighbouring windows, in row order, into the previously kept window.
 *
 * A window is folded into its predecessor when all three hold: the gap
 * between them is at most `semanticMergeGapLines`, the combined span is at
 * most `semanticMergeMaxSpanLines` lines, and the combined token count is
 * within floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER).
 *
 * @param {object} input
 * @param {{ startRow: number, endRow: number }[]} input.windows
 * @param {number[]} input.lineTokenCounts
 * @param {number} input.targetChunkTokens
 * @param {number} input.semanticMergeGapLines
 * @param {number} input.semanticMergeMaxSpanLines
 * @returns {{ startRow: number, endRow: number }[]} Merged windows; a shallow
 *   copy of the input list when it has at most one window.
 */
function mergeSemanticWindows(input) {
    const { windows, lineTokenCounts } = input;
    if (windows.length <= 1) {
        return [...windows];
    }
    const tokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
    const sorted = [...windows].sort((a, b) => a.startRow - b.startRow || a.endRow - b.endRow);
    const result = [];
    for (const candidate of sorted) {
        const tail = result[result.length - 1];
        if (tail) {
            const startRow = Math.min(tail.startRow, candidate.startRow);
            const endRow = Math.max(tail.endRow, candidate.endRow);
            const gapLines = Math.max(0, candidate.startRow - tail.endRow - 1);
            const fits = gapLines <= input.semanticMergeGapLines &&
                endRow - startRow + 1 <= input.semanticMergeMaxSpanLines &&
                rangeTokenCount(lineTokenCounts, startRow, endRow) <= tokenBudget;
            if (fits) {
                // Grow the kept window in place instead of adding a new one.
                tail.startRow = startRow;
                tail.endRow = endRow;
                continue;
            }
        }
        result.push({ ...candidate });
    }
    return result;
}
577
/**
 * Heuristic check for a line that holds no code: it is blank after trimming,
 * or its trimmed text starts with one of the markers `//`, `/*`, `*`, `*​/` or `#`.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isCommentOnlyLine(line) {
    const text = line.trim();
    if (text === "") {
        return true;
    }
    const commentMarkers = ["//", "/*", "*", "*/", "#"];
    return commentMarkers.some((marker) => text.startsWith(marker));
}
588
/**
 * Reports whether every row of the inclusive window [startRow, endRow]
 * passes `isCommentOnlyLine`; rows missing from `lines` are treated as "".
 *
 * @param {object} input
 * @param {string[]} input.lines
 * @param {number} input.startRow
 * @param {number} input.endRow
 * @returns {boolean}
 */
function windowLooksCommentOnly(input) {
    let row = input.startRow;
    while (row <= input.endRow) {
        if (!isCommentOnlyLine(input.lines[row] ?? "")) {
            return false;
        }
        row += 1;
    }
    return true;
}
596
/**
 * Folds comment-only windows into the window that follows them.
 *
 * A window is absorbed when it looks comment-only, at most one line separates
 * it from the next window, the combined span is at most
 * `semanticMergeMaxSpanLines` lines, and the combined token count is within
 * floor(targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER). The window
 * it was merged with is consumed and not examined again.
 *
 * @param {object} input
 * @param {{ startRow: number, endRow: number }[]} input.windows
 * @param {string[]} input.lines
 * @param {number[]} input.lineTokenCounts
 * @param {number} input.targetChunkTokens
 * @param {number} input.semanticMergeMaxSpanLines
 * @returns {{ startRow: number, endRow: number }[]} Resulting windows; a
 *   shallow copy of the input list when it has at most one window.
 */
function absorbForwardCommentWindows(input) {
    const { windows, lines, lineTokenCounts } = input;
    if (windows.length <= 1) {
        return [...windows];
    }
    const tokenBudget = Math.floor(input.targetChunkTokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
    const canAbsorbInto = (comment, following) => {
        if (!windowLooksCommentOnly({ lines, startRow: comment.startRow, endRow: comment.endRow })) {
            return false;
        }
        const gapLines = Math.max(0, following.startRow - comment.endRow - 1);
        const spanLines = following.endRow - comment.startRow + 1;
        return gapLines <= 1 &&
            spanLines <= input.semanticMergeMaxSpanLines &&
            rangeTokenCount(lineTokenCounts, comment.startRow, following.endRow) <= tokenBudget;
    };
    const result = [];
    let index = 0;
    while (index < windows.length) {
        const current = windows[index];
        const following = windows[index + 1];
        index += 1;
        if (!current) {
            continue;
        }
        if (following && canAbsorbInto(current, following)) {
            result.push({ startRow: current.startRow, endRow: following.endRow });
            // Skip the window that was just merged in.
            index += 1;
        }
        else {
            result.push({ ...current });
        }
    }
    return result;
}
632
/**
 * Turns row windows into chunk records.
 *
 * Each window is passed through `trimLineRange`; windows for which it yields
 * nothing are dropped. Line numbers in the result are 1-based and the snippet
 * is the covered lines joined with "\n". At most `maxChunks` chunks are produced.
 *
 * @param {object} input
 * @param {{ startRow: number, endRow: number }[]} input.windows
 * @param {string[]} input.lines
 * @param {number} input.maxChunks
 * @returns {{ start_line: number, end_line: number, snippet: string }[]}
 */
function windowsToChunks(input) {
    const { windows, lines, maxChunks } = input;
    const result = [];
    for (const { startRow, endRow } of windows) {
        if (result.length >= maxChunks) {
            break;
        }
        const trimmed = trimLineRange(lines, startRow, endRow);
        if (trimmed) {
            const { start, end } = trimmed;
            result.push({
                start_line: start + 1,
                end_line: end + 1,
                snippet: lines.slice(start, end + 1).join("\n")
            });
        }
    }
    return result;
}
299
650
  function buildLanguageAwareChunks(input) {
300
651
  const languageAwareAttemptStart = Date.now();
652
+ const lineTokenCounts = computeLineTokenCounts(input.lines, input.tokenize);
301
653
  const parser = getParser(input.parserLanguage);
302
654
  if (!parser) {
303
655
  const fallbackStart = Date.now();
@@ -306,7 +658,8 @@ function buildLanguageAwareChunks(input) {
306
658
  tokenize: input.tokenize,
307
659
  targetChunkTokens: input.config.target_chunk_tokens,
308
660
  overlapTokens: input.config.chunk_overlap_tokens,
309
- maxChunks: input.config.max_chunks_per_file
661
+ maxChunks: input.config.max_chunks_per_file,
662
+ lineTokenCounts
310
663
  });
311
664
  return {
312
665
  chunks,
@@ -329,7 +682,8 @@ function buildLanguageAwareChunks(input) {
329
682
  tokenize: input.tokenize,
330
683
  targetChunkTokens: input.config.target_chunk_tokens,
331
684
  overlapTokens: input.config.chunk_overlap_tokens,
332
- maxChunks: input.config.max_chunks_per_file
685
+ maxChunks: input.config.max_chunks_per_file,
686
+ lineTokenCounts
333
687
  });
334
688
  return {
335
689
  chunks,
@@ -349,7 +703,8 @@ function buildLanguageAwareChunks(input) {
349
703
  tokenize: input.tokenize,
350
704
  targetChunkTokens: input.config.target_chunk_tokens,
351
705
  overlapTokens: input.config.chunk_overlap_tokens,
352
- maxChunks: input.config.max_chunks_per_file
706
+ maxChunks: input.config.max_chunks_per_file,
707
+ lineTokenCounts
353
708
  });
354
709
  return {
355
710
  chunks,
@@ -361,74 +716,127 @@ function buildLanguageAwareChunks(input) {
361
716
  language: parserLanguageToCanonical(input.parserLanguage)
362
717
  };
363
718
  }
364
- const boundaryTypes = DEFAULT_BOUNDARY_NODE_TYPES[input.parserLanguage];
365
- const candidates = root.descendantsOfType([...boundaryTypes]);
366
- const boundaryNodes = candidates
367
- .filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
368
- .sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
369
- if (boundaryNodes.length === 0) {
370
- const fallbackStart = Date.now();
371
- const chunks = buildSlidingChunks({
719
+ let chunks = [];
720
+ let recursiveSemanticChunkingUsed = false;
721
+ if (input.config.recursive_semantic_chunking_enabled) {
722
+ const semanticMergeGapLines = input.config.semantic_merge_gap_lines ?? 6;
723
+ const semanticMergeMaxSpanLines = input.config.semantic_merge_max_span_lines ?? 220;
724
+ const recursiveWindows = buildRecursiveSemanticWindows({
725
+ root,
372
726
  lines: input.lines,
373
- tokenize: input.tokenize,
727
+ lineTokenCounts,
374
728
  targetChunkTokens: input.config.target_chunk_tokens,
375
- overlapTokens: input.config.chunk_overlap_tokens,
729
+ maxChunks: input.config.max_chunks_per_file,
730
+ boundaryStrictness: input.config.boundary_strictness
731
+ });
732
+ const mergedWindows = mergeSemanticWindows({
733
+ windows: recursiveWindows,
734
+ lineTokenCounts,
735
+ targetChunkTokens: input.config.target_chunk_tokens,
736
+ semanticMergeGapLines,
737
+ semanticMergeMaxSpanLines
738
+ });
739
+ const absorbedWindows = input.config.comment_forward_absorb_enabled === false
740
+ ? mergedWindows
741
+ : absorbForwardCommentWindows({
742
+ windows: mergedWindows,
743
+ lines: input.lines,
744
+ lineTokenCounts,
745
+ targetChunkTokens: input.config.target_chunk_tokens,
746
+ semanticMergeMaxSpanLines
747
+ });
748
+ chunks = windowsToChunks({
749
+ windows: absorbedWindows,
750
+ lines: input.lines,
376
751
  maxChunks: input.config.max_chunks_per_file
377
752
  });
378
- return {
379
- chunks,
380
- strategy: "sliding",
381
- fallback_reason: "empty_language_boundaries",
382
- parse_latency_ms: parseLatencyMs,
383
- language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
384
- fallback_path_latency_ms: Date.now() - fallbackStart,
385
- language: parserLanguageToCanonical(input.parserLanguage)
386
- };
753
+ recursiveSemanticChunkingUsed = chunks.length > 0;
387
754
  }
388
- const segments = [];
389
- let cursor = 0;
390
- const lastRow = Math.max(0, input.lines.length - 1);
391
- for (const node of boundaryNodes) {
392
- const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
393
- const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
394
- if (startRow > cursor) {
395
- segments.push({ startRow: cursor, endRow: startRow - 1 });
755
+ else {
756
+ const boundaryTypes = getBoundaryTypes(input.parserLanguage, input.config.boundary_strictness);
757
+ const candidates = root.descendantsOfType([...boundaryTypes]);
758
+ const boundaryNodes = candidates
759
+ .filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
760
+ .filter((node) => isLanguageBoundaryCandidate(input.parserLanguage, node, input.config.boundary_strictness))
761
+ .sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
762
+ if (boundaryNodes.length === 0) {
763
+ const fallbackStart = Date.now();
764
+ const fallbackChunks = buildSlidingChunks({
765
+ lines: input.lines,
766
+ tokenize: input.tokenize,
767
+ targetChunkTokens: input.config.target_chunk_tokens,
768
+ overlapTokens: input.config.chunk_overlap_tokens,
769
+ maxChunks: input.config.max_chunks_per_file,
770
+ lineTokenCounts
771
+ });
772
+ return {
773
+ chunks: fallbackChunks,
774
+ strategy: "sliding",
775
+ fallback_reason: "empty_language_boundaries",
776
+ parse_latency_ms: parseLatencyMs,
777
+ language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
778
+ fallback_path_latency_ms: Date.now() - fallbackStart,
779
+ language: parserLanguageToCanonical(input.parserLanguage)
780
+ };
396
781
  }
397
- segments.push({ startRow, endRow });
398
- cursor = endRow + 1;
399
- if (cursor > lastRow) {
400
- break;
782
+ const segments = [];
783
+ let cursor = 0;
784
+ const lastRow = Math.max(0, input.lines.length - 1);
785
+ for (const node of boundaryNodes) {
786
+ const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
787
+ const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
788
+ if (startRow > cursor) {
789
+ segments.push({ startRow: cursor, endRow: startRow - 1, boundary: false });
790
+ }
791
+ segments.push({ startRow, endRow, boundary: true });
792
+ cursor = endRow + 1;
793
+ if (cursor > lastRow) {
794
+ break;
795
+ }
401
796
  }
402
- }
403
- if (cursor <= lastRow) {
404
- segments.push({ startRow: cursor, endRow: lastRow });
405
- }
406
- const chunks = [];
407
- for (const segment of segments) {
408
- if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
409
- continue;
797
+ if (cursor <= lastRow) {
798
+ segments.push({ startRow: cursor, endRow: lastRow, boundary: false });
410
799
  }
411
- const pieces = splitRangeWithBudget({
412
- lines: input.lines,
413
- startRow: segment.startRow,
414
- endRow: segment.endRow,
415
- tokenize: input.tokenize,
416
- targetChunkTokens: input.config.target_chunk_tokens,
417
- overlapTokens: input.config.chunk_overlap_tokens,
418
- maxChunks: input.config.max_chunks_per_file - chunks.length
419
- });
420
- for (const piece of pieces) {
421
- const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
422
- if (!trimmed) {
800
+ for (const segment of segments) {
801
+ if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
423
802
  continue;
424
803
  }
425
- chunks.push({
426
- start_line: trimmed.start + 1,
427
- end_line: trimmed.end + 1,
428
- snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
429
- });
430
- if (chunks.length >= input.config.max_chunks_per_file) {
431
- break;
804
+ const segmentTokenCount = lineTokenCounts
805
+ .slice(segment.startRow, segment.endRow + 1)
806
+ .reduce((sum, value) => sum + value, 0);
807
+ const enableSemanticBoundarySplits = input.config.boundary_strictness === "semantic_js_ts" &&
808
+ (input.parserLanguage === "javascript" ||
809
+ input.parserLanguage === "jsx" ||
810
+ input.parserLanguage === "typescript" ||
811
+ input.parserLanguage === "tsx") &&
812
+ segment.boundary;
813
+ const softMaxChunkTokens = Math.floor(input.config.target_chunk_tokens * SEMANTIC_JS_TS_SOFT_MAX_MULTIPLIER);
814
+ const pieces = enableSemanticBoundarySplits && segmentTokenCount <= softMaxChunkTokens
815
+ ? [{ startRow: segment.startRow, endRow: segment.endRow }]
816
+ : splitRangeWithBudget({
817
+ lines: input.lines,
818
+ lineTokenCounts,
819
+ startRow: segment.startRow,
820
+ endRow: segment.endRow,
821
+ targetChunkTokens: input.config.target_chunk_tokens,
822
+ overlapTokens: input.config.chunk_overlap_tokens,
823
+ maxChunks: input.config.max_chunks_per_file - chunks.length,
824
+ preferSafeBoundarySplit: enableSemanticBoundarySplits,
825
+ softMaxChunkTokens
826
+ });
827
+ for (const piece of pieces) {
828
+ const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
829
+ if (!trimmed) {
830
+ continue;
831
+ }
832
+ chunks.push({
833
+ start_line: trimmed.start + 1,
834
+ end_line: trimmed.end + 1,
835
+ snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
836
+ });
837
+ if (chunks.length >= input.config.max_chunks_per_file) {
838
+ break;
839
+ }
432
840
  }
433
841
  }
434
842
  }
@@ -439,7 +847,8 @@ function buildLanguageAwareChunks(input) {
439
847
  tokenize: input.tokenize,
440
848
  targetChunkTokens: input.config.target_chunk_tokens,
441
849
  overlapTokens: input.config.chunk_overlap_tokens,
442
- maxChunks: input.config.max_chunks_per_file
850
+ maxChunks: input.config.max_chunks_per_file,
851
+ lineTokenCounts
443
852
  });
444
853
  return {
445
854
  chunks: slidingChunks,
@@ -456,7 +865,8 @@ function buildLanguageAwareChunks(input) {
456
865
  strategy: "language_aware",
457
866
  parse_latency_ms: parseLatencyMs,
458
867
  language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
459
- language: parserLanguageToCanonical(input.parserLanguage)
868
+ language: parserLanguageToCanonical(input.parserLanguage),
869
+ recursive_semantic_chunking_used: recursiveSemanticChunkingUsed
460
870
  };
461
871
  }
462
872
  catch {
@@ -466,7 +876,8 @@ function buildLanguageAwareChunks(input) {
466
876
  tokenize: input.tokenize,
467
877
  targetChunkTokens: input.config.target_chunk_tokens,
468
878
  overlapTokens: input.config.chunk_overlap_tokens,
469
- maxChunks: input.config.max_chunks_per_file
879
+ maxChunks: input.config.max_chunks_per_file,
880
+ lineTokenCounts
470
881
  });
471
882
  return {
472
883
  chunks,