flappa-doormal 2.6.2 → 2.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -48,16 +48,6 @@ src/
48
48
  ├── fuzzy.test.ts # Fuzzy matching tests
49
49
  ├── textUtils.test.ts # Text utility tests
50
50
  └── match-utils.test.ts # Utility function tests
51
-
52
- test/
53
- ├── 2576.json # Test data for book 2576 (Sahih Bukhari)
54
- └── 2588.json # Test data for book 2588 (Al-Mughni)
55
-
56
- docs/
57
- ├── checkpoints/ # AI agent handoff documentation
58
- │ └── 2025-12-09-handoff.md
59
- └── reviews/ # Performance analysis reports
60
- └── 2025-12-10/
61
51
  ```
62
52
 
63
53
  ### Core Components
@@ -92,7 +82,9 @@ docs/
92
82
  - `buildExcludeSet()` - Create Set from PageRange[] for O(1) lookups
93
83
  - `createSegment()` - Create segment with optional to/meta fields
94
84
  - `expandBreakpoints()` - Expand patterns with pre-compiled regexes
95
- - `findActualEndPage()` - Search backwards for ending page using progressive prefix matching (handles mid-page splits)
85
+ - `buildBoundaryPositions()` - Build position map of page boundaries for O(log n) lookups
86
+ - `findPageIndexForPosition()` - Binary search to find page index for a character position
87
+ - `estimateStartOffsetInCurrentPage()` - Estimate offset when segment starts mid-page
96
88
  - `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
97
89
  - `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
98
90
  - `findBreakPosition()` - Find break position using breakpoint patterns
@@ -362,6 +354,8 @@ bunx biome lint .
362
354
 
363
355
  10. **Page boundary detection needs progressive prefixes**: When breakpoints split content mid-page, checking only the first N characters of a page to detect if the segment ends on that page can fail. Solution: try progressively shorter prefixes (`[80, 60, 40, 30, 20, 15, 12, 10, 8, 6]`) via `JOINER_PREFIX_LENGTHS`. The check uses `indexOf(...) > 0` (not `>= 0`) to avoid false positives when a page prefix appears at position 0 (which indicates the segment *starts* with that page, not *ends* on it).
364
356
 
357
+ 11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.
358
+
365
359
  ### Architecture Insights
366
360
 
367
361
  - **Declarative > Imperative**: Users describe patterns, library handles regex
package/dist/index.mjs CHANGED
@@ -411,6 +411,72 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
411
411
  return -1;
412
412
  };
413
413
  /**
414
+ * Builds a boundary position map for pages within the given range.
415
+ *
416
+ * This function computes page boundaries once per segment and enables
417
+ * O(log n) page lookups via binary search with `findPageIndexForPosition`.
418
+ *
419
+ * Boundaries are derived from segmentContent (post-structural-rules).
420
+ * When the segment starts mid-page, an offset correction is applied to
421
+ * keep boundary estimates aligned with the segment's actual content space.
422
+ *
423
+ * @param segmentContent - Full segment content (already processed by structural rules)
424
+ * @param fromIdx - Starting page index
425
+ * @param toIdx - Ending page index
426
+ * @param pageIds - Array of all page IDs
427
+ * @param normalizedPages - Map of page ID to normalized content
428
+ * @param cumulativeOffsets - Cumulative character offsets (for estimates)
429
+ * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
430
+ * with a sentinel boundary at segmentContent.length as the last element
431
+ *
432
+ * @example
433
+ * // For a 3-page segment:
434
+ * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
435
+ * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
436
+ */
437
+ const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
438
+ const boundaryPositions = [0];
439
+ const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
440
+ for (let i = fromIdx + 1; i <= toIdx; i++) {
441
+ const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
442
+ const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
443
+ const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
444
+ if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
445
+ else {
446
+ const estimate = Math.max(prevBoundary + 1, expectedBoundary);
447
+ boundaryPositions.push(Math.min(estimate, segmentContent.length));
448
+ }
449
+ }
450
+ boundaryPositions.push(segmentContent.length);
451
+ return boundaryPositions;
452
+ };
453
+ /**
454
+ * Binary search to find which page a position falls within.
455
+ * Uses "largest i where boundaryPositions[i] <= position" semantics; the trailing sentinel entry is excluded from the search.
456
+ *
457
+ * @param position - Character position in segmentContent
458
+ * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
459
+ * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
460
+ * @returns Page index in pageIds array
461
+ *
462
+ * @example
463
+ * // With boundaries [0, 20, 40, 60] (60 is the end-of-content sentinel, so 3 pages) and fromIdx=0:
464
+ * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
465
+ * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
466
+ * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
467
+ */
468
+ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
469
+ if (boundaryPositions.length <= 1) return fromIdx;
470
+ let left = 0;
471
+ let right = boundaryPositions.length - 2;
472
+ while (left < right) {
473
+ const mid = Math.ceil((left + right) / 2);
474
+ if (boundaryPositions[mid] <= position) left = mid;
475
+ else right = mid - 1;
476
+ }
477
+ return fromIdx + left;
478
+ };
479
+ /**
414
480
  * Finds the end position of a breakpoint window inside `remainingContent`.
415
481
  *
416
482
  * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
@@ -446,59 +512,6 @@ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds
446
512
  return -1;
447
513
  };
448
514
  /**
449
- * Finds the actual ending page index by searching backwards for page content prefix.
450
- * Used to determine which page a segment actually ends on based on content matching.
451
- *
452
- * @param pieceContent - Content of the segment piece
453
- * @param currentFromIdx - Current starting index in pageIds
454
- * @param toIdx - Maximum ending index to search
455
- * @param pageIds - Array of page IDs
456
- * @param normalizedPages - Map of page ID to normalized content
457
- * @returns The actual ending page index
458
- */
459
- const findActualEndPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
460
- for (let pi = toIdx; pi > currentFromIdx; pi--) {
461
- const pageData = normalizedPages.get(pageIds[pi]);
462
- if (!pageData) continue;
463
- const trimmedContent = pageData.content.trimStart();
464
- for (const len of JOINER_PREFIX_LENGTHS) {
465
- const checkPortion = trimmedContent.slice(0, Math.min(len, trimmedContent.length)).trim();
466
- if (checkPortion.length > 0 && pieceContent.indexOf(checkPortion) > 0) return pi;
467
- }
468
- }
469
- return currentFromIdx;
470
- };
471
- /**
472
- * Finds the actual starting page index by searching forwards for page content prefix.
473
- * Used to determine which page content actually starts from based on content matching.
474
- *
475
- * This is the counterpart to findActualEndPage - it searches forward to find which
476
- * page the content starts on, rather than which page it ends on.
477
- *
478
- * @param pieceContent - Content of the segment piece
479
- * @param currentFromIdx - Current starting index in pageIds
480
- * @param toIdx - Maximum ending index to search
481
- * @param pageIds - Array of page IDs
482
- * @param normalizedPages - Map of page ID to normalized content
483
- * @returns The actual starting page index
484
- */
485
- const findActualStartPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
486
- const trimmedPiece = pieceContent.trimStart();
487
- if (!trimmedPiece) return currentFromIdx;
488
- for (let pi = currentFromIdx; pi <= toIdx; pi++) {
489
- const pageData = normalizedPages.get(pageIds[pi]);
490
- if (pageData) {
491
- const pagePrefix = pageData.content.slice(0, Math.min(30, pageData.length)).trim();
492
- const piecePrefix = trimmedPiece.slice(0, Math.min(30, trimmedPiece.length));
493
- if (pagePrefix.length > 0) {
494
- if (trimmedPiece.startsWith(pagePrefix)) return pi;
495
- if (pageData.content.trimStart().startsWith(piecePrefix)) return pi;
496
- }
497
- }
498
- }
499
- return currentFromIdx;
500
- };
501
- /**
502
515
  * Checks if any page in a range is excluded by the given exclude set.
503
516
  *
504
517
  * @param excludeSet - Set of excluded page IDs
@@ -630,10 +643,22 @@ const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
630
643
  };
631
644
  const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
632
645
  const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
633
- const computePiecePages = (pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages) => {
634
- const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
646
+ /**
647
+ * Computes the actual start and end page indices for a piece using
648
+ * precomputed boundary positions and binary search.
649
+ *
650
+ * @param pieceStartPos - Start position of the piece in the full segment content
651
+ * @param pieceEndPos - End position (exclusive) of the piece
652
+ * @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
653
+ * @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
654
+ * @param toIdx - Maximum page index; `actualEndIdx` is clamped to this value
655
+ * @returns Object with actualStartIdx and actualEndIdx
656
+ */
657
+ const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
658
+ const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
659
+ const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
635
660
  return {
636
- actualEndIdx: pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx,
661
+ actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
637
662
  actualStartIdx
638
663
  };
639
664
  };
@@ -650,79 +675,87 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
650
675
  return nextFromIdx;
651
676
  };
652
677
  const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
678
+ /**
679
+ * Finds the break offset within a window, trying exclusions first, then patterns.
680
+ *
681
+ * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
682
+ */
683
+ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
684
+ if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
685
+ const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
686
+ if (exclusionBreak > 0) return exclusionBreak;
687
+ }
688
+ const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
689
+ expandedBreakpoints,
690
+ normalizedPages,
691
+ pageIds,
692
+ prefer
693
+ });
694
+ return patternBreak > 0 ? patternBreak : windowEndPosition;
695
+ };
696
+ /**
697
+ * Returns the first index at or after `startPos` whose character is not whitespace (or `content.length` if only whitespace remains).
698
+ */
699
+ const skipWhitespace = (content, startPos) => {
700
+ let pos = startPos;
701
+ while (pos < content.length && /\s/.test(content[pos])) pos++;
702
+ return pos;
703
+ };
704
+ /**
705
+ * Processes an oversized segment by iterating through the content and
706
+ * breaking it into smaller pieces that fit within maxPages constraints.
707
+ *
708
+ * Uses precomputed boundary positions for O(log n) page attribution lookups.
709
+ */
653
710
  const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
654
711
  const result = [];
655
- let remainingContent = segment.content;
712
+ const fullContent = segment.content;
713
+ let cursorPos = 0;
656
714
  let currentFromIdx = fromIdx;
657
715
  let isFirstPiece = true;
658
- let iterationCount = 0;
716
+ const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
717
+ logger?.debug?.("[breakpoints] boundaryPositions built", {
718
+ boundaryPositions,
719
+ fromIdx,
720
+ fullContentLength: fullContent.length,
721
+ toIdx
722
+ });
659
723
  const maxIterations = 1e4;
660
- while (currentFromIdx <= toIdx) {
661
- iterationCount++;
662
- if (iterationCount > maxIterations) {
663
- logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
664
- break;
665
- }
666
- const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
724
+ for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
725
+ const remainingContent = fullContent.slice(cursorPos);
726
+ if (!remainingContent.trim()) break;
667
727
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
728
+ const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
668
729
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
669
730
  const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
670
731
  if (finalSeg) result.push(finalSeg);
671
732
  break;
672
733
  }
673
734
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
674
- logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
675
- currentFromIdx,
676
- currentFromPageId: pageIds[currentFromIdx],
677
- remainingContentStart: remainingContent.slice(0, 50),
678
- remainingContentLength: remainingContent.length,
679
- remainingSpan,
680
- toIdx,
681
- toPageId: pageIds[toIdx],
682
- windowEndIdx,
683
- windowEndPageId: pageIds[windowEndIdx]
684
- });
685
735
  const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
686
- const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
687
- let breakPosition = -1;
688
- if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
689
- if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
690
- expandedBreakpoints,
691
- normalizedPages,
692
- pageIds,
693
- prefer
736
+ logger?.debug?.(`[breakpoints] iteration=${i}`, {
737
+ currentFromIdx,
738
+ cursorPos,
739
+ windowEndIdx
694
740
  });
695
- if (breakPosition <= 0) breakPosition = windowEndPosition;
696
- const pieceContent = remainingContent.slice(0, breakPosition).trim();
697
- logger?.debug?.("[breakpoints] selectedBreak", {
698
- breakPosition,
699
- pieceContentEnd: pieceContent.slice(-50),
700
- pieceContentLength: pieceContent.length,
701
- windowEndPosition
741
+ const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
742
+ const breakPos = cursorPos + breakOffset;
743
+ const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
744
+ const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
745
+ logger?.trace?.("[breakpoints] piece", {
746
+ actualEndIdx,
747
+ actualStartIdx,
748
+ pieceLength: pieceContent.length
702
749
  });
703
- const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
704
750
  if (pieceContent) {
705
751
  const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
706
752
  if (pieceSeg) result.push(pieceSeg);
707
753
  }
708
- remainingContent = remainingContent.slice(breakPosition).trim();
709
- logger?.debug?.("[breakpoints] afterSlice", {
710
- actualEndIdx,
711
- remainingContentLength: remainingContent.length,
712
- remainingContentStart: remainingContent.slice(0, 60)
713
- });
714
- if (!remainingContent) {
715
- logger?.debug?.("[breakpoints] done: no remaining content");
716
- break;
717
- }
718
- currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
719
- logger?.debug?.("[breakpoints] nextIteration", {
720
- currentFromIdx,
721
- currentFromPageId: pageIds[currentFromIdx]
722
- });
754
+ cursorPos = skipWhitespace(fullContent, breakPos);
755
+ currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
723
756
  isFirstPiece = false;
724
757
  }
725
- logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
758
+ logger?.debug?.("[breakpoints] done", { resultCount: result.length });
726
759
  return result;
727
760
  };
728
761
  /**
@@ -905,6 +938,77 @@ const anyRuleAllowsId = (rules, pageId) => {
905
938
  });
906
939
  };
907
940
 
941
+ //#endregion
942
+ //#region src/segmentation/replace.ts
943
+ const DEFAULT_REPLACE_FLAGS = "gu";
944
+ const normalizeReplaceFlags = (flags) => {
945
+ if (!flags) return DEFAULT_REPLACE_FLAGS;
946
+ const allowed = new Set([
947
+ "g",
948
+ "i",
949
+ "m",
950
+ "s",
951
+ "u",
952
+ "y"
953
+ ]);
954
+ const set = /* @__PURE__ */ new Set();
955
+ for (const ch of flags) {
956
+ if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
957
+ set.add(ch);
958
+ }
959
+ set.add("g");
960
+ set.add("u");
961
+ return [
962
+ "g",
963
+ "i",
964
+ "m",
965
+ "s",
966
+ "y",
967
+ "u"
968
+ ].filter((c) => set.has(c)).join("");
969
+ };
970
+ const compileReplaceRules = (rules) => {
971
+ const compiled = [];
972
+ for (const r of rules) {
973
+ if (r.pageIds && r.pageIds.length === 0) continue;
974
+ const flags = normalizeReplaceFlags(r.flags);
975
+ const re = new RegExp(r.regex, flags);
976
+ compiled.push({
977
+ pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
978
+ re,
979
+ replacement: r.replacement
980
+ });
981
+ }
982
+ return compiled;
983
+ };
984
+ /**
985
+ * Applies ordered regex replacements to page content (per page).
986
+ *
987
+ * - Replacement rules are applied in array order.
988
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
989
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
990
+ *
991
+ * This function is intentionally **pure**:
992
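+ * it never mutates its input. It returns the original `pages` array when there are no pages or no effective rules; otherwise it returns a new array in which unchanged pages are reused as-is and changed pages are shallow copies.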
993
+ */
994
+ const applyReplacements = (pages, rules) => {
995
+ if (!rules || rules.length === 0 || pages.length === 0) return pages;
996
+ const compiled = compileReplaceRules(rules);
997
+ if (compiled.length === 0) return pages;
998
+ return pages.map((p) => {
999
+ let content = p.content;
1000
+ for (const rule of compiled) {
1001
+ if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
1002
+ content = content.replace(rule.re, rule.replacement);
1003
+ }
1004
+ if (content === p.content) return p;
1005
+ return {
1006
+ ...p,
1007
+ content
1008
+ };
1009
+ });
1010
+ };
1011
+
908
1012
  //#endregion
909
1013
  //#region src/segmentation/tokens.ts
910
1014
  /**
@@ -1482,77 +1586,6 @@ const buildRuleRegex = (rule, capturePrefix) => {
1482
1586
  };
1483
1587
  };
1484
1588
 
1485
- //#endregion
1486
- //#region src/segmentation/replace.ts
1487
- const DEFAULT_REPLACE_FLAGS = "gu";
1488
- const normalizeReplaceFlags = (flags) => {
1489
- if (!flags) return DEFAULT_REPLACE_FLAGS;
1490
- const allowed = new Set([
1491
- "g",
1492
- "i",
1493
- "m",
1494
- "s",
1495
- "u",
1496
- "y"
1497
- ]);
1498
- const set = /* @__PURE__ */ new Set();
1499
- for (const ch of flags) {
1500
- if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
1501
- set.add(ch);
1502
- }
1503
- set.add("g");
1504
- set.add("u");
1505
- return [
1506
- "g",
1507
- "i",
1508
- "m",
1509
- "s",
1510
- "y",
1511
- "u"
1512
- ].filter((c) => set.has(c)).join("");
1513
- };
1514
- const compileReplaceRules = (rules) => {
1515
- const compiled = [];
1516
- for (const r of rules) {
1517
- if (r.pageIds && r.pageIds.length === 0) continue;
1518
- const flags = normalizeReplaceFlags(r.flags);
1519
- const re = new RegExp(r.regex, flags);
1520
- compiled.push({
1521
- pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
1522
- re,
1523
- replacement: r.replacement
1524
- });
1525
- }
1526
- return compiled;
1527
- };
1528
- /**
1529
- * Applies ordered regex replacements to page content (per page).
1530
- *
1531
- * - Replacement rules are applied in array order.
1532
- * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
1533
- * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
1534
- *
1535
- * This function is intentionally **pure**:
1536
- * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
1537
- */
1538
- const applyReplacements = (pages, rules) => {
1539
- if (!rules || rules.length === 0 || pages.length === 0) return pages;
1540
- const compiled = compileReplaceRules(rules);
1541
- if (compiled.length === 0) return pages;
1542
- return pages.map((p) => {
1543
- let content = p.content;
1544
- for (const rule of compiled) {
1545
- if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
1546
- content = content.replace(rule.re, rule.replacement);
1547
- }
1548
- if (content === p.content) return p;
1549
- return {
1550
- ...p,
1551
- content
1552
- };
1553
- });
1554
- };
1555
-
1556
1589
  //#endregion
1557
1590
  //#region src/segmentation/fast-fuzzy-prefix.ts
1558
1591
  /**
@@ -2122,14 +2155,43 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
2122
2155
  */
2123
2156
  const segmentPages = (pages, options) => {
2124
2157
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
2158
+ logger?.info?.("[segmenter] starting segmentation", {
2159
+ breakpointCount: breakpoints.length,
2160
+ maxPages,
2161
+ pageCount: pages.length,
2162
+ prefer,
2163
+ ruleCount: rules.length
2164
+ });
2125
2165
  const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
2126
2166
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
2127
- let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
2167
+ logger?.debug?.("[segmenter] content built", {
2168
+ pageIds: pageMap.pageIds,
2169
+ totalContentLength: matchContent.length
2170
+ });
2171
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
2172
+ const unique = dedupeSplitPoints(splitPoints);
2173
+ logger?.debug?.("[segmenter] split points collected", {
2174
+ rawSplitPoints: splitPoints.length,
2175
+ uniqueSplitPoints: unique.length
2176
+ });
2177
+ let segments = buildSegments(unique, matchContent, pageMap, rules);
2178
+ logger?.debug?.("[segmenter] structural segments built", {
2179
+ segmentCount: segments.length,
2180
+ segments: segments.map((s) => ({
2181
+ contentLength: s.content.length,
2182
+ from: s.from,
2183
+ to: s.to
2184
+ }))
2185
+ });
2128
2186
  segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
2129
2187
  if (maxPages >= 0 && breakpoints.length) {
2188
+ logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
2130
2189
  const patternProcessor = (p) => processPattern(p, false).pattern;
2131
- return applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2190
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2191
+ logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
2192
+ return result;
2132
2193
  }
2194
+ logger?.info?.("[segmenter] segmentation complete (structural only)", { finalSegmentCount: segments.length });
2133
2195
  return segments;
2134
2196
  };
2135
2197
  /**