flappa-doormal 2.6.2 → 2.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +5 -11
- package/dist/index.mjs +243 -181
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/AGENTS.md
CHANGED
@@ -48,16 +48,6 @@ src/
 ├── fuzzy.test.ts # Fuzzy matching tests
 ├── textUtils.test.ts # Text utility tests
 └── match-utils.test.ts # Utility function tests
-
-test/
-├── 2576.json # Test data for book 2576 (Sahih Bukhari)
-└── 2588.json # Test data for book 2588 (Al-Mughni)
-
-docs/
-├── checkpoints/ # AI agent handoff documentation
-│   └── 2025-12-09-handoff.md
-└── reviews/ # Performance analysis reports
-    └── 2025-12-10/
 ```
 
 ### Core Components
@@ -92,7 +82,9 @@ docs/
 - `buildExcludeSet()` - Create Set from PageRange[] for O(1) lookups
 - `createSegment()` - Create segment with optional to/meta fields
 - `expandBreakpoints()` - Expand patterns with pre-compiled regexes
-- `
+- `buildBoundaryPositions()` - Build position map of page boundaries for O(log n) lookups
+- `findPageIndexForPosition()` - Binary search to find page index for a character position
+- `estimateStartOffsetInCurrentPage()` - Estimate offset when segment starts mid-page
 - `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
 - `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
 - `findBreakPosition()` - Find break position using breakpoint patterns
@@ -362,6 +354,8 @@ bunx biome lint .
 
 10. **Page boundary detection needs progressive prefixes**: When breakpoints split content mid-page, checking only the first N characters of a page to detect if the segment ends on that page can fail. Solution: try progressively shorter prefixes (`[80, 60, 40, 30, 20, 15, 12, 10, 8, 6]`) via `JOINER_PREFIX_LENGTHS`. The check uses `indexOf(...) > 0` (not `>= 0`) to avoid false positives when a page prefix appears at position 0 (which indicates the segment *starts* with that page, not *ends* on it).
 
+11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.
+
 ### Architecture Insights
 
 - **Declarative > Imperative**: Users describe patterns, library handles regex
package/dist/index.mjs
CHANGED
@@ -411,6 +411,72 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
 	return -1;
 };
 /**
+ * Builds a boundary position map for pages within the given range.
+ *
+ * This function computes page boundaries once per segment and enables
+ * O(log n) page lookups via binary search with `findPageIndexForPosition`.
+ *
+ * Boundaries are derived from segmentContent (post-structural-rules).
+ * When the segment starts mid-page, an offset correction is applied to
+ * keep boundary estimates aligned with the segment's actual content space.
+ *
+ * @param segmentContent - Full segment content (already processed by structural rules)
+ * @param fromIdx - Starting page index
+ * @param toIdx - Ending page index
+ * @param pageIds - Array of all page IDs
+ * @param normalizedPages - Map of page ID to normalized content
+ * @param cumulativeOffsets - Cumulative character offsets (for estimates)
+ * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
+ * with a sentinel boundary at segmentContent.length as the last element
+ *
+ * @example
+ * // For a 3-page segment:
+ * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
+ * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
+ */
+const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+	const boundaryPositions = [0];
+	const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
+	for (let i = fromIdx + 1; i <= toIdx; i++) {
+		const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
+		const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
+		const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
+		if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
+		else {
+			const estimate = Math.max(prevBoundary + 1, expectedBoundary);
+			boundaryPositions.push(Math.min(estimate, segmentContent.length));
+		}
+	}
+	boundaryPositions.push(segmentContent.length);
+	return boundaryPositions;
+};
+/**
+ * Binary search to find which page a position falls within.
+ * Uses "largest i where boundaryPositions[i] <= position" semantics.
+ *
+ * @param position - Character position in segmentContent
+ * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
+ * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
+ * @returns Page index in pageIds array
+ *
+ * @example
+ * // With boundaries [0, 20, 40, 60] and fromIdx=0:
+ * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
+ * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
+ * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
+ */
+const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
+	if (boundaryPositions.length <= 1) return fromIdx;
+	let left = 0;
+	let right = boundaryPositions.length - 2;
+	while (left < right) {
+		const mid = Math.ceil((left + right) / 2);
+		if (boundaryPositions[mid] <= position) left = mid;
+		else right = mid - 1;
+	}
+	return fromIdx + left;
+};
+/**
  * Finds the end position of a breakpoint window inside `remainingContent`.
  *
  * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
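The @example annotations above can be exercised directly. A small harness: the function body is copied verbatim from the added code, so this only re-states its documented semantics.

```js
const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
	if (boundaryPositions.length <= 1) return fromIdx;
	let left = 0;
	let right = boundaryPositions.length - 2; // skip the sentinel at the end
	while (left < right) {
		const mid = Math.ceil((left + right) / 2);
		if (boundaryPositions[mid] <= position) left = mid;
		else right = mid - 1;
	}
	return fromIdx + left;
};

const boundaries = [0, 20, 40, 60]; // 60 = content length (sentinel)
console.log(findPageIndexForPosition(15, boundaries, 0)); // 0: first page
console.log(findPageIndexForPosition(25, boundaries, 0)); // 1: second page
console.log(findPageIndexForPosition(40, boundaries, 0)); // 2: a position exactly on a boundary belongs to that page
```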
@@ -446,59 +512,6 @@ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds
 	return -1;
 };
 /**
- * Finds the actual ending page index by searching backwards for page content prefix.
- * Used to determine which page a segment actually ends on based on content matching.
- *
- * @param pieceContent - Content of the segment piece
- * @param currentFromIdx - Current starting index in pageIds
- * @param toIdx - Maximum ending index to search
- * @param pageIds - Array of page IDs
- * @param normalizedPages - Map of page ID to normalized content
- * @returns The actual ending page index
- */
-const findActualEndPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
-	for (let pi = toIdx; pi > currentFromIdx; pi--) {
-		const pageData = normalizedPages.get(pageIds[pi]);
-		if (!pageData) continue;
-		const trimmedContent = pageData.content.trimStart();
-		for (const len of JOINER_PREFIX_LENGTHS) {
-			const checkPortion = trimmedContent.slice(0, Math.min(len, trimmedContent.length)).trim();
-			if (checkPortion.length > 0 && pieceContent.indexOf(checkPortion) > 0) return pi;
-		}
-	}
-	return currentFromIdx;
-};
-/**
- * Finds the actual starting page index by searching forwards for page content prefix.
- * Used to determine which page content actually starts from based on content matching.
- *
- * This is the counterpart to findActualEndPage - it searches forward to find which
- * page the content starts on, rather than which page it ends on.
- *
- * @param pieceContent - Content of the segment piece
- * @param currentFromIdx - Current starting index in pageIds
- * @param toIdx - Maximum ending index to search
- * @param pageIds - Array of page IDs
- * @param normalizedPages - Map of page ID to normalized content
- * @returns The actual starting page index
- */
-const findActualStartPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
-	const trimmedPiece = pieceContent.trimStart();
-	if (!trimmedPiece) return currentFromIdx;
-	for (let pi = currentFromIdx; pi <= toIdx; pi++) {
-		const pageData = normalizedPages.get(pageIds[pi]);
-		if (pageData) {
-			const pagePrefix = pageData.content.slice(0, Math.min(30, pageData.length)).trim();
-			const piecePrefix = trimmedPiece.slice(0, Math.min(30, trimmedPiece.length));
-			if (pagePrefix.length > 0) {
-				if (trimmedPiece.startsWith(pagePrefix)) return pi;
-				if (pageData.content.trimStart().startsWith(piecePrefix)) return pi;
-			}
-		}
-	}
-	return currentFromIdx;
-};
-/**
  * Checks if any page in a range is excluded by the given exclude set.
  *
  * @param excludeSet - Set of excluded page IDs
@@ -630,10 +643,22 @@ const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
 };
 const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
 const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
-
-
+/**
+ * Computes the actual start and end page indices for a piece using
+ * precomputed boundary positions and binary search.
+ *
+ * @param pieceStartPos - Start position of the piece in the full segment content
+ * @param pieceEndPos - End position (exclusive) of the piece
+ * @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
+ * @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
+ * @param toIdx - Maximum page index
+ * @returns Object with actualStartIdx and actualEndIdx
+ */
+const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
+	const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
+	const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
 	return {
-		actualEndIdx:
+		actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
 		actualStartIdx
 	};
 };
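A worked example of the rewired attribution, assuming the two helpers added earlier in this diff (`findPageIndexForPosition`, `computePiecePages`) are in scope; they are internal to the bundle, so this is illustration, not public API.

```js
// With boundaries [0, 20, 40, 60] (sentinel 60) and a piece spanning [15, 45):
// - the piece starts at position 15, which falls on page 0 (15 < 20);
// - its last character sits at position 44, which falls on page 2 (40 <= 44),
//   and the result is then capped at toIdx.
const { actualEndIdx, actualStartIdx } = computePiecePages(15, 45, [0, 20, 40, 60], 0, 2);
console.log(actualStartIdx, actualEndIdx); // 0 2
```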
@@ -650,79 +675,87 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
 	return nextFromIdx;
 };
 const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
+/**
+ * Finds the break offset within a window, trying exclusions first, then patterns.
+ *
+ * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
+ */
+const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
+	if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
+		const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+		if (exclusionBreak > 0) return exclusionBreak;
+	}
+	const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+		expandedBreakpoints,
+		normalizedPages,
+		pageIds,
+		prefer
+	});
+	return patternBreak > 0 ? patternBreak : windowEndPosition;
+};
+/**
+ * Advances cursor position past any leading whitespace.
+ */
+const skipWhitespace = (content, startPos) => {
+	let pos = startPos;
+	while (pos < content.length && /\s/.test(content[pos])) pos++;
+	return pos;
+};
+/**
+ * Processes an oversized segment by iterating through the content and
+ * breaking it into smaller pieces that fit within maxPages constraints.
+ *
+ * Uses precomputed boundary positions for O(log n) page attribution lookups.
+ */
 const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
 	const result = [];
-
+	const fullContent = segment.content;
+	let cursorPos = 0;
 	let currentFromIdx = fromIdx;
 	let isFirstPiece = true;
-
+	const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+	logger?.debug?.("[breakpoints] boundaryPositions built", {
+		boundaryPositions,
+		fromIdx,
+		fullContentLength: fullContent.length,
+		toIdx
+	});
 	const maxIterations = 1e4;
-
-
-		if (
-			logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
-			break;
-		}
-		const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
+	for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
+		const remainingContent = fullContent.slice(cursorPos);
+		if (!remainingContent.trim()) break;
 		const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+		const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
 		if (remainingSpan <= maxPages && !remainingHasExclusions) {
 			const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
 			if (finalSeg) result.push(finalSeg);
 			break;
 		}
 		const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
-		logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
-			currentFromIdx,
-			currentFromPageId: pageIds[currentFromIdx],
-			remainingContentStart: remainingContent.slice(0, 50),
-			remainingContentLength: remainingContent.length,
-			remainingSpan,
-			toIdx,
-			toPageId: pageIds[toIdx],
-			windowEndIdx,
-			windowEndPageId: pageIds[windowEndIdx]
-		});
 		const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
-
-
-
-
-			expandedBreakpoints,
-			normalizedPages,
-			pageIds,
-			prefer
+		logger?.debug?.(`[breakpoints] iteration=${i}`, {
+			currentFromIdx,
+			cursorPos,
+			windowEndIdx
 		});
-
-		const
-
-
-
-
+		const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+		const breakPos = cursorPos + breakOffset;
+		const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
+		const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
+		logger?.trace?.("[breakpoints] piece", {
+			actualEndIdx,
+			actualStartIdx,
+			pieceLength: pieceContent.length
 		});
-		const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
 		if (pieceContent) {
 			const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
 			if (pieceSeg) result.push(pieceSeg);
 		}
-
-
-			actualEndIdx,
-			remainingContentLength: remainingContent.length,
-			remainingContentStart: remainingContent.slice(0, 60)
-		});
-		if (!remainingContent) {
-			logger?.debug?.("[breakpoints] done: no remaining content");
-			break;
-		}
-		currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
-		logger?.debug?.("[breakpoints] nextIteration", {
-			currentFromIdx,
-			currentFromPageId: pageIds[currentFromIdx]
-		});
+		cursorPos = skipWhitespace(fullContent, breakPos);
+		currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
 		isFirstPiece = false;
 	}
-	logger?.debug?.("[breakpoints]
+	logger?.debug?.("[breakpoints] done", { resultCount: result.length });
 	return result;
 };
 /**
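The structural change here is the move from reassigning the remaining content each iteration to a single forward-only cursor over `fullContent`, which is why the old `INFINITE LOOP DETECTED` branch could be folded into the loop condition. A minimal sketch of that pattern (hypothetical `findBreakOffset`, not the library's signature):

```js
// Forward-only cursor: each iteration consumes at least one character
// (the real code falls back to windowEndPosition and is additionally
// bounded by maxIterations), so the loop terminates.
const splitByCursor = (content, findBreakOffset) => {
	const pieces = [];
	let cursor = 0;
	while (cursor < content.length) {
		const offset = Math.max(1, findBreakOffset(content.slice(cursor)));
		pieces.push(content.slice(cursor, cursor + offset).trim());
		cursor += offset;
		while (cursor < content.length && /\s/.test(content[cursor])) cursor++; // skipWhitespace
	}
	return pieces.filter(Boolean);
};

console.log(splitByCursor("aa bb cc", (rest) => rest.indexOf(" ") + 1 || rest.length));
// → [ "aa", "bb", "cc" ]
```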
@@ -905,6 +938,77 @@ const anyRuleAllowsId = (rules, pageId) => {
 	});
 };
 
+//#endregion
+//#region src/segmentation/replace.ts
+const DEFAULT_REPLACE_FLAGS = "gu";
+const normalizeReplaceFlags = (flags) => {
+	if (!flags) return DEFAULT_REPLACE_FLAGS;
+	const allowed = new Set([
+		"g",
+		"i",
+		"m",
+		"s",
+		"u",
+		"y"
+	]);
+	const set = /* @__PURE__ */ new Set();
+	for (const ch of flags) {
+		if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
+		set.add(ch);
+	}
+	set.add("g");
+	set.add("u");
+	return [
+		"g",
+		"i",
+		"m",
+		"s",
+		"y",
+		"u"
+	].filter((c) => set.has(c)).join("");
+};
+const compileReplaceRules = (rules) => {
+	const compiled = [];
+	for (const r of rules) {
+		if (r.pageIds && r.pageIds.length === 0) continue;
+		const flags = normalizeReplaceFlags(r.flags);
+		const re = new RegExp(r.regex, flags);
+		compiled.push({
+			pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
+			re,
+			replacement: r.replacement
+		});
+	}
+	return compiled;
+};
+/**
+ * Applies ordered regex replacements to page content (per page).
+ *
+ * - Replacement rules are applied in array order.
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
+ *
+ * This function is intentionally **pure**:
+ * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
+ */
+const applyReplacements = (pages, rules) => {
+	if (!rules || rules.length === 0 || pages.length === 0) return pages;
+	const compiled = compileReplaceRules(rules);
+	if (compiled.length === 0) return pages;
+	return pages.map((p) => {
+		let content = p.content;
+		for (const rule of compiled) {
+			if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
+			content = content.replace(rule.re, rule.replacement);
+		}
+		if (content === p.content) return p;
+		return {
+			...p,
+			content
+		};
+	});
+};
+
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
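The relocated replace module reads naturally from the code above: pages are `{ id, content }` objects and each rule is `{ regex, replacement, flags?, pageIds? }`. A usage sketch that calls the internal helper directly, purely for illustration (in the public API these rules arrive via `segmentPages`' `options.replace`, as the last hunk below shows):

```js
const pages = [
	{ content: "Chapter 1: intro", id: 1 },
	{ content: "Chapter 2: BODY", id: 2 }
];
const rules = [
	{ regex: "Chapter", replacement: "Ch." },                        // all pages; flags default to "gu"
	{ flags: "i", pageIds: [2], regex: "body", replacement: "text" } // page 2 only, case-insensitive
];
const result = applyReplacements(pages, rules);
// result[0].content === "Ch. 1: intro"; result[1].content === "Ch. 2: text"
// `pages` itself is untouched; pages whose content did not change are returned as-is.
```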
@@ -1482,77 +1586,6 @@ const buildRuleRegex = (rule, capturePrefix) => {
 	};
 };
 
-//#endregion
-//#region src/segmentation/replace.ts
-const DEFAULT_REPLACE_FLAGS = "gu";
-const normalizeReplaceFlags = (flags) => {
-	if (!flags) return DEFAULT_REPLACE_FLAGS;
-	const allowed = new Set([
-		"g",
-		"i",
-		"m",
-		"s",
-		"u",
-		"y"
-	]);
-	const set = /* @__PURE__ */ new Set();
-	for (const ch of flags) {
-		if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
-		set.add(ch);
-	}
-	set.add("g");
-	set.add("u");
-	return [
-		"g",
-		"i",
-		"m",
-		"s",
-		"y",
-		"u"
-	].filter((c) => set.has(c)).join("");
-};
-const compileReplaceRules = (rules) => {
-	const compiled = [];
-	for (const r of rules) {
-		if (r.pageIds && r.pageIds.length === 0) continue;
-		const flags = normalizeReplaceFlags(r.flags);
-		const re = new RegExp(r.regex, flags);
-		compiled.push({
-			pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
-			re,
-			replacement: r.replacement
-		});
-	}
-	return compiled;
-};
-/**
- * Applies ordered regex replacements to page content (per page).
- *
- * - Replacement rules are applied in array order.
- * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
- * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
- *
- * This function is intentionally **pure**:
- * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
- */
-const applyReplacements = (pages, rules) => {
-	if (!rules || rules.length === 0 || pages.length === 0) return pages;
-	const compiled = compileReplaceRules(rules);
-	if (compiled.length === 0) return pages;
-	return pages.map((p) => {
-		let content = p.content;
-		for (const rule of compiled) {
-			if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
-			content = content.replace(rule.re, rule.replacement);
-		}
-		if (content === p.content) return p;
-		return {
-			...p,
-			content
-		};
-	});
-};
-
 //#endregion
 //#region src/segmentation/fast-fuzzy-prefix.ts
 /**
@@ -2122,14 +2155,43 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
 const segmentPages = (pages, options) => {
 	const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+	logger?.info?.("[segmenter] starting segmentation", {
+		breakpointCount: breakpoints.length,
+		maxPages,
+		pageCount: pages.length,
+		prefer,
+		ruleCount: rules.length
+	});
 	const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
 	const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
-
+	logger?.debug?.("[segmenter] content built", {
+		pageIds: pageMap.pageIds,
+		totalContentLength: matchContent.length
+	});
+	const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
+	const unique = dedupeSplitPoints(splitPoints);
+	logger?.debug?.("[segmenter] split points collected", {
+		rawSplitPoints: splitPoints.length,
+		uniqueSplitPoints: unique.length
+	});
+	let segments = buildSegments(unique, matchContent, pageMap, rules);
+	logger?.debug?.("[segmenter] structural segments built", {
+		segmentCount: segments.length,
+		segments: segments.map((s) => ({
+			contentLength: s.content.length,
+			from: s.from,
+			to: s.to
+		}))
+	});
 	segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
 	if (maxPages >= 0 && breakpoints.length) {
+		logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
 		const patternProcessor = (p) => processPattern(p, false).pattern;
-
+		const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+		logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
+		return result;
 	}
+	logger?.info?.("[segmenter] segmentation complete (structural only)", { finalSegmentCount: segments.length });
 	return segments;
 };
 /**