flappa-doormal 2.2.2 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -157,6 +157,26 @@ const makeDiacriticInsensitive = (text) => {
157
157
 
158
158
  //#endregion
159
159
  //#region src/segmentation/breakpoint-utils.ts
160
+ const WINDOW_PREFIX_LENGTHS = [
161
+ 80,
162
+ 60,
163
+ 40,
164
+ 30,
165
+ 20,
166
+ 15
167
+ ];
168
+ const JOINER_PREFIX_LENGTHS = [
169
+ 80,
170
+ 60,
171
+ 40,
172
+ 30,
173
+ 20,
174
+ 15,
175
+ 12,
176
+ 10,
177
+ 8,
178
+ 6
179
+ ];
160
180
  /**
161
181
  * Normalizes a breakpoint to the object form.
162
182
  * Strings are converted to { pattern: str } with no constraints.
@@ -312,6 +332,120 @@ const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp
312
332
  }
313
333
  });
314
334
  /**
335
+ * Applies a configured joiner at detected page boundaries within a multi-page content chunk.
336
+ *
337
+ * This is used for breakpoint-generated segments which don't have access to the original
338
+ * `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
339
+ * prefix after the previous boundary, then replace ONLY the single newline immediately before
340
+ * that page start.
341
+ *
342
+ * This avoids converting real in-page newlines, while still normalizing page joins consistently.
343
+ */
344
+ const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
345
+ if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
346
+ let updated = content;
347
+ let searchFrom = 0;
348
+ for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
349
+ const pageData = normalizedPages.get(pageIds[pi]);
350
+ if (!pageData) continue;
351
+ const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
352
+ if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
353
+ if (found > 0) searchFrom = found;
354
+ }
355
+ return updated;
356
+ };
357
+ /**
358
+ * Finds the position of a page prefix in content, trying multiple prefix lengths.
359
+ */
360
+ const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
361
+ for (const len of JOINER_PREFIX_LENGTHS) {
362
+ const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
363
+ if (!prefix) continue;
364
+ const pos = content.indexOf(prefix, searchFrom);
365
+ if (pos > 0) return pos;
366
+ }
367
+ return -1;
368
+ };
369
+ /**
370
+ * Estimates how far into the current page `remainingContent` begins.
371
+ *
372
+ * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
373
+ * When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
374
+ * expected boundary positions. This helper computes an approximate starting offset by matching
375
+ * a short prefix of `remainingContent` inside the current page content.
376
+ */
377
+ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
378
+ const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
379
+ if (!currentPageData) return 0;
380
+ const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
381
+ const needle = remStart.slice(0, Math.min(30, remStart.length));
382
+ if (!needle) return 0;
383
+ const idx = currentPageData.content.indexOf(needle);
384
+ return idx > 0 ? idx : 0;
385
+ };
386
+ /**
387
+ * Attempts to find the start position of a target page within remainingContent,
388
+ * anchored near an expected boundary position to reduce collisions.
389
+ *
390
+ * This is used to define breakpoint windows in terms of actual content being split, rather than
391
+ * raw per-page offsets which can desync when structural rules strip markers.
392
+ */
393
+ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
394
+ const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
395
+ if (!targetPageData) return -1;
396
+ const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
397
+ const searchStart = Math.max(0, approx - 1e4);
398
+ const searchEnd = Math.min(remainingContent.length, approx + 2e3);
399
+ const targetTrimmed = targetPageData.content.trimStart();
400
+ for (const len of WINDOW_PREFIX_LENGTHS) {
401
+ const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
402
+ if (!prefix) continue;
403
+ let pos = remainingContent.indexOf(prefix, searchStart);
404
+ while (pos !== -1 && pos <= searchEnd) {
405
+ if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
406
+ pos = remainingContent.indexOf(prefix, pos + 1);
407
+ }
408
+ const last = remainingContent.lastIndexOf(prefix, approx);
409
+ if (last > 0) return last;
410
+ }
411
+ return -1;
412
+ };
413
+ /**
414
+ * Finds the end position of a breakpoint window inside `remainingContent`.
415
+ *
416
+ * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
417
+ * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
418
+ * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
419
+ */
420
+ const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
421
+ if (windowEndIdx >= toIdx) return remainingContent.length;
422
+ const desiredNextIdx = windowEndIdx + 1;
423
+ const minNextIdx = currentFromIdx + 1;
424
+ const maxNextIdx = Math.min(desiredNextIdx, toIdx);
425
+ const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
426
+ for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
427
+ const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
428
+ const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
429
+ if (pos > 0) return pos;
430
+ }
431
+ return remainingContent.length;
432
+ };
433
+ /**
434
+ * Finds exclusion-based break position using raw cumulative offsets.
435
+ *
436
+ * This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
437
+ * Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
438
+ */
439
+ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
440
+ const startingPageId = pageIds[currentFromIdx];
441
+ if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
442
+ for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
443
+ const pageId = pageIds[pageIdx];
444
+ if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
445
+ }
446
+ return -1;
447
+ };
448
+ /**
315
449
  * Finds the actual ending page index by searching backwards for page content prefix.
316
450
  * Used to determine which page a segment actually ends on based on content matching.
317
451
  *
@@ -414,6 +548,21 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
414
548
  return selected.index + selected.length;
415
549
  };
416
550
  /**
551
+ * Handles page boundary breakpoint (empty pattern).
552
+ * Returns break position or -1 if no valid position found.
553
+ */
554
+ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
555
+ const nextPageIdx = windowEndIdx + 1;
556
+ if (nextPageIdx <= toIdx) {
557
+ const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
558
+ if (nextPageData) {
559
+ const pos = findNextPagePosition(remainingContent, nextPageData);
560
+ if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
561
+ }
562
+ }
563
+ return Math.min(windowEndPosition, remainingContent.length);
564
+ };
565
+ /**
417
566
  * Tries to find a break position within the current window using breakpoint patterns.
418
567
  * Returns the break position or -1 if no suitable break was found.
419
568
  *
@@ -424,30 +573,165 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
424
573
  * @param ctx - Breakpoint context with page data and patterns
425
574
  * @returns Break position in the content, or -1 if no break found
426
575
  */
427
- const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, ctx) => {
428
- const { pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, prefer } = ctx;
576
+ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
577
+ const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
429
578
  for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
430
579
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
431
580
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
432
581
  if (skipWhenRegex?.test(remainingContent)) continue;
433
- if (regex === null) {
434
- const nextPageIdx = windowEndIdx + 1;
435
- if (nextPageIdx <= toIdx) {
436
- const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
437
- if (nextPageData) {
438
- const pos = findNextPagePosition(remainingContent, nextPageData);
439
- if (pos > 0) return pos;
440
- }
441
- }
442
- return Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
443
- }
444
- const windowEndPosition = Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
445
- const breakPos = findPatternBreakPosition(remainingContent.slice(0, windowEndPosition), regex, prefer);
582
+ if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
583
+ const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
446
584
  if (breakPos > 0) return breakPos;
447
585
  }
448
586
  return -1;
449
587
  };
450
588
 
589
+ //#endregion
590
+ //#region src/segmentation/breakpoint-processor.ts
591
+ /**
592
+ * Breakpoint post-processing engine extracted from segmenter.ts.
593
+ *
594
+ * This module is intentionally split into small helpers to reduce cognitive complexity
595
+ * and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
596
+ */
597
+ const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
598
+ const buildNormalizedPagesMap = (pages, normalizedContent) => {
599
+ const normalizedPages = /* @__PURE__ */ new Map();
600
+ for (let i = 0; i < pages.length; i++) {
601
+ const content = normalizedContent[i];
602
+ normalizedPages.set(pages[i].id, {
603
+ content,
604
+ index: i,
605
+ length: content.length
606
+ });
607
+ }
608
+ return normalizedPages;
609
+ };
610
+ const buildCumulativeOffsets = (pageIds, normalizedPages) => {
611
+ const cumulativeOffsets = [0];
612
+ let totalOffset = 0;
613
+ for (let i = 0; i < pageIds.length; i++) {
614
+ const pageData = normalizedPages.get(pageIds[i]);
615
+ totalOffset += pageData ? pageData.length : 0;
616
+ if (i < pageIds.length - 1) totalOffset += 1;
617
+ cumulativeOffsets.push(totalOffset);
618
+ }
619
+ return cumulativeOffsets;
620
+ };
621
+ const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
622
+ const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
623
+ const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
624
+ let windowEndIdx = currentFromIdx;
625
+ for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
626
+ else break;
627
+ return windowEndIdx;
628
+ };
629
+ const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
630
+ const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
631
+ const computePiecePages = (pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages) => {
632
+ const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
633
+ return {
634
+ actualEndIdx: pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx,
635
+ actualStartIdx
636
+ };
637
+ };
638
+ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
639
+ let nextFromIdx = actualEndIdx;
640
+ if (remainingContent && actualEndIdx + 1 <= toIdx) {
641
+ const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
642
+ if (nextPageData) {
643
+ const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
644
+ const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
645
+ if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
646
+ }
647
+ }
648
+ return nextFromIdx;
649
+ };
650
+ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
651
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
652
+ const result = [];
653
+ let remainingContent = segment.content;
654
+ let currentFromIdx = fromIdx;
655
+ let isFirstPiece = true;
656
+ let iterationCount = 0;
657
+ const maxIterations = 1e4;
658
+ while (currentFromIdx <= toIdx) {
659
+ iterationCount++;
660
+ if (iterationCount > maxIterations) {
661
+ logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
662
+ break;
663
+ }
664
+ const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
665
+ if (computeRemainingSpan(currentFromIdx, toIdx, pageIds) <= maxPages && !remainingHasExclusions) {
666
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
667
+ if (finalSeg) result.push(finalSeg);
668
+ break;
669
+ }
670
+ const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
671
+ const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
672
+ const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
673
+ let breakPosition = -1;
674
+ if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
675
+ if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
676
+ expandedBreakpoints,
677
+ normalizedPages,
678
+ pageIds,
679
+ prefer
680
+ });
681
+ if (breakPosition <= 0) breakPosition = windowEndPosition;
682
+ const pieceContent = remainingContent.slice(0, breakPosition).trim();
683
+ const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
684
+ if (pieceContent) {
685
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
686
+ if (pieceSeg) result.push(pieceSeg);
687
+ }
688
+ remainingContent = remainingContent.slice(breakPosition).trim();
689
+ if (!remainingContent) break;
690
+ currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
691
+ isFirstPiece = false;
692
+ }
693
+ return result;
694
+ };
695
+ /**
696
+ * Applies breakpoints to oversized segments.
697
+ *
698
+ * Note: This is an internal engine used by `segmentPages()`.
699
+ */
700
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
701
+ const pageIds = pages.map((p) => p.id);
702
+ const pageIdToIndex = buildPageIdToIndexMap(pageIds);
703
+ const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
704
+ const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
705
+ const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
706
+ const result = [];
707
+ logger?.info?.("Starting breakpoint processing", {
708
+ maxPages,
709
+ segmentCount: segments.length
710
+ });
711
+ for (const segment of segments) {
712
+ const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
713
+ const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
714
+ const segmentSpan = (segment.to ?? segment.from) - segment.from;
715
+ const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
716
+ if (segmentSpan <= maxPages && !hasExclusions) {
717
+ result.push(segment);
718
+ continue;
719
+ }
720
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
721
+ result.push(...broken.map((s) => {
722
+ const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
723
+ const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
724
+ if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
725
+ ...s,
726
+ content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
727
+ };
728
+ return s;
729
+ }));
730
+ }
731
+ logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
732
+ return result;
733
+ };
734
+
451
735
  //#endregion
452
736
  //#region src/segmentation/match-utils.ts
453
737
  /**
@@ -613,28 +897,6 @@ const anyRuleAllowsId = (rules, pageId) => {
613
897
  });
614
898
  };
615
899
 
616
- //#endregion
617
- //#region src/segmentation/textUtils.ts
618
- /**
619
- * Strip all HTML tags from content, keeping only text.
620
- *
621
- * @param html - HTML content
622
- * @returns Plain text content
623
- */
624
- const stripHtmlTags = (html) => {
625
- return html.replace(/<[^>]*>/g, "");
626
- };
627
- /**
628
- * Normalizes line endings to Unix-style (`\n`).
629
- *
630
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
631
- * for consistent pattern matching across platforms.
632
- *
633
- * @param content - Raw content with potentially mixed line endings
634
- * @returns Content with all line endings normalized to `\n`
635
- */
636
- const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
637
-
638
900
  //#endregion
639
901
  //#region src/segmentation/tokens.ts
640
902
  /**
@@ -721,6 +983,7 @@ const BASE_TOKENS = {
721
983
  dash: "[-–—ـ]",
722
984
  fasl: ["مسألة", "فصل"].join("|"),
723
985
  harf: "[أ-ي]",
986
+ harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
724
987
  kitab: "كتاب",
725
988
  naql: [
726
989
  "حدثني",
@@ -1004,15 +1267,12 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
1004
1267
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
1005
1268
 
1006
1269
  //#endregion
1007
- //#region src/segmentation/segmenter.ts
1270
+ //#region src/segmentation/rule-regex.ts
1008
1271
  /**
1009
- * Core segmentation engine for splitting Arabic text pages into logical segments.
1272
+ * Split rule compiled regex builder.
1010
1273
  *
1011
- * The segmenter takes an array of pages and applies pattern-based rules to
1012
- * identify split points, producing segments with content, page references,
1013
- * and optional metadata.
1014
- *
1015
- * @module segmenter
1274
+ * Extracted from `segmenter.ts` to reduce cognitive complexity and enable
1275
+ * independent unit testing of regex compilation and token expansion behavior.
1016
1276
  */
1017
1277
  /**
1018
1278
  * Checks if a regex pattern contains standard (anonymous) capturing groups.
@@ -1023,35 +1283,41 @@ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
1023
1283
  * - Lookbehind assertions `(?<=...)` and `(?<!...)`
1024
1284
  * - Named groups `(?<name>...)` (start with `(?` so excluded here)
1025
1285
  *
1026
- * **Note**: Named capture groups `(?<name>...)` ARE capturing groups but are
1027
- * excluded by this check because they are tracked separately via the
1028
- * `captureNames` array from token expansion. This function only detects
1029
- * anonymous capturing groups like `(.*)`.
1030
- *
1031
- * @param pattern - Regex pattern string to analyze
1032
- * @returns `true` if the pattern contains at least one anonymous capturing group
1286
+ * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
1033
1287
  */
1034
1288
  const hasCapturingGroup = (pattern) => {
1035
1289
  return /\((?!\?)/.test(pattern);
1036
1290
  };
1037
1291
  /**
1038
- * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
1292
+ * Extracts named capture group names from a regex pattern.
1039
1293
  *
1040
- * Fuzzy matching makes Arabic text diacritic-insensitive. When enabled, the
1041
- * transform is applied to token patterns BEFORE wrapping with capture groups,
1042
- * ensuring regex metacharacters (`(`, `)`, `|`, etc.) are not corrupted.
1043
- *
1044
- * @param pattern - Pattern string potentially containing `{{token}}` placeholders
1045
- * @param fuzzy - Whether to apply diacritic-insensitive transformation
1046
- * @returns Processed pattern with expanded tokens and capture names
1294
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
1047
1295
  *
1048
1296
  * @example
1049
- * processPattern('{{raqms:num}} {{dash}}', false)
1050
- * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ]', captureNames: ['num'] }
1297
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
1298
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
1299
+ * extractNamedCaptureNames('^\\d+') // []
1300
+ */
1301
+ const extractNamedCaptureNames = (pattern) => {
1302
+ const names = [];
1303
+ for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
1304
+ return names;
1305
+ };
1306
+ /**
1307
+ * Safely compiles a regex pattern, throwing a helpful error if invalid.
1308
+ */
1309
+ const compileRuleRegex = (pattern) => {
1310
+ try {
1311
+ return new RegExp(pattern, "gmu");
1312
+ } catch (error) {
1313
+ const message = error instanceof Error ? error.message : String(error);
1314
+ throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
1315
+ }
1316
+ };
1317
+ /**
1318
+ * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
1051
1319
  *
1052
- * @example
1053
- * processPattern('{{naql}}', true)
1054
- * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
1320
+ * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
1055
1321
  */
1056
1322
  const processPattern = (pattern, fuzzy) => {
1057
1323
  const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
@@ -1060,77 +1326,116 @@ const processPattern = (pattern, fuzzy) => {
1060
1326
  pattern: expanded
1061
1327
  };
1062
1328
  };
1329
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
1330
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
1331
+ const union = processed.map((p) => p.pattern).join("|");
1332
+ return {
1333
+ captureNames: processed.flatMap((p) => p.captureNames),
1334
+ regex: `^(?:${union})(.*)`
1335
+ };
1336
+ };
1337
+ const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
1338
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
1339
+ const union = processed.map((p) => p.pattern).join("|");
1340
+ return {
1341
+ captureNames: processed.flatMap((p) => p.captureNames),
1342
+ regex: `^(?:${union})`
1343
+ };
1344
+ };
1345
+ const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
1346
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
1347
+ const union = processed.map((p) => p.pattern).join("|");
1348
+ return {
1349
+ captureNames: processed.flatMap((p) => p.captureNames),
1350
+ regex: `(?:${union})$`
1351
+ };
1352
+ };
1353
+ const buildTemplateRegexSource = (template) => {
1354
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
1355
+ return {
1356
+ captureNames,
1357
+ regex: pattern
1358
+ };
1359
+ };
1360
+ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(regexSource);
1063
1361
  /**
1064
1362
  * Builds a compiled regex and metadata from a split rule.
1065
1363
  *
1066
- * Handles all pattern types:
1067
- * - `regex`: Used as-is (no token expansion)
1068
- * - `template`: Tokens expanded via `expandTokensWithCaptures`
1069
- * - `lineStartsWith`: Converted to `^(?:patterns...)`
1070
- * - `lineStartsAfter`: Converted to `^(?:patterns...)(.*)`
1071
- * - `lineEndsWith`: Converted to `(?:patterns...)$`
1072
- *
1073
- * @param rule - Split rule containing pattern and options
1074
- * @returns Compiled regex with capture metadata
1364
+ * Behavior mirrors the previous implementation in `segmenter.ts`.
1075
1365
  */
1076
1366
  const buildRuleRegex = (rule) => {
1077
1367
  const s = { ...rule };
1078
1368
  const fuzzy = rule.fuzzy ?? false;
1079
1369
  let allCaptureNames = [];
1080
- /**
1081
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
1082
- *
1083
- * @remarks
1084
- * This catches syntax errors only. It does NOT protect against ReDoS
1085
- * (catastrophic backtracking) from pathological patterns. Avoid compiling
1086
- * patterns from untrusted sources.
1087
- */
1088
- const compileRegex = (pattern) => {
1089
- try {
1090
- return new RegExp(pattern, "gmu");
1091
- } catch (error) {
1092
- const message = error instanceof Error ? error.message : String(error);
1093
- throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
1094
- }
1095
- };
1096
1370
  if (s.lineStartsAfter?.length) {
1097
- const processed = s.lineStartsAfter.map((p) => processPattern(p, fuzzy));
1098
- const patterns = processed.map((p) => p.pattern).join("|");
1099
- allCaptureNames = processed.flatMap((p) => p.captureNames);
1100
- s.regex = `^(?:${patterns})(.*)`;
1371
+ const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
1372
+ allCaptureNames = captureNames;
1101
1373
  return {
1102
1374
  captureNames: allCaptureNames,
1103
- regex: compileRegex(s.regex),
1375
+ regex: compileRuleRegex(regex),
1104
1376
  usesCapture: true,
1105
1377
  usesLineStartsAfter: true
1106
1378
  };
1107
1379
  }
1108
1380
  if (s.lineStartsWith?.length) {
1109
- const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
1110
- const patterns = processed.map((p) => p.pattern).join("|");
1111
- allCaptureNames = processed.flatMap((p) => p.captureNames);
1112
- s.regex = `^(?:${patterns})`;
1381
+ const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
1382
+ s.regex = regex;
1383
+ allCaptureNames = captureNames;
1113
1384
  }
1114
1385
  if (s.lineEndsWith?.length) {
1115
- const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
1116
- const patterns = processed.map((p) => p.pattern).join("|");
1117
- allCaptureNames = processed.flatMap((p) => p.captureNames);
1118
- s.regex = `(?:${patterns})$`;
1386
+ const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
1387
+ s.regex = regex;
1388
+ allCaptureNames = captureNames;
1119
1389
  }
1120
1390
  if (s.template) {
1121
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(s.template));
1122
- s.regex = pattern;
1391
+ const { regex, captureNames } = buildTemplateRegexSource(s.template);
1392
+ s.regex = regex;
1123
1393
  allCaptureNames = [...allCaptureNames, ...captureNames];
1124
1394
  }
1125
1395
  if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
1126
- const usesCapture = hasCapturingGroup(s.regex) || allCaptureNames.length > 0;
1396
+ if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(s.regex);
1397
+ const usesCapture = determineUsesCapture(s.regex, allCaptureNames);
1127
1398
  return {
1128
1399
  captureNames: allCaptureNames,
1129
- regex: compileRegex(s.regex),
1400
+ regex: compileRuleRegex(s.regex),
1130
1401
  usesCapture,
1131
1402
  usesLineStartsAfter: false
1132
1403
  };
1133
1404
  };
1405
+
1406
+ //#endregion
1407
+ //#region src/segmentation/textUtils.ts
1408
+ /**
1409
+ * Strip all HTML tags from content, keeping only text.
1410
+ *
1411
+ * @param html - HTML content
1412
+ * @returns Plain text content
1413
+ */
1414
+ const stripHtmlTags = (html) => {
1415
+ return html.replace(/<[^>]*>/g, "");
1416
+ };
1417
+ /**
1418
+ * Normalizes line endings to Unix-style (`\n`).
1419
+ *
1420
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
1421
+ * for consistent pattern matching across platforms.
1422
+ *
1423
+ * @param content - Raw content with potentially mixed line endings
1424
+ * @returns Content with all line endings normalized to `\n`
1425
+ */
1426
+ const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
1427
+
1428
+ //#endregion
1429
+ //#region src/segmentation/segmenter.ts
1430
+ /**
1431
+ * Core segmentation engine for splitting Arabic text pages into logical segments.
1432
+ *
1433
+ * The segmenter takes an array of pages and applies pattern-based rules to
1434
+ * identify split points, producing segments with content, page references,
1435
+ * and optional metadata.
1436
+ *
1437
+ * @module segmenter
1438
+ */
1134
1439
  /**
1135
1440
  * Builds a concatenated content string and page mapping from input pages.
1136
1441
  *
@@ -1200,6 +1505,62 @@ const buildPageMap = (pages) => {
1200
1505
  };
1201
1506
  };
1202
1507
  /**
1508
+ * Deduplicate split points by index, preferring ones with more information.
1509
+ *
1510
+ * Preference rules (when same index):
1511
+ * - Prefer a split with `contentStartOffset` (needed for `lineStartsAfter` marker stripping)
1512
+ * - Otherwise prefer a split with `meta` over one without
1513
+ */
1514
+ const dedupeSplitPoints = (splitPoints) => {
1515
+ const byIndex = /* @__PURE__ */ new Map();
1516
+ for (const p of splitPoints) {
1517
+ const existing = byIndex.get(p.index);
1518
+ if (!existing) {
1519
+ byIndex.set(p.index, p);
1520
+ continue;
1521
+ }
1522
+ if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
1523
+ }
1524
+ const unique = [...byIndex.values()];
1525
+ unique.sort((a, b) => a.index - b.index);
1526
+ return unique;
1527
+ };
1528
+ /**
1529
+ * If no structural rules produced segments, create a single segment spanning all pages.
1530
+ * This allows breakpoint processing to still run.
1531
+ */
1532
+ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
1533
+ if (segments.length > 0 || pages.length === 0) return segments;
1534
+ const firstPage = pages[0];
1535
+ const lastPage = pages[pages.length - 1];
1536
+ const joinChar = pageJoiner === "newline" ? "\n" : " ";
1537
+ const allContent = normalizedContent.join(joinChar).trim();
1538
+ if (!allContent) return segments;
1539
+ const initialSeg = {
1540
+ content: allContent,
1541
+ from: firstPage.id
1542
+ };
1543
+ if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
1544
+ return [initialSeg];
1545
+ };
1546
/**
 * Runs every rule against the concatenated content and gathers the resulting
 * split points into one flat list (rule order, then match order).
 *
 * For each rule the pipeline is: compile via `buildRuleRegex`, find matches,
 * filter them by the rule's constraints, then by `rule.occurrence`. Each
 * surviving match becomes a split point located at the match start when
 * `rule.split` is `"at"` (the default) and at the match end otherwise.
 *
 * When the compiled rule reports `usesLineStartsAfter` and the match has a
 * capture, the capture is not forwarded as `capturedContent`; instead
 * `contentStartOffset` is set to the match length minus the captured length.
 *
 * @param rules - Split rules to evaluate
 * @param matchContent - Concatenated content the regexes are run against
 * @param pageMap - Page map; its `getId` is passed to the constraint filter
 * @returns Split points for all rules, not yet deduplicated or sorted
 */
const collectSplitPointsFromRules = (rules, matchContent, pageMap) => rules.flatMap((rule) => {
	const compiled = buildRuleRegex(rule);
	const { regex, usesCapture, captureNames, usesLineStartsAfter } = compiled;
	const rawMatches = findMatches(matchContent, regex, usesCapture, captureNames);
	const constrained = filterByConstraints(rawMatches, rule, pageMap.getId);
	const selected = filterByOccurrence(constrained, rule.occurrence);
	return selected.map((match) => {
		const stripsMarker = usesLineStartsAfter && match.captured !== void 0;
		let capturedContent = match.captured;
		let contentStartOffset = void 0;
		if (stripsMarker) {
			// Everything in the match that precedes the captured text is the marker.
			capturedContent = void 0;
			contentStartOffset = match.end - match.captured.length - match.start;
		}
		const splitAtStart = (rule.split ?? "at") === "at";
		return {
			capturedContent,
			contentStartOffset,
			index: splitAtStart ? match.start : match.end,
			meta: rule.meta,
			namedCaptures: match.namedCaptures
		};
	});
});
1563
+ /**
1203
1564
  * Executes a regex against content and extracts match results with capture information.
1204
1565
  *
1205
1566
  * @param content - Full content string to search
@@ -1282,202 +1643,6 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
1282
1643
  * @param prefer - 'longer' for last match, 'shorter' for first match
1283
1644
  * @returns Processed segments with oversized ones broken up
1284
1645
  */
1285
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger) => {
1286
- const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
1287
- const startingPageId = pageIds$1[currentFromIdx];
1288
- if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
1289
- for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
1290
- const pageId = pageIds$1[pageIdx];
1291
- if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets$1[pageIdx] - cumulativeOffsets$1[currentFromIdx];
1292
- }
1293
- return -1;
1294
- };
1295
- const pageIds = pages.map((p) => p.id);
1296
- const pageIdToIndex = new Map(pageIds.map((id, i) => [id, i]));
1297
- const normalizedPages = /* @__PURE__ */ new Map();
1298
- for (let i = 0; i < pages.length; i++) {
1299
- const content = normalizedContent[i];
1300
- normalizedPages.set(pages[i].id, {
1301
- content,
1302
- index: i,
1303
- length: content.length
1304
- });
1305
- }
1306
- const cumulativeOffsets = [0];
1307
- let totalOffset = 0;
1308
- for (let i = 0; i < pageIds.length; i++) {
1309
- const pageData = normalizedPages.get(pageIds[i]);
1310
- totalOffset += pageData ? pageData.length : 0;
1311
- if (i < pageIds.length - 1) totalOffset += 1;
1312
- cumulativeOffsets.push(totalOffset);
1313
- }
1314
- const patternProcessor = (p) => processPattern(p, false).pattern;
1315
- const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
1316
- const result = [];
1317
- logger?.info?.("Starting breakpoint processing", {
1318
- maxPages,
1319
- segmentCount: segments.length
1320
- });
1321
- for (const segment of segments) {
1322
- const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
1323
- const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
1324
- logger?.debug?.("Processing segment", {
1325
- contentLength: segment.content.length,
1326
- contentPreview: segment.content.slice(0, 100),
1327
- from: segment.from,
1328
- fromIdx,
1329
- to: segment.to,
1330
- toIdx
1331
- });
1332
- const segmentSpan = (segment.to ?? segment.from) - segment.from;
1333
- const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
1334
- if (segmentSpan <= maxPages && !hasExclusions) {
1335
- logger?.trace?.("Segment within limit, keeping as-is");
1336
- result.push(segment);
1337
- continue;
1338
- }
1339
- logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
1340
- let remainingContent = segment.content;
1341
- let currentFromIdx = fromIdx;
1342
- let isFirstPiece = true;
1343
- let iterationCount = 0;
1344
- const maxIterations = 1e4;
1345
- while (currentFromIdx <= toIdx) {
1346
- iterationCount++;
1347
- if (iterationCount > maxIterations) {
1348
- logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
1349
- logger?.error?.("Loop state", {
1350
- currentFromIdx,
1351
- remainingContentLength: remainingContent.length,
1352
- toIdx
1353
- });
1354
- break;
1355
- }
1356
- const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
1357
- logger?.trace?.("Loop iteration", {
1358
- currentFromIdx,
1359
- currentPageId: pageIds[currentFromIdx],
1360
- iterationCount,
1361
- remainingContentLength: remainingContent.length,
1362
- remainingContentPreview: remainingContent.slice(0, 80),
1363
- remainingSpan,
1364
- toIdx,
1365
- toPageId: pageIds[toIdx]
1366
- });
1367
- const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
1368
- if (remainingSpan <= maxPages && !remainingHasExclusions) {
1369
- logger?.debug?.("Remaining span within limit, outputting final segment");
1370
- const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1371
- if (finalSeg) result.push(finalSeg);
1372
- break;
1373
- }
1374
- const currentPageId = pageIds[currentFromIdx];
1375
- const maxWindowPageId = currentPageId + maxPages;
1376
- let windowEndIdx = currentFromIdx;
1377
- for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
1378
- else break;
1379
- logger?.trace?.("Window calculation", {
1380
- currentPageId,
1381
- maxWindowPageId,
1382
- windowEndIdx,
1383
- windowEndPageId: pageIds[windowEndIdx]
1384
- });
1385
- const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
1386
- let breakPosition = -1;
1387
- if (windowHasExclusions) {
1388
- logger?.trace?.("Window has exclusions, finding exclusion break position");
1389
- breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
1390
- logger?.trace?.("Exclusion break position", { breakPosition });
1391
- }
1392
- if (breakPosition <= 0) {
1393
- const breakpointCtx = {
1394
- cumulativeOffsets,
1395
- expandedBreakpoints,
1396
- normalizedPages,
1397
- pageIds,
1398
- prefer
1399
- };
1400
- logger?.trace?.("Finding break position using patterns...");
1401
- breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
1402
- logger?.trace?.("Pattern break position", { breakPosition });
1403
- }
1404
- if (breakPosition <= 0) {
1405
- logger?.debug?.("No pattern matched, falling back to page boundary");
1406
- if (windowEndIdx === currentFromIdx) {
1407
- logger?.trace?.("Single page window, outputting page and advancing");
1408
- const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
1409
- const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
1410
- if (pageSeg) result.push(pageSeg);
1411
- remainingContent = remainingContent.slice(pageContent.length).trim();
1412
- currentFromIdx++;
1413
- isFirstPiece = false;
1414
- logger?.trace?.("After single page", {
1415
- currentFromIdx,
1416
- remainingContentLength: remainingContent.length
1417
- });
1418
- continue;
1419
- }
1420
- breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
1421
- logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
1422
- }
1423
- const pieceContent = remainingContent.slice(0, breakPosition).trim();
1424
- logger?.trace?.("Piece extracted", {
1425
- breakPosition,
1426
- pieceContentLength: pieceContent.length,
1427
- pieceContentPreview: pieceContent.slice(0, 80)
1428
- });
1429
- const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
1430
- const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
1431
- logger?.trace?.("Actual page indices", {
1432
- actualEndIdx,
1433
- actualStartIdx,
1434
- pieceHasContent: !!pieceContent
1435
- });
1436
- if (pieceContent) {
1437
- const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1438
- if (pieceSeg) {
1439
- result.push(pieceSeg);
1440
- logger?.debug?.("Created segment", {
1441
- contentLength: pieceSeg.content.length,
1442
- from: pieceSeg.from,
1443
- to: pieceSeg.to
1444
- });
1445
- }
1446
- }
1447
- const prevRemainingLength = remainingContent.length;
1448
- remainingContent = remainingContent.slice(breakPosition).trim();
1449
- logger?.trace?.("After slicing remainingContent", {
1450
- newLength: remainingContent.length,
1451
- prevLength: prevRemainingLength,
1452
- slicedAmount: breakPosition
1453
- });
1454
- if (!remainingContent) {
1455
- logger?.debug?.("No remaining content, breaking out of loop");
1456
- break;
1457
- }
1458
- let nextFromIdx = actualEndIdx;
1459
- if (remainingContent && actualEndIdx + 1 <= toIdx) {
1460
- const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
1461
- if (nextPageData) {
1462
- const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
1463
- if (nextPrefix && remainingContent.startsWith(nextPrefix)) {
1464
- nextFromIdx = actualEndIdx + 1;
1465
- logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
1466
- }
1467
- }
1468
- }
1469
- logger?.trace?.("End of iteration", {
1470
- nextFromIdx,
1471
- prevCurrentFromIdx: currentFromIdx,
1472
- willAdvance: nextFromIdx !== currentFromIdx
1473
- });
1474
- currentFromIdx = nextFromIdx;
1475
- isFirstPiece = false;
1476
- }
1477
- }
1478
- logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
1479
- return result;
1480
- };
1481
1646
  /**
1482
1647
  * Segments pages of content based on pattern-matching rules.
1483
1648
  *
@@ -1521,45 +1686,15 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
1521
1686
  * });
1522
1687
  */
1523
1688
  const segmentPages = (pages, options) => {
1524
- const { rules = [], maxPages, breakpoints, prefer = "longer", logger } = options;
1689
+ const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
1525
1690
  if (!pages.length) return [];
1526
1691
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
1527
- const splitPoints = [];
1528
- for (const rule of rules) {
1529
- const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
1530
- const finalMatches = filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence);
1531
- for (const m of finalMatches) {
1532
- const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
1533
- const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
1534
- splitPoints.push({
1535
- capturedContent: isLineStartsAfter ? void 0 : m.captured,
1536
- contentStartOffset: isLineStartsAfter ? markerLength : void 0,
1537
- index: (rule.split ?? "at") === "at" ? m.start : m.end,
1538
- meta: rule.meta,
1539
- namedCaptures: m.namedCaptures
1540
- });
1541
- }
1542
- }
1543
- const byIndex = /* @__PURE__ */ new Map();
1544
- for (const p of splitPoints) {
1545
- const existing = byIndex.get(p.index);
1546
- if (!existing) byIndex.set(p.index, p);
1547
- else if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
1548
- }
1549
- const unique = [...byIndex.values()];
1550
- unique.sort((a, b) => a.index - b.index);
1551
- let segments = buildSegments(unique, matchContent, pageMap, rules);
1552
- if (segments.length === 0 && pages.length > 0) {
1553
- const firstPage = pages[0];
1554
- const lastPage = pages[pages.length - 1];
1555
- const initialSeg = {
1556
- content: normalizedContent.join("\n").trim(),
1557
- from: firstPage.id
1558
- };
1559
- if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
1560
- if (initialSeg.content) segments = [initialSeg];
1692
+ let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
1693
+ segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
1694
+ if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) {
1695
+ const patternProcessor = (p) => processPattern(p, false).pattern;
1696
+ return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
1561
1697
  }
1562
- if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger);
1563
1698
  return segments;
1564
1699
  };
1565
1700
  /**