flappa-doormal 2.2.1 → 2.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -157,6 +157,26 @@ const makeDiacriticInsensitive = (text) => {
 
  //#endregion
  //#region src/segmentation/breakpoint-utils.ts
+ const WINDOW_PREFIX_LENGTHS = [
+ 80,
+ 60,
+ 40,
+ 30,
+ 20,
+ 15
+ ];
+ const JOINER_PREFIX_LENGTHS = [
+ 80,
+ 60,
+ 40,
+ 30,
+ 20,
+ 15,
+ 12,
+ 10,
+ 8,
+ 6
+ ];
  /**
  * Normalizes a breakpoint to the object form.
  * Strings are converted to { pattern: str } with no constraints.
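 
Both constants drive the same probe strategy: locate a page's start with a long, highly specific prefix first, then fall back to progressively shorter (and more collision-prone) ones. A minimal sketch of that ladder, assuming nothing beyond the constants above (`probeWithPrefixLadder` is a hypothetical name, not part of the package):

const probeWithPrefixLadder = (haystack, page, from, ladder) => {
	// Try prefixes longest-first; a longer prefix is less likely to collide.
	for (const len of ladder) {
		const prefix = page.trimStart().slice(0, len).trim();
		if (!prefix) continue;
		const pos = haystack.indexOf(prefix, from);
		if (pos !== -1) return pos;
	}
	return -1;
};

probeWithPrefixLadder("abc XYZ rest", "XYZ rest plus a tail", 0, [80, 8]);
// → 4: the 80-char probe ("XYZ rest plus a tail") misses, the 8-char probe ("XYZ rest") hits.
 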
@@ -312,6 +332,120 @@ const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp
  }
  });
  /**
+ * Applies a configured joiner at detected page boundaries within a multi-page content chunk.
+ *
+ * This is used for breakpoint-generated segments which don't have access to the original
+ * `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
+ * prefix after the previous boundary, then replace ONLY the single newline immediately before
+ * that page start.
+ *
+ * This avoids converting real in-page newlines, while still normalizing page joins consistently.
+ */
+ const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
+ if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
+ let updated = content;
+ let searchFrom = 0;
+ for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
+ const pageData = normalizedPages.get(pageIds[pi]);
+ if (!pageData) continue;
+ const trimmed = pageData.content.trimStart();
+ let found = -1;
+ for (const len of JOINER_PREFIX_LENGTHS) {
+ const prefix = trimmed.slice(0, Math.min(len, trimmed.length)).trim();
+ if (!prefix) continue;
+ const pos = updated.indexOf(prefix, searchFrom);
+ if (pos > 0) {
+ found = pos;
+ break;
+ }
+ }
+ if (found > 0) {
+ if (updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
+ searchFrom = found;
+ }
+ }
+ return updated;
+ };
+ /**
+ * Estimates how far into the current page `remainingContent` begins.
+ *
+ * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
+ * When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
+ * expected boundary positions. This helper computes an approximate starting offset by matching
+ * a short prefix of `remainingContent` inside the current page content.
+ */
+ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
+ const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
+ if (!currentPageData) return 0;
+ const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
+ const needle = remStart.slice(0, Math.min(30, remStart.length));
+ if (!needle) return 0;
+ const idx = currentPageData.content.indexOf(needle);
+ return idx > 0 ? idx : 0;
+ };
+ /**
+ * Attempts to find the start position of a target page within remainingContent,
+ * anchored near an expected boundary position to reduce collisions.
+ *
+ * This is used to define breakpoint windows in terms of actual content being split, rather than
+ * raw per-page offsets which can desync when structural rules strip markers.
+ */
+ const findPageStartNearExpectedBoundary = (remainingContent, currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+ const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
+ if (!targetPageData) return -1;
+ const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
+ const searchStart = Math.max(0, approx - 1e4);
+ const searchEnd = Math.min(remainingContent.length, approx + 2e3);
+ const targetTrimmed = targetPageData.content.trimStart();
+ for (const len of WINDOW_PREFIX_LENGTHS) {
+ const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
+ if (!prefix) continue;
+ let pos = remainingContent.indexOf(prefix, searchStart);
+ while (pos !== -1 && pos <= searchEnd) {
+ if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
+ pos = remainingContent.indexOf(prefix, pos + 1);
+ }
+ const last = remainingContent.lastIndexOf(prefix, approx);
+ if (last > 0) return last;
+ }
+ return -1;
+ };
+ /**
+ * Finds the end position of a breakpoint window inside `remainingContent`.
+ *
+ * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
+ * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
+ * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
+ */
+ const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+ if (windowEndIdx >= toIdx) return remainingContent.length;
+ const desiredNextIdx = windowEndIdx + 1;
+ const minNextIdx = currentFromIdx + 1;
+ const maxNextIdx = Math.min(desiredNextIdx, toIdx);
+ const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
+ for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
+ const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
+ const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
+ if (pos > 0) return pos;
+ }
+ return remainingContent.length;
+ };
+ /**
+ * Finds exclusion-based break position using raw cumulative offsets.
+ *
+ * This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
+ * Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
+ */
+ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
+ const startingPageId = pageIds[currentFromIdx];
+ if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
+ for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
+ const pageId = pageIds[pageIdx];
+ if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
+ }
+ return -1;
+ };
+ /**
  * Finds the actual ending page index by searching backwards for page content prefix.
  * Used to determine which page a segment actually ends on based on content matching.
  *
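 
To make the joiner behavior concrete, a usage sketch against the function above (page ids and contents invented): with a "space" joiner, only the newline that immediately precedes a detected page start is rewritten; newlines inside a page survive.

// Two pages whose contents were joined with "\n" during segmentation.
const normalizedPages = new Map([
	[1, { content: "First line\nstill page one", index: 0, length: 25 }],
	[2, { content: "Page two starts here", index: 1, length: 20 }]
]);
const chunk = "First line\nstill page one\nPage two starts here";
applyPageJoinerBetweenPages(chunk, 0, 1, [1, 2], normalizedPages, "space");
// → "First line\nstill page one Page two starts here"
// The in-page newline is kept; only the page-join newline becomes a space.
 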
@@ -424,8 +558,8 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
  * @param ctx - Breakpoint context with page data and patterns
  * @returns Break position in the content, or -1 if no break found
  */
- const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, ctx) => {
- const { pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, prefer } = ctx;
+ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
+ const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
  for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
@@ -436,18 +570,162 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
  const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
  if (nextPageData) {
  const pos = findNextPagePosition(remainingContent, nextPageData);
- if (pos > 0) return pos;
+ if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
  }
  }
- return Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
+ return Math.min(windowEndPosition, remainingContent.length);
  }
- const windowEndPosition = Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
- const breakPos = findPatternBreakPosition(remainingContent.slice(0, windowEndPosition), regex, prefer);
+ const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
  if (breakPos > 0) return breakPos;
  }
  return -1;
  };
 
+ //#endregion
+ //#region src/segmentation/breakpoint-processor.ts
+ /**
+ * Breakpoint post-processing engine extracted from segmenter.ts.
+ *
+ * This module is intentionally split into small helpers to reduce cognitive complexity
+ * and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
+ */
+ const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
+ const buildNormalizedPagesMap = (pages, normalizedContent) => {
+ const normalizedPages = /* @__PURE__ */ new Map();
+ for (let i = 0; i < pages.length; i++) {
+ const content = normalizedContent[i];
+ normalizedPages.set(pages[i].id, {
+ content,
+ index: i,
+ length: content.length
+ });
+ }
+ return normalizedPages;
+ };
+ const buildCumulativeOffsets = (pageIds, normalizedPages) => {
+ const cumulativeOffsets = [0];
+ let totalOffset = 0;
+ for (let i = 0; i < pageIds.length; i++) {
+ const pageData = normalizedPages.get(pageIds[i]);
+ totalOffset += pageData ? pageData.length : 0;
+ if (i < pageIds.length - 1) totalOffset += 1;
+ cumulativeOffsets.push(totalOffset);
+ }
+ return cumulativeOffsets;
+ };
+ const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
+ const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
+ const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
+ let windowEndIdx = currentFromIdx;
+ for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
+ else break;
+ return windowEndIdx;
+ };
+ const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
+ const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
+ const computePiecePages = (pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages) => {
+ const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
+ return {
+ actualEndIdx: pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx,
+ actualStartIdx
+ };
+ };
+ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
+ let nextFromIdx = actualEndIdx;
+ if (remainingContent && actualEndIdx + 1 <= toIdx) {
+ const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
+ if (nextPageData) {
+ const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
+ if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
+ }
+ }
+ return nextFromIdx;
+ };
+ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+ const result = [];
+ let remainingContent = segment.content;
+ let currentFromIdx = fromIdx;
+ let isFirstPiece = true;
+ let iterationCount = 0;
+ const maxIterations = 1e4;
+ while (currentFromIdx <= toIdx) {
+ iterationCount++;
+ if (iterationCount > maxIterations) {
+ logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
+ break;
+ }
+ const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
+ if (computeRemainingSpan(currentFromIdx, toIdx, pageIds) <= maxPages && !remainingHasExclusions) {
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
+ if (finalSeg) result.push(finalSeg);
+ break;
+ }
+ const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+ const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+ const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
+ let breakPosition = -1;
+ if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+ if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+ expandedBreakpoints,
+ normalizedPages,
+ pageIds,
+ prefer
+ });
+ if (breakPosition <= 0) breakPosition = windowEndPosition;
+ const pieceContent = remainingContent.slice(0, breakPosition).trim();
+ const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
+ if (pieceContent) {
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
+ if (pieceSeg) result.push(pieceSeg);
+ }
+ remainingContent = remainingContent.slice(breakPosition).trim();
+ if (!remainingContent) break;
+ currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+ isFirstPiece = false;
+ }
+ return result;
+ };
+ /**
+ * Applies breakpoints to oversized segments.
+ *
+ * Note: This is an internal engine used by `segmentPages()`.
+ */
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+ const pageIds = pages.map((p) => p.id);
+ const pageIdToIndex = buildPageIdToIndexMap(pageIds);
+ const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
+ const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
+ const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
+ const result = [];
+ logger?.info?.("Starting breakpoint processing", {
+ maxPages,
+ segmentCount: segments.length
+ });
+ for (const segment of segments) {
+ const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
+ const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
+ const segmentSpan = (segment.to ?? segment.from) - segment.from;
+ const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
+ if (segmentSpan <= maxPages && !hasExclusions) {
+ result.push(segment);
+ continue;
+ }
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+ result.push(...broken.map((s) => {
+ const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
+ const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
+ if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
+ ...s,
+ content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
+ };
+ return s;
+ }));
+ }
+ logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
+ return result;
+ };
+ 
  //#endregion
  //#region src/segmentation/match-utils.ts
  /**
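 
A quick worked example of the offsets the processor leans on (page ids and lengths invented): buildCumulativeOffsets records where each page starts in the joined string, counting one separator character between consecutive pages.

const normalizedPages = new Map([
	[10, { content: "aaaaa", index: 0, length: 5 }],
	[11, { content: "bbb", index: 1, length: 3 }],
	[12, { content: "ccccccc", index: 2, length: 7 }]
]);
buildCumulativeOffsets([10, 11, 12], normalizedPages);
// → [0, 6, 10, 17]
// Page i starts at offsets[i]; each join contributes 1, except after the last page.
 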
@@ -613,28 +891,6 @@ const anyRuleAllowsId = (rules, pageId) => {
  });
  };
 
- //#endregion
- //#region src/segmentation/textUtils.ts
- /**
- * Strip all HTML tags from content, keeping only text.
- *
- * @param html - HTML content
- * @returns Plain text content
- */
- const stripHtmlTags = (html) => {
- return html.replace(/<[^>]*>/g, "");
- };
- /**
- * Normalizes line endings to Unix-style (`\n`).
- *
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
- * for consistent pattern matching across platforms.
- *
- * @param content - Raw content with potentially mixed line endings
- * @returns Content with all line endings normalized to `\n`
- */
- const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
- 
  //#endregion
  //#region src/segmentation/tokens.ts
  /**
@@ -716,13 +972,21 @@ const escapeTemplateBrackets = (pattern) => {
  */
  const BASE_TOKENS = {
  bab: "باب",
- basmalah: "بسم الله|﷽",
+ basmalah: ["بسم الله", "﷽"].join("|"),
  bullet: "[•*°]",
  dash: "[-–—ـ]",
- fasl: "فصل|مسألة",
+ fasl: ["مسألة", "فصل"].join("|"),
  harf: "[أ-ي]",
  kitab: "كتاب",
- naql: "حدثنا|أخبرنا|حدثني|وحدثنا|أنبأنا|سمعت",
+ naql: [
+ "حدثني",
+ "وأخبرنا",
+ "حدثنا",
+ "سمعت",
+ "أنبأنا",
+ "وحدثنا",
+ "أخبرنا"
+ ].join("|"),
  raqm: "[\\u0660-\\u0669]",
  raqms: "[\\u0660-\\u0669]+",
  tarqim: "[.!?؟؛]"
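 
Since these token strings become regex alternations, branch order can matter in general: a JavaScript alternation takes the first branch that matches at a position, so a shorter alternative listed first can shadow a longer one that shares its prefix. None of the pairs above are prefixes of one another, so this particular reorder looks behavior-neutral, but the hazard is easy to demonstrate ("حدث" below is illustrative only, not one of the package's naql alternatives):

new RegExp("حدث|حدثنا").exec("حدثنا القاضي")[0]; // → "حدث" (first branch wins)
new RegExp("حدثنا|حدث").exec("حدثنا القاضي")[0]; // → "حدثنا"
 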
@@ -858,6 +1122,16 @@ const containsTokens = (query) => {
  */
  const expandTokensWithCaptures = (query, fuzzyTransform) => {
  const captureNames = [];
+ const captureNameCounts = /* @__PURE__ */ new Map();
+ /**
+ * Gets a unique capture name, appending _2, _3, etc. for duplicates.
+ * This prevents invalid regex with duplicate named groups.
+ */
+ const getUniqueCaptureName = (baseName) => {
+ const count = captureNameCounts.get(baseName) ?? 0;
+ captureNameCounts.set(baseName, count + 1);
+ return count === 0 ? baseName : `${baseName}_${count + 1}`;
+ };
  const segments = [];
  let lastIndex = 0;
  TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
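 
For context, this guards against a plain JavaScript constraint: a regex that repeats a named group in sequence throws at construction time, which is exactly what the _2/_3 suffixing avoids (engine wording varies):

new RegExp("(?<num>\\d+)-(?<num>\\d+)");
// SyntaxError: Invalid regular expression: duplicate capture group name

// With the dedup above, a template that uses the same capture name twice
// (e.g. "{{raqms:num}} {{raqms:num}}") compiles to "num" and "num_2" instead:
new RegExp("(?<num>\\d+)-(?<num_2>\\d+)").exec("12-34").groups;
// → { num: "12", num_2: "34" }
 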
@@ -887,15 +1161,17 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
  if (!tokenMatch) return segment.value;
  const [, tokenName, captureName] = tokenMatch;
  if (!tokenName && captureName) {
- captureNames.push(captureName);
- return `(?<${captureName}>.+)`;
+ const uniqueName = getUniqueCaptureName(captureName);
+ captureNames.push(uniqueName);
+ return `(?<${uniqueName}>.+)`;
  }
  let tokenPattern = TOKEN_PATTERNS[tokenName];
  if (!tokenPattern) return segment.value;
  if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
  if (captureName) {
- captureNames.push(captureName);
- return `(?<${captureName}>${tokenPattern})`;
+ const uniqueName = getUniqueCaptureName(captureName);
+ captureNames.push(uniqueName);
+ return `(?<${uniqueName}>${tokenPattern})`;
  }
  return tokenPattern;
  });
@@ -984,15 +1260,12 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 
  //#endregion
- //#region src/segmentation/segmenter.ts
+ //#region src/segmentation/rule-regex.ts
  /**
- * Core segmentation engine for splitting Arabic text pages into logical segments.
+ * Split rule compiled regex builder.
  *
- * The segmenter takes an array of pages and applies pattern-based rules to
- * identify split points, producing segments with content, page references,
- * and optional metadata.
- *
- * @module segmenter
+ * Extracted from `segmenter.ts` to reduce cognitive complexity and enable
+ * independent unit testing of regex compilation and token expansion behavior.
  */
  /**
  * Checks if a regex pattern contains standard (anonymous) capturing groups.
@@ -1003,35 +1276,26 @@ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
  * - Lookbehind assertions `(?<=...)` and `(?<!...)`
  * - Named groups `(?<name>...)` (start with `(?` so excluded here)
  *
- * **Note**: Named capture groups `(?<name>...)` ARE capturing groups but are
- * excluded by this check because they are tracked separately via the
- * `captureNames` array from token expansion. This function only detects
- * anonymous capturing groups like `(.*)`.
- *
- * @param pattern - Regex pattern string to analyze
- * @returns `true` if the pattern contains at least one anonymous capturing group
+ * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
  */
  const hasCapturingGroup = (pattern) => {
  return /\((?!\?)/.test(pattern);
  };
  /**
+ * Safely compiles a regex pattern, throwing a helpful error if invalid.
+ */
+ const compileRuleRegex = (pattern) => {
+ try {
+ return new RegExp(pattern, "gmu");
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
+ }
+ };
+ /**
  * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
  *
- * Fuzzy matching makes Arabic text diacritic-insensitive. When enabled, the
- * transform is applied to token patterns BEFORE wrapping with capture groups,
- * ensuring regex metacharacters (`(`, `)`, `|`, etc.) are not corrupted.
- *
- * @param pattern - Pattern string potentially containing `{{token}}` placeholders
- * @param fuzzy - Whether to apply diacritic-insensitive transformation
- * @returns Processed pattern with expanded tokens and capture names
- *
- * @example
- * processPattern('{{raqms:num}} {{dash}}', false)
- * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ]', captureNames: ['num'] }
- *
- * @example
- * processPattern('{{naql}}', true)
- * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
+ * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
  */
  const processPattern = (pattern, fuzzy) => {
  const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
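 
The wrapped error is easy to picture; a small sketch against the definition above (the Cause text varies by engine):

compileRuleRegex("(?<n>\\d+"); // missing closing ")"
// throws Error: Invalid regex pattern: (?<n>\d+
//  Cause: Invalid regular expression: /(?<n>\d+/gmu: Unterminated group
 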
@@ -1040,77 +1304,115 @@ const processPattern = (pattern, fuzzy) => {
  pattern: expanded
  };
  };
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const union = processed.map((p) => p.pattern).join("|");
+ return {
+ captureNames: processed.flatMap((p) => p.captureNames),
+ regex: `^(?:${union})(.*)`
+ };
+ };
+ const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const union = processed.map((p) => p.pattern).join("|");
+ return {
+ captureNames: processed.flatMap((p) => p.captureNames),
+ regex: `^(?:${union})`
+ };
+ };
+ const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const union = processed.map((p) => p.pattern).join("|");
+ return {
+ captureNames: processed.flatMap((p) => p.captureNames),
+ regex: `(?:${union})$`
+ };
+ };
+ const buildTemplateRegexSource = (template) => {
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+ return {
+ captureNames,
+ regex: pattern
+ };
+ };
+ const determineUsesCapture = (regexSource, captureNames) => hasCapturingGroup(regexSource) || captureNames.length > 0;
  /**
  * Builds a compiled regex and metadata from a split rule.
  *
- * Handles all pattern types:
- * - `regex`: Used as-is (no token expansion)
- * - `template`: Tokens expanded via `expandTokensWithCaptures`
- * - `lineStartsWith`: Converted to `^(?:patterns...)`
- * - `lineStartsAfter`: Converted to `^(?:patterns...)(.*)`
- * - `lineEndsWith`: Converted to `(?:patterns...)$`
- *
- * @param rule - Split rule containing pattern and options
- * @returns Compiled regex with capture metadata
+ * Behavior mirrors the previous implementation in `segmenter.ts`.
  */
  const buildRuleRegex = (rule) => {
  const s = { ...rule };
  const fuzzy = rule.fuzzy ?? false;
  let allCaptureNames = [];
- /**
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
- *
- * @remarks
- * This catches syntax errors only. It does NOT protect against ReDoS
- * (catastrophic backtracking) from pathological patterns. Avoid compiling
- * patterns from untrusted sources.
- */
- const compileRegex = (pattern) => {
- try {
- return new RegExp(pattern, "gmu");
- } catch (error) {
- const message = error instanceof Error ? error.message : String(error);
- throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
- }
- };
  if (s.lineStartsAfter?.length) {
- const processed = s.lineStartsAfter.map((p) => processPattern(p, fuzzy));
- const patterns = processed.map((p) => p.pattern).join("|");
- allCaptureNames = processed.flatMap((p) => p.captureNames);
- s.regex = `^(?:${patterns})(.*)`;
+ const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+ allCaptureNames = captureNames;
  return {
  captureNames: allCaptureNames,
- regex: compileRegex(s.regex),
+ regex: compileRuleRegex(regex),
  usesCapture: true,
  usesLineStartsAfter: true
  };
  }
  if (s.lineStartsWith?.length) {
- const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
- const patterns = processed.map((p) => p.pattern).join("|");
- allCaptureNames = processed.flatMap((p) => p.captureNames);
- s.regex = `^(?:${patterns})`;
+ const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+ s.regex = regex;
+ allCaptureNames = captureNames;
  }
  if (s.lineEndsWith?.length) {
- const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
- const patterns = processed.map((p) => p.pattern).join("|");
- allCaptureNames = processed.flatMap((p) => p.captureNames);
- s.regex = `(?:${patterns})$`;
+ const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+ s.regex = regex;
+ allCaptureNames = captureNames;
  }
  if (s.template) {
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(s.template));
- s.regex = pattern;
+ const { regex, captureNames } = buildTemplateRegexSource(s.template);
+ s.regex = regex;
  allCaptureNames = [...allCaptureNames, ...captureNames];
  }
  if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
- const usesCapture = hasCapturingGroup(s.regex) || allCaptureNames.length > 0;
+ const usesCapture = determineUsesCapture(s.regex, allCaptureNames);
  return {
  captureNames: allCaptureNames,
- regex: compileRegex(s.regex),
+ regex: compileRuleRegex(s.regex),
  usesCapture,
  usesLineStartsAfter: false
  };
  };
+ 
+ //#endregion
+ //#region src/segmentation/textUtils.ts
+ /**
+ * Strip all HTML tags from content, keeping only text.
+ *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+ const stripHtmlTags = (html) => {
+ return html.replace(/<[^>]*>/g, "");
+ };
+ /**
+ * Normalizes line endings to Unix-style (`\n`).
+ *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
+ *
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
+ */
+ const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+ 
+ //#endregion
+ //#region src/segmentation/segmenter.ts
+ /**
+ * Core segmentation engine for splitting Arabic text pages into logical segments.
+ *
+ * The segmenter takes an array of pages and applies pattern-based rules to
+ * identify split points, producing segments with content, page references,
+ * and optional metadata.
+ *
+ * @module segmenter
+ */
  /**
  * Builds a concatenated content string and page mapping from input pages.
  *
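 
To see what the extracted builders emit, a small sketch evaluated against the definitions above (token expansions shown in the same display style as the removed @example docs; the source stores the digit class as \u0660-\u0669 escapes, and exact escaping comes from processPattern):

buildLineStartsWithRegexSource(["{{kitab}} ", "{{bab}} "], false);
// → { captureNames: [], regex: "^(?:كتاب |باب )" }

buildLineStartsAfterRegexSource(["{{raqms:num}} {{dash}} "], false);
// → { captureNames: ["num"], regex: "^(?:(?<num>[٠-٩]+) [-–—ـ] )(.*)" }
// The trailing (.*) is what lets lineStartsAfter strip the matched marker.
 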
@@ -1180,6 +1482,62 @@ const buildPageMap = (pages) => {
  };
  };
  /**
+ * Deduplicate split points by index, preferring ones with more information.
+ *
+ * Preference rules (when same index):
+ * - Prefer a split with `contentStartOffset` (needed for `lineStartsAfter` marker stripping)
+ * - Otherwise prefer a split with `meta` over one without
+ */
+ const dedupeSplitPoints = (splitPoints) => {
+ const byIndex = /* @__PURE__ */ new Map();
+ for (const p of splitPoints) {
+ const existing = byIndex.get(p.index);
+ if (!existing) {
+ byIndex.set(p.index, p);
+ continue;
+ }
+ if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
+ }
+ const unique = [...byIndex.values()];
+ unique.sort((a, b) => a.index - b.index);
+ return unique;
+ };
+ /**
+ * If no structural rules produced segments, create a single segment spanning all pages.
+ * This allows breakpoint processing to still run.
+ */
+ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
+ if (segments.length > 0 || pages.length === 0) return segments;
+ const firstPage = pages[0];
+ const lastPage = pages[pages.length - 1];
+ const joinChar = pageJoiner === "newline" ? "\n" : " ";
+ const allContent = normalizedContent.join(joinChar).trim();
+ if (!allContent) return segments;
+ const initialSeg = {
+ content: allContent,
+ from: firstPage.id
+ };
+ if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
+ return [initialSeg];
+ };
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
+ const collectSplitPointsFromRule = (rule) => {
+ const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
+ return filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence).map((m) => {
+ const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
+ const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
+ return {
+ capturedContent: isLineStartsAfter ? void 0 : m.captured,
+ contentStartOffset: isLineStartsAfter ? markerLength : void 0,
+ index: (rule.split ?? "at") === "at" ? m.start : m.end,
+ meta: rule.meta,
+ namedCaptures: m.namedCaptures
+ };
+ });
+ };
+ return rules.flatMap(collectSplitPointsFromRule);
+ };
+ /**
  * Executes a regex against content and extracts match results with capture information.
  *
  * @param content - Full content string to search
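 
A short example of the dedup preference, with split points reduced to the fields the function reads (values invented):

dedupeSplitPoints([
	{ index: 42 },
	{ index: 42, contentStartOffset: 5 }, // carries marker-stripping info → replaces the bare one
	{ index: 7, meta: { title: "باب" } },
	{ index: 7 } // adds nothing → first entry at 7 is kept
]);
// → [{ index: 7, meta: { title: "باب" } }, { index: 42, contentStartOffset: 5 }]
 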
@@ -1262,202 +1620,6 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  * @param prefer - 'longer' for last match, 'shorter' for first match
  * @returns Processed segments with oversized ones broken up
  */
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger) => {
- const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
- const startingPageId = pageIds$1[currentFromIdx];
- if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
- for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
- const pageId = pageIds$1[pageIdx];
- if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets$1[pageIdx] - cumulativeOffsets$1[currentFromIdx];
- }
- return -1;
- };
- const pageIds = pages.map((p) => p.id);
- const pageIdToIndex = new Map(pageIds.map((id, i) => [id, i]));
- const normalizedPages = /* @__PURE__ */ new Map();
- for (let i = 0; i < pages.length; i++) {
- const content = normalizedContent[i];
- normalizedPages.set(pages[i].id, {
- content,
- index: i,
- length: content.length
- });
- }
- const cumulativeOffsets = [0];
- let totalOffset = 0;
- for (let i = 0; i < pageIds.length; i++) {
- const pageData = normalizedPages.get(pageIds[i]);
- totalOffset += pageData ? pageData.length : 0;
- if (i < pageIds.length - 1) totalOffset += 1;
- cumulativeOffsets.push(totalOffset);
- }
- const patternProcessor = (p) => processPattern(p, false).pattern;
- const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
- const result = [];
- logger?.info?.("Starting breakpoint processing", {
- maxPages,
- segmentCount: segments.length
- });
- for (const segment of segments) {
- const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
- const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
- logger?.debug?.("Processing segment", {
- contentLength: segment.content.length,
- contentPreview: segment.content.slice(0, 100),
- from: segment.from,
- fromIdx,
- to: segment.to,
- toIdx
- });
- const segmentSpan = (segment.to ?? segment.from) - segment.from;
- const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
- if (segmentSpan <= maxPages && !hasExclusions) {
- logger?.trace?.("Segment within limit, keeping as-is");
- result.push(segment);
- continue;
- }
- logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
- let remainingContent = segment.content;
- let currentFromIdx = fromIdx;
- let isFirstPiece = true;
- let iterationCount = 0;
- const maxIterations = 1e4;
- while (currentFromIdx <= toIdx) {
- iterationCount++;
- if (iterationCount > maxIterations) {
- logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
- logger?.error?.("Loop state", {
- currentFromIdx,
- remainingContentLength: remainingContent.length,
- toIdx
- });
- break;
- }
- const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
- logger?.trace?.("Loop iteration", {
- currentFromIdx,
- currentPageId: pageIds[currentFromIdx],
- iterationCount,
- remainingContentLength: remainingContent.length,
- remainingContentPreview: remainingContent.slice(0, 80),
- remainingSpan,
- toIdx,
- toPageId: pageIds[toIdx]
- });
- const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
- if (remainingSpan <= maxPages && !remainingHasExclusions) {
- logger?.debug?.("Remaining span within limit, outputting final segment");
- const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
- if (finalSeg) result.push(finalSeg);
- break;
- }
- const currentPageId = pageIds[currentFromIdx];
- const maxWindowPageId = currentPageId + maxPages;
- let windowEndIdx = currentFromIdx;
- for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
- else break;
- logger?.trace?.("Window calculation", {
- currentPageId,
- maxWindowPageId,
- windowEndIdx,
- windowEndPageId: pageIds[windowEndIdx]
- });
- const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
- let breakPosition = -1;
- if (windowHasExclusions) {
- logger?.trace?.("Window has exclusions, finding exclusion break position");
- breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
- logger?.trace?.("Exclusion break position", { breakPosition });
- }
- if (breakPosition <= 0) {
- const breakpointCtx = {
- cumulativeOffsets,
- expandedBreakpoints,
- normalizedPages,
- pageIds,
- prefer
- };
- logger?.trace?.("Finding break position using patterns...");
- breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
- logger?.trace?.("Pattern break position", { breakPosition });
- }
- if (breakPosition <= 0) {
- logger?.debug?.("No pattern matched, falling back to page boundary");
- if (windowEndIdx === currentFromIdx) {
- logger?.trace?.("Single page window, outputting page and advancing");
- const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
- const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
- if (pageSeg) result.push(pageSeg);
- remainingContent = remainingContent.slice(pageContent.length).trim();
- currentFromIdx++;
- isFirstPiece = false;
- logger?.trace?.("After single page", {
- currentFromIdx,
- remainingContentLength: remainingContent.length
- });
- continue;
- }
- breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
- logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
- }
- const pieceContent = remainingContent.slice(0, breakPosition).trim();
- logger?.trace?.("Piece extracted", {
- breakPosition,
- pieceContentLength: pieceContent.length,
- pieceContentPreview: pieceContent.slice(0, 80)
- });
- const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
- const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
- logger?.trace?.("Actual page indices", {
- actualEndIdx,
- actualStartIdx,
- pieceHasContent: !!pieceContent
- });
- if (pieceContent) {
- const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
- if (pieceSeg) {
- result.push(pieceSeg);
- logger?.debug?.("Created segment", {
- contentLength: pieceSeg.content.length,
- from: pieceSeg.from,
- to: pieceSeg.to
- });
- }
- }
- const prevRemainingLength = remainingContent.length;
- remainingContent = remainingContent.slice(breakPosition).trim();
- logger?.trace?.("After slicing remainingContent", {
- newLength: remainingContent.length,
- prevLength: prevRemainingLength,
- slicedAmount: breakPosition
- });
- if (!remainingContent) {
- logger?.debug?.("No remaining content, breaking out of loop");
- break;
- }
- let nextFromIdx = actualEndIdx;
- if (remainingContent && actualEndIdx + 1 <= toIdx) {
- const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
- if (nextPageData) {
- const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
- if (nextPrefix && remainingContent.startsWith(nextPrefix)) {
- nextFromIdx = actualEndIdx + 1;
- logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
- }
- }
- }
- logger?.trace?.("End of iteration", {
- nextFromIdx,
- prevCurrentFromIdx: currentFromIdx,
- willAdvance: nextFromIdx !== currentFromIdx
- });
- currentFromIdx = nextFromIdx;
- isFirstPiece = false;
- }
- }
- logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
- return result;
- };
  /**
  * Segments pages of content based on pattern-matching rules.
  *
@@ -1501,45 +1663,15 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  * });
  */
  const segmentPages = (pages, options) => {
- const { rules = [], maxPages, breakpoints, prefer = "longer", logger } = options;
+ const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
  if (!pages.length) return [];
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
- const splitPoints = [];
- for (const rule of rules) {
- const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- const finalMatches = filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence);
- for (const m of finalMatches) {
- const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
- const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
- splitPoints.push({
- capturedContent: isLineStartsAfter ? void 0 : m.captured,
- contentStartOffset: isLineStartsAfter ? markerLength : void 0,
- index: rule.split === "at" ? m.start : m.end,
- meta: rule.meta,
- namedCaptures: m.namedCaptures
- });
- }
- }
- const byIndex = /* @__PURE__ */ new Map();
- for (const p of splitPoints) {
- const existing = byIndex.get(p.index);
- if (!existing) byIndex.set(p.index, p);
- else if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
- }
- const unique = [...byIndex.values()];
- unique.sort((a, b) => a.index - b.index);
- let segments = buildSegments(unique, matchContent, pageMap, rules);
- if (segments.length === 0 && pages.length > 0) {
- const firstPage = pages[0];
- const lastPage = pages[pages.length - 1];
- const initialSeg = {
- content: normalizedContent.join("\n").trim(),
- from: firstPage.id
- };
- if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
- if (initialSeg.content) segments = [initialSeg];
+ let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
+ segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
+ if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) {
+ const patternProcessor = (p) => processPattern(p, false).pattern;
+ return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
  }
- if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger);
  return segments;
  };
  /**