flappa-doormal 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1278,7 +1278,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
  * This is used to define breakpoint windows in terms of actual content being split, rather than
  * raw per-page offsets which can desync when structural rules strip markers.
  */
- const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
  const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
  if (!targetPageData) return -1;
  const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -1288,13 +1288,45 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  for (const len of WINDOW_PREFIX_LENGTHS) {
  const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
  if (!prefix) continue;
+ const candidates = [];
  let pos = remainingContent.indexOf(prefix, searchStart);
  while (pos !== -1 && pos <= searchEnd) {
- if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
+ if (pos > 0) {
+ const charBefore = remainingContent[pos - 1];
+ if (charBefore === "\n") candidates.push({
+ isNewline: true,
+ pos
+ });
+ else if (/\s/.test(charBefore)) candidates.push({
+ isNewline: false,
+ pos
+ });
+ }
  pos = remainingContent.indexOf(prefix, pos + 1);
  }
- const last = remainingContent.lastIndexOf(prefix, approx);
- if (last > 0) return last;
+ if (candidates.length > 0) {
+ const newlineCandidates = candidates.filter((c) => c.isNewline);
+ const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
+ let bestCandidate = pool[0];
+ let bestDistance = Math.abs(pool[0].pos - expectedBoundary);
+ for (let i = 1; i < pool.length; i++) {
+ const dist = Math.abs(pool[i].pos - expectedBoundary);
+ if (dist < bestDistance) {
+ bestDistance = dist;
+ bestCandidate = pool[i];
+ }
+ }
+ const MAX_DEVIATION = 2e3;
+ if (bestDistance <= MAX_DEVIATION) return bestCandidate.pos;
+ logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
+ bestDistance,
+ expectedBoundary,
+ matchPos: bestCandidate.pos,
+ maxDeviation: MAX_DEVIATION,
+ prefixLength: len,
+ targetPageIdx
+ });
+ }
  }
  return -1;
  };
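
Note: the rewrite above changes the prefix search from "return the first whitespace-preceded hit" to "collect every whitespace-preceded hit, prefer hits that start on a new line, and keep the one closest to the expected boundary, rejecting anything more than 2,000 characters off." A minimal standalone sketch of that selection step; the helper name and sample candidates are illustrative, not part of the package:

// Illustrative only: mirrors the candidate-selection logic in the diff above.
const pickNearestCandidate = (candidates, expectedBoundary, maxDeviation = 2e3) => {
  const newlineCandidates = candidates.filter((c) => c.isNewline);
  const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
  if (pool.length === 0) return -1;
  let best = pool[0];
  for (const c of pool.slice(1)) if (Math.abs(c.pos - expectedBoundary) < Math.abs(best.pos - expectedBoundary)) best = c;
  return Math.abs(best.pos - expectedBoundary) <= maxDeviation ? best.pos : -1;
};
// The newline-preceded hit at 4100 wins over the closer space-preceded hit at 3900.
console.assert(pickNearestCandidate([{ isNewline: false, pos: 3900 }, { isNewline: true, pos: 4100 }], 4000) === 4100);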
@@ -1314,6 +1346,7 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  * @param pageIds - Array of all page IDs
  * @param normalizedPages - Map of page ID to normalized content
  * @param cumulativeOffsets - Cumulative character offsets (for estimates)
+ * @param logger - Optional logger for debugging
  * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
  * with a sentinel boundary at segmentContent.length as the last element
  *
@@ -1322,12 +1355,12 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
  * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
  */
- const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+ const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
  const boundaryPositions = [0];
  const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
  for (let i = fromIdx + 1; i <= toIdx; i++) {
  const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
- const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
+ const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages, logger);
  const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
  if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
  else {
@@ -1371,18 +1404,20 @@ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
  * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
  * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
  */
- const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+ const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
  if (windowEndIdx >= toIdx) return remainingContent.length;
  const desiredNextIdx = windowEndIdx + 1;
  const minNextIdx = currentFromIdx + 1;
  const maxNextIdx = Math.min(desiredNextIdx, toIdx);
  const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
+ let bestExpectedBoundary = remainingContent.length;
  for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
  const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
- const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
+ if (nextIdx === maxNextIdx) bestExpectedBoundary = expectedBoundary;
+ const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
  if (pos > 0) return pos;
  }
- return remainingContent.length;
+ return Math.min(bestExpectedBoundary, remainingContent.length);
  };
  /**
  * Finds exclusion-based break position using raw cumulative offsets.
@@ -1460,7 +1495,8 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
  const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
  if (nextPageData) {
  const pos = findNextPagePosition(remainingContent, nextPageData);
- if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
+ const tolerance = Math.max(2e3, windowEndPosition * .5);
+ if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) return Math.min(pos, windowEndPosition, remainingContent.length);
  }
  }
  return Math.min(windowEndPosition, remainingContent.length);
@@ -1484,19 +1520,47 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
  if (skipWhenRegex?.test(remainingContent)) continue;
  if (regex === null) return {
- breakpointIndex: i,
  breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+ breakpointIndex: i,
  rule
  };
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
  if (breakPos > 0) return {
- breakpointIndex: i,
  breakPos,
+ breakpointIndex: i,
  rule
  };
  }
  return null;
  };
+ /**
+ * Searches backward from a target position to find a "safe" split point.
+ * A safe split point is after whitespace or punctuation.
+ *
+ * @param content The text content
+ * @param targetPosition The desired split position (hard limit)
+ * @param lookbackChars How far back to search for a safe break
+ * @returns The new split position (index), or -1 if no safe break found
+ */
+ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
+ const startSearch = Math.max(0, targetPosition - lookbackChars);
+ for (let i = targetPosition - 1; i >= startSearch; i--) {
+ const char = content[i];
+ if (/[\s\n.,;!?؛،۔]/.test(char)) return i + 1;
+ }
+ return -1;
+ };
+ /**
+ * Ensures the position does not split a surrogate pair.
+ * If position is between High and Low surrogate, returns position - 1.
+ */
+ const adjustForSurrogate = (content, position) => {
+ if (position <= 0 || position >= content.length) return position;
+ const high = content.charCodeAt(position - 1);
+ const low = content.charCodeAt(position);
+ if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) return position - 1;
+ return position;
+ };
 
  //#endregion
  //#region src/segmentation/debug-meta.ts
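
Note: `findSafeBreakPosition` and `adjustForSurrogate` are new internal helpers for the length-capped splitting path; nothing in the diff suggests they are exported. They are restated below in isolation (same logic as the added lines) with two illustrative checks; the sample strings are not from the package:

const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
  const startSearch = Math.max(0, targetPosition - lookbackChars);
  for (let i = targetPosition - 1; i >= startSearch; i--) {
    const char = content[i];
    if (/[\s\n.,;!?؛،۔]/.test(char)) return i + 1;
  }
  return -1;
};
const adjustForSurrogate = (content, position) => {
  if (position <= 0 || position >= content.length) return position;
  const high = content.charCodeAt(position - 1);
  const low = content.charCodeAt(position);
  if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) return position - 1;
  return position;
};
// Break just after the last whitespace/punctuation inside the lookback window ("one two |three" → index 8)...
console.assert(findSafeBreakPosition("one two three", 10) === 8);
// ...and never split a surrogate pair: index 2 falls inside the emoji, so it shifts back to 1.
console.assert(adjustForSurrogate("a\u{1F600}b", 2) === 1);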
@@ -1623,7 +1687,7 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
  *
  * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
  */
- const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
+ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength) => {
  if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
  const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
  if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
@@ -1639,6 +1703,11 @@ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx
  breakpointIndex: patternMatch.breakpointIndex,
  breakpointRule: patternMatch.rule
  };
+ if (maxContentLength && windowEndPosition === maxContentLength) {
+ const safeOffset = findSafeBreakPosition(remainingContent, windowEndPosition);
+ if (safeOffset !== -1) return { breakOffset: safeOffset };
+ return { breakOffset: adjustForSurrogate(remainingContent, windowEndPosition) };
+ }
  return { breakOffset: windowEndPosition };
  };
  /**
@@ -1655,71 +1724,118 @@ const skipWhitespace$1 = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
  const result = [];
  const fullContent = segment.content;
  let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
  let lastBreakpoint = null;
- const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+ const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
  logger?.debug?.("[breakpoints] boundaryPositions built", {
  boundaryPositions,
  fromIdx,
  fullContentLength: fullContent.length,
  toIdx
  });
- const maxIterations = 1e4;
- for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
+ let i = 0;
+ const MAX_SAFE_ITERATIONS = 1e5;
+ while (cursorPos < fullContent.length && currentFromIdx <= toIdx && i < MAX_SAFE_ITERATIONS) {
+ i++;
  const remainingContent = fullContent.slice(cursorPos);
  if (!remainingContent.trim()) break;
- const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
- const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
- if (remainingSpan <= maxPages && !remainingHasExclusions) {
- const includeMeta = isFirstPiece || Boolean(debugMetaKey);
- const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
- if (finalSeg) result.push(finalSeg);
- break;
- }
+ if (handleOversizedSegmentFit(remainingContent, currentFromIdx, toIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result)) break;
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
- const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+ const windowEndPosition = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
  logger?.debug?.(`[breakpoints] iteration=${i}`, {
  currentFromIdx,
  cursorPos,
- windowEndIdx
+ windowEndIdx,
+ windowEndPosition
  });
- const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+ const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
+ let breakOffset = found.breakOffset;
+ if (breakOffset <= 0) {
+ const fallbackPos = maxContentLength ? Math.min(maxContentLength, remainingContent.length) : 1;
+ breakOffset = Math.max(1, fallbackPos);
+ logger?.warn?.("[breakpoints] No progress from findBreakOffsetForWindow; forcing forward movement", {
+ breakOffset,
+ cursorPos
+ });
+ }
  if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
  breakpointIndex: found.breakpointIndex,
  rule: found.breakpointRule
  };
- const breakPos = cursorPos + found.breakOffset;
+ const breakPos = cursorPos + breakOffset;
  const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
- const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
- logger?.trace?.("[breakpoints] piece", {
- actualEndIdx,
- actualStartIdx,
- pieceLength: pieceContent.length
- });
  if (pieceContent) {
- const includeMeta = isFirstPiece || Boolean(debugMetaKey);
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
+ const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint), true);
  if (pieceSeg) result.push(pieceSeg);
- }
- cursorPos = skipWhitespace$1(fullContent, breakPos);
- currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
+ const next = advanceCursorAndIndex(fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages);
+ cursorPos = next.cursorPos;
+ currentFromIdx = next.currentFromIdx;
+ } else cursorPos = breakPos;
  isFirstPiece = false;
  }
+ if (i >= MAX_SAFE_ITERATIONS) logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
+ cursorPos,
+ fullContentLength: fullContent.length,
+ iterations: i
+ });
  logger?.debug?.("[breakpoints] done", { resultCount: result.length });
  return result;
  };
  /**
+ * Checks if the remaining content fits within paged/length limits.
+ * If so, pushes the final segment and returns true.
+ */
+ const handleOversizedSegmentFit = (remainingContent, currentFromIdx, toIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result) => {
+ const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+ const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
+ const fitsInPages = remainingSpan <= maxPages;
+ const fitsInLength = !maxContentLength || remainingContent.length <= maxContentLength;
+ if (fitsInPages && fitsInLength && !remainingHasExclusions) {
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint), includeMeta);
+ if (finalSeg) result.push(finalSeg);
+ return true;
+ }
+ return false;
+ };
+ /**
+ * Builds metadata for a segment piece, optionally including debug info.
+ */
+ const getSegmentMetaWithDebug = (isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint) => {
+ if (!(isFirstPiece || Boolean(debugMetaKey))) return;
+ if (debugMetaKey && lastBreakpoint) return mergeDebugIntoMeta(isFirstPiece ? originalMeta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule));
+ return isFirstPiece ? originalMeta : void 0;
+ };
+ /**
+ * Calculates window end position, capped by maxContentLength if present.
+ */
+ const getWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger) => {
+ let windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
+ if (maxContentLength && maxContentLength < windowEndPosition) windowEndPosition = maxContentLength;
+ return windowEndPosition;
+ };
+ /**
+ * Advances cursorPos and currentFromIdx for the next iteration.
+ */
+ const advanceCursorAndIndex = (fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages) => {
+ const nextCursorPos = skipWhitespace$1(fullContent, breakPos);
+ return {
+ currentFromIdx: computeNextFromIdx(fullContent.slice(nextCursorPos), actualEndIdx, toIdx, pageIds, normalizedPages),
+ cursorPos: nextCursorPos
+ };
+ };
+ /**
  * Applies breakpoints to oversized segments.
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey, maxContentLength) => {
  const pageIds = pages.map((p) => p.id);
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1743,11 +1859,13 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
  const segmentSpan = (segment.to ?? segment.from) - segment.from;
  const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
- if (segmentSpan <= maxPages && !hasExclusions) {
+ const fitsInPages = segmentSpan <= maxPages;
+ const fitsInLength = !maxContentLength || segment.content.length <= maxContentLength;
+ if (fitsInPages && fitsInLength && !hasExclusions) {
  result.push(segment);
  continue;
  }
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
  result.push(...broken.map((s) => {
  const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
  const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -2210,6 +2328,47 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
  return prevReq.test(lastChar);
  };
  };
+ /**
+ * Checks if a pageId matches the min/max/exclude constraints of a rule.
+ */
+ const passesRuleConstraints$1 = (rule, pageId) => {
+ return (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
+ };
+ /**
+ * Records a split point for a specific rule.
+ */
+ const recordSplitPointAt = (splitPointsByRule, ruleIndex, sp) => {
+ const arr = splitPointsByRule.get(ruleIndex);
+ if (!arr) {
+ splitPointsByRule.set(ruleIndex, [sp]);
+ return;
+ }
+ arr.push(sp);
+ };
+ /**
+ * Processes matches for all fast-fuzzy rules at a specific line start.
+ */
+ const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart, splitPointsByRule) => {
+ for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+ if (!passesRuleConstraints$1(rule, pageId)) continue;
+ if (isPageStart && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+ const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+ if (end === null) continue;
+ const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+ if (kind === "startsWith") recordSplitPointAt(splitPointsByRule, ruleIndex, {
+ index: splitIndex,
+ meta: rule.meta
+ });
+ else {
+ const markerLength = end - lineStart;
+ recordSplitPointAt(splitPointsByRule, ruleIndex, {
+ contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+ index: splitIndex,
+ meta: rule.meta
+ });
+ }
+ }
+ };
  const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
  const splitPointsByRule = /* @__PURE__ */ new Map();
  if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
@@ -2221,38 +2380,12 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
  currentBoundary = pageMap.boundaries[boundaryIdx];
  }
  };
- const recordSplitPoint = (ruleIndex, sp) => {
- const arr = splitPointsByRule.get(ruleIndex);
- if (!arr) {
- splitPointsByRule.set(ruleIndex, [sp]);
- return;
- }
- arr.push(sp);
- };
  const isPageStart = (offset) => offset === currentBoundary?.start;
  for (let lineStart = 0; lineStart <= matchContent.length;) {
  advanceBoundaryTo(lineStart);
  const pageId = currentBoundary?.id ?? 0;
  if (lineStart >= matchContent.length) break;
- for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
- if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
- if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
- const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
- if (end === null) continue;
- const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
- if (kind === "startsWith") recordSplitPoint(ruleIndex, {
- index: splitIndex,
- meta: rule.meta
- });
- else {
- const markerLength = end - lineStart;
- recordSplitPoint(ruleIndex, {
- contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
- index: splitIndex,
- meta: rule.meta
- });
- }
- }
+ processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart(lineStart), splitPointsByRule);
  const nextNl = matchContent.indexOf("\n", lineStart);
  if (nextNl === -1) break;
  lineStart = nextNl + 1;
@@ -2577,20 +2710,6 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  return content.replace(/\n/g, (match, offset) => breakSet.has(offset) ? " " : match);
  };
  /**
- * Applies breakpoints to oversized segments.
- *
- * For each segment that spans more than maxPages, tries the breakpoint patterns
- * in order to find a suitable split point. Structural markers (from rules) are
- * always respected - segments are only broken within their boundaries.
- *
- * @param segments - Initial segments from rule processing
- * @param pages - Original pages for page lookup
- * @param maxPages - Maximum pages before breakpoints apply
- * @param breakpoints - Patterns to try in order (tokens supported)
- * @param prefer - 'longer' for last match, 'shorter' for first match
- * @returns Processed segments with oversized ones broken up
- */
- /**
  * Segments pages of content based on pattern-matching rules.
  *
  * This is the main entry point for the segmentation engine. It takes an array
@@ -2633,11 +2752,14 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  * });
  */
  const segmentPages = (pages, options) => {
- const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+ const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength } = options;
+ if (maxContentLength && maxContentLength < 50) throw new Error(`maxContentLength must be at least 50 characters.`);
+ const maxPages = options.maxPages ?? (maxContentLength ? Number.MAX_SAFE_INTEGER : 0);
  const debug = resolveDebugConfig(options.debug);
  const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
  logger?.info?.("[segmenter] starting segmentation", {
  breakpointCount: breakpoints.length,
+ maxContentLength,
  maxPages,
  pageCount: pages.length,
  prefer,
@@ -2665,10 +2787,10 @@ const segmentPages = (pages, options) => {
  }))
  });
  segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
- if (maxPages >= 0 && breakpoints.length) {
+ if ((maxPages >= 0 || maxContentLength && maxContentLength > 0) && breakpoints.length) {
  logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
  const patternProcessor = (p) => processPattern(p, false).pattern;
- const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0, maxContentLength);
  logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
  return result;
  }
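
Note: taken together, these changes add an optional `maxContentLength` option to `segmentPages`: values under 50 throw, and when `maxPages` is omitted it defaults to `Number.MAX_SAFE_INTEGER` whenever `maxContentLength` is set, so splitting is driven by length alone. A hedged usage sketch; the import specifier, the page `content` field, and the breakpoint entry shape are assumptions, not confirmed by this diff:

// Assumed entry point and shapes; only maxContentLength (>= 50), breakpoints tried
// in order, and prefer ('longer' = last match, 'shorter' = first match) are visible above.
import { segmentPages } from "flappa-doormal";

const pages = [
  { id: 1, content: "First page text that runs long enough to need splitting..." },
  { id: 2, content: "Second page text, also fairly long..." }
];

const segments = segmentPages(pages, {
  breakpoints: ["\n\n"],   // tried in order on oversized segments (entry shape assumed)
  maxContentLength: 500,   // pieces longer than this are split; must be >= 50
  prefer: "shorter"
});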