flappa-doormal 2.10.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +91 -0
- package/LICENSE.md +1 -1
- package/README.md +110 -2
- package/dist/index.d.mts +12 -14
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +210 -88
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
|
@@ -1278,7 +1278,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
|
|
|
1278
1278
|
* This is used to define breakpoint windows in terms of actual content being split, rather than
|
|
1279
1279
|
* raw per-page offsets which can desync when structural rules strip markers.
|
|
1280
1280
|
*/
|
|
1281
|
-
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
|
|
1281
|
+
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
|
|
1282
1282
|
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
|
|
1283
1283
|
if (!targetPageData) return -1;
|
|
1284
1284
|
const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
|
|
@@ -1288,13 +1288,45 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1288
1288
|
for (const len of WINDOW_PREFIX_LENGTHS) {
|
|
1289
1289
|
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
|
|
1290
1290
|
if (!prefix) continue;
|
|
1291
|
+
const candidates = [];
|
|
1291
1292
|
let pos = remainingContent.indexOf(prefix, searchStart);
|
|
1292
1293
|
while (pos !== -1 && pos <= searchEnd) {
|
|
1293
|
-
if (pos > 0
|
|
1294
|
+
if (pos > 0) {
|
|
1295
|
+
const charBefore = remainingContent[pos - 1];
|
|
1296
|
+
if (charBefore === "\n") candidates.push({
|
|
1297
|
+
isNewline: true,
|
|
1298
|
+
pos
|
|
1299
|
+
});
|
|
1300
|
+
else if (/\s/.test(charBefore)) candidates.push({
|
|
1301
|
+
isNewline: false,
|
|
1302
|
+
pos
|
|
1303
|
+
});
|
|
1304
|
+
}
|
|
1294
1305
|
pos = remainingContent.indexOf(prefix, pos + 1);
|
|
1295
1306
|
}
|
|
1296
|
-
|
|
1297
|
-
|
|
1307
|
+
if (candidates.length > 0) {
|
|
1308
|
+
const newlineCandidates = candidates.filter((c) => c.isNewline);
|
|
1309
|
+
const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
|
|
1310
|
+
let bestCandidate = pool[0];
|
|
1311
|
+
let bestDistance = Math.abs(pool[0].pos - expectedBoundary);
|
|
1312
|
+
for (let i = 1; i < pool.length; i++) {
|
|
1313
|
+
const dist = Math.abs(pool[i].pos - expectedBoundary);
|
|
1314
|
+
if (dist < bestDistance) {
|
|
1315
|
+
bestDistance = dist;
|
|
1316
|
+
bestCandidate = pool[i];
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
const MAX_DEVIATION = 2e3;
|
|
1320
|
+
if (bestDistance <= MAX_DEVIATION) return bestCandidate.pos;
|
|
1321
|
+
logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
|
|
1322
|
+
bestDistance,
|
|
1323
|
+
expectedBoundary,
|
|
1324
|
+
matchPos: bestCandidate.pos,
|
|
1325
|
+
maxDeviation: MAX_DEVIATION,
|
|
1326
|
+
prefixLength: len,
|
|
1327
|
+
targetPageIdx
|
|
1328
|
+
});
|
|
1329
|
+
}
|
|
1298
1330
|
}
|
|
1299
1331
|
return -1;
|
|
1300
1332
|
};
|
|
@@ -1314,6 +1346,7 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1314
1346
|
* @param pageIds - Array of all page IDs
|
|
1315
1347
|
* @param normalizedPages - Map of page ID to normalized content
|
|
1316
1348
|
* @param cumulativeOffsets - Cumulative character offsets (for estimates)
|
|
1349
|
+
* @param logger - Optional logger for debugging
|
|
1317
1350
|
* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
|
|
1318
1351
|
* with a sentinel boundary at segmentContent.length as the last element
|
|
1319
1352
|
*
|
|
@@ -1322,12 +1355,12 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1322
1355
|
* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
|
|
1323
1356
|
* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
|
|
1324
1357
|
*/
|
|
1325
|
-
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
|
|
1358
|
+
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
|
|
1326
1359
|
const boundaryPositions = [0];
|
|
1327
1360
|
const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
|
|
1328
1361
|
for (let i = fromIdx + 1; i <= toIdx; i++) {
|
|
1329
1362
|
const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
|
|
1330
|
-
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
|
|
1363
|
+
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages, logger);
|
|
1331
1364
|
const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
|
|
1332
1365
|
if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
|
|
1333
1366
|
else {
|
|
@@ -1371,18 +1404,20 @@ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
|
|
|
1371
1404
|
* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
|
|
1372
1405
|
* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
|
|
1373
1406
|
*/
|
|
1374
|
-
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
  const contentLength = remainingContent.length;
  // The window already reaches the segment's last page: everything that remains is inside it.
  if (windowEndIdx >= toIdx) return contentLength;

  // Candidate "next page" indices are tried from the furthest allowed page back to the page
  // right after the current one.
  const furthestNextIdx = Math.min(windowEndIdx + 1, toIdx);
  const nearestNextIdx = currentFromIdx + 1;
  const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
  const fromOffset = cumulativeOffsets[currentFromIdx];

  // Expected start of a page inside remainingContent, derived from cumulative offsets;
  // falls back to the content length when either offset is unknown.
  const expectedBoundaryFor = (pageIdx) => {
    const pageOffset = cumulativeOffsets[pageIdx];
    if (pageOffset === void 0 || fromOffset === void 0) return contentLength;
    return Math.max(0, pageOffset - fromOffset - startOffsetInCurrentPage);
  };

  // Estimate for the furthest candidate; returned (clamped) when no page start is located.
  let bestExpectedBoundary = contentLength;
  for (let nextIdx = furthestNextIdx; nextIdx >= nearestNextIdx; nextIdx--) {
    const expectedBoundary = expectedBoundaryFor(nextIdx);
    if (nextIdx === furthestNextIdx) bestExpectedBoundary = expectedBoundary;
    const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
    if (pos > 0) return pos;
  }
  return Math.min(bestExpectedBoundary, contentLength);
};
|
|
1387
1422
|
/**
|
|
1388
1423
|
* Finds exclusion-based break position using raw cumulative offsets.
|
|
@@ -1460,7 +1495,8 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
|
|
|
1460
1495
|
const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
|
|
1461
1496
|
if (nextPageData) {
|
|
1462
1497
|
const pos = findNextPagePosition(remainingContent, nextPageData);
|
|
1463
|
-
|
|
1498
|
+
const tolerance = Math.max(2e3, windowEndPosition * .5);
|
|
1499
|
+
if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) return Math.min(pos, windowEndPosition, remainingContent.length);
|
|
1464
1500
|
}
|
|
1465
1501
|
}
|
|
1466
1502
|
return Math.min(windowEndPosition, remainingContent.length);
|
|
@@ -1484,19 +1520,47 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
|
|
|
1484
1520
|
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
1485
1521
|
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
1486
1522
|
if (regex === null) return {
|
|
1487
|
-
breakpointIndex: i,
|
|
1488
1523
|
breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
|
|
1524
|
+
breakpointIndex: i,
|
|
1489
1525
|
rule
|
|
1490
1526
|
};
|
|
1491
1527
|
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
1492
1528
|
if (breakPos > 0) return {
|
|
1493
|
-
breakpointIndex: i,
|
|
1494
1529
|
breakPos,
|
|
1530
|
+
breakpointIndex: i,
|
|
1495
1531
|
rule
|
|
1496
1532
|
};
|
|
1497
1533
|
}
|
|
1498
1534
|
return null;
|
|
1499
1535
|
};
|
|
1536
|
+
/**
 * Searches backward from a target position to find a "safe" split point.
 * A safe split point is the index immediately after a whitespace or punctuation
 * character: ASCII `. , ; ! ?`, Arabic semicolon `؛`, Arabic comma `،`, or `۔`.
 *
 * @param content The text content
 * @param targetPosition The desired split position (hard limit; the result never exceeds it)
 * @param lookbackChars How far back from targetPosition to search for a safe break
 * @returns The new split position (index), or -1 if no safe break found
 */
const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
  // Built once per call instead of once per scanned character; `\s` already covers `\n`.
  const safeBreakChar = /[\s.,;!?؛،۔]/;
  const startSearch = Math.max(0, targetPosition - lookbackChars);
  // Indices at or past content.length hold no character and can never match, so start inside the string.
  const firstIdx = Math.min(targetPosition, content.length) - 1;
  for (let i = firstIdx; i >= startSearch; i--) {
    if (safeBreakChar.test(content[i])) return i + 1;
  }
  return -1;
};
|
|
1553
|
+
/**
 * Keeps a split position from landing inside a UTF-16 surrogate pair.
 *
 * When the code unit before `position` is a high surrogate and the code unit at
 * `position` is a low surrogate, the position is moved back by one so the pair
 * stays together. Positions at or outside the string edges are returned unchanged.
 *
 * @param content The text being split
 * @param position Candidate split index into `content`
 * @returns `position`, or `position - 1` when it would cut a surrogate pair in half
 */
const adjustForSurrogate = (content, position) => {
  const isInterior = position > 0 && position < content.length;
  if (!isInterior) return position;

  const unitBefore = content.charCodeAt(position - 1);
  const unitAt = content.charCodeAt(position);
  const endsWithHighSurrogate = unitBefore >= 0xd800 && unitBefore <= 0xdbff;
  const startsWithLowSurrogate = unitAt >= 0xdc00 && unitAt <= 0xdfff;

  return endsWithHighSurrogate && startsWithLowSurrogate ? position - 1 : position;
};
|
|
1500
1564
|
|
|
1501
1565
|
//#endregion
|
|
1502
1566
|
//#region src/segmentation/debug-meta.ts
|
|
@@ -1623,7 +1687,7 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
|
|
|
1623
1687
|
*
|
|
1624
1688
|
* @returns Break offset relative to remainingContent, or windowEndPosition as fallback
|
|
1625
1689
|
*/
|
|
1626
|
-
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
|
|
1690
|
+
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength) => {
|
|
1627
1691
|
if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
|
|
1628
1692
|
const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
|
|
1629
1693
|
if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
|
|
@@ -1639,6 +1703,11 @@ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx
|
|
|
1639
1703
|
breakpointIndex: patternMatch.breakpointIndex,
|
|
1640
1704
|
breakpointRule: patternMatch.rule
|
|
1641
1705
|
};
|
|
1706
|
+
if (maxContentLength && windowEndPosition === maxContentLength) {
|
|
1707
|
+
const safeOffset = findSafeBreakPosition(remainingContent, windowEndPosition);
|
|
1708
|
+
if (safeOffset !== -1) return { breakOffset: safeOffset };
|
|
1709
|
+
return { breakOffset: adjustForSurrogate(remainingContent, windowEndPosition) };
|
|
1710
|
+
}
|
|
1642
1711
|
return { breakOffset: windowEndPosition };
|
|
1643
1712
|
};
|
|
1644
1713
|
/**
|
|
@@ -1655,71 +1724,118 @@ const skipWhitespace$1 = (content, startPos) => {
|
|
|
1655
1724
|
*
|
|
1656
1725
|
* Uses precomputed boundary positions for O(log n) page attribution lookups.
|
|
1657
1726
|
*/
|
|
1658
|
-
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
  // Upper bound on loop iterations; hitting it is reported as an error after the loop.
  const MAX_SAFE_ITERATIONS = 1e5;
  const result = [];
  const fullContent = segment.content;
  let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
  let lastBreakpoint = null;

  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
  logger?.debug?.("[breakpoints] boundaryPositions built", {
    boundaryPositions,
    fromIdx,
    fullContentLength: fullContent.length,
    toIdx
  });

  let i = 0;
  while (cursorPos < fullContent.length && currentFromIdx <= toIdx && i < MAX_SAFE_ITERATIONS) {
    i++;
    const remainingContent = fullContent.slice(cursorPos);
    // Nothing but whitespace left: no further pieces to emit.
    if (!remainingContent.trim()) break;

    // If the remainder already satisfies the limits it is pushed as the final segment.
    const remainderFits = handleOversizedSegmentFit(remainingContent, currentFromIdx, toIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result);
    if (remainderFits) break;

    const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
    const windowEndPosition = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
    logger?.debug?.(`[breakpoints] iteration=${i}`, {
      currentFromIdx,
      cursorPos,
      windowEndIdx,
      windowEndPosition
    });

    const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
    let breakOffset = found.breakOffset;
    if (breakOffset <= 0) {
      // A non-positive offset would leave the cursor in place; force at least one character of progress.
      const fallbackPos = maxContentLength ? Math.min(maxContentLength, remainingContent.length) : 1;
      breakOffset = Math.max(1, fallbackPos);
      logger?.warn?.("[breakpoints] No progress from findBreakOffsetForWindow; forcing forward movement", {
        breakOffset,
        cursorPos
      });
    }

    // Remember which breakpoint rule produced this split so later pieces can carry it as debug meta.
    if (found.breakpointIndex !== void 0 && found.breakpointRule) {
      lastBreakpoint = {
        breakpointIndex: found.breakpointIndex,
        rule: found.breakpointRule
      };
    }

    const breakPos = cursorPos + breakOffset;
    const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
    if (pieceContent) {
      const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
      const pieceMeta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint);
      const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, pieceMeta, true);
      if (pieceSeg) result.push(pieceSeg);
      const next = advanceCursorAndIndex(fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages);
      cursorPos = next.cursorPos;
      currentFromIdx = next.currentFromIdx;
    } else {
      // Whitespace-only slice: move past it without emitting a piece.
      cursorPos = breakPos;
    }
    isFirstPiece = false;
  }

  if (i >= MAX_SAFE_ITERATIONS) {
    logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
      cursorPos,
      fullContentLength: fullContent.length,
      iterations: i
    });
  }
  logger?.debug?.("[breakpoints] done", { resultCount: result.length });
  return result;
};
|
|
1717
1790
|
/**
 * Decides whether the remaining content can be emitted as one final segment.
 *
 * The remainder "fits" when its page span is within `maxPages`, its length is within
 * `maxContentLength` (when one is set), and no excluded page lies in the remaining range.
 * When it fits, the final segment (if one is created) is appended to `result`.
 *
 * @returns true when the remainder fit and was handled, false when it must still be split
 */
const handleOversizedSegmentFit = (remainingContent, currentFromIdx, toIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result) => {
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);

  const exceedsPages = !(remainingSpan <= maxPages);
  const exceedsLength = Boolean(maxContentLength) && !(remainingContent.length <= maxContentLength);
  if (exceedsPages || exceedsLength || remainingHasExclusions) return false;

  const segmentMeta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint);
  const includeMeta = isFirstPiece || Boolean(debugMetaKey);
  const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segmentMeta, includeMeta);
  if (finalSeg) result.push(finalSeg);
  return true;
};
|
|
1807
|
+
/**
 * Chooses the metadata to attach to a segment piece.
 *
 * Only the first piece carries the original metadata. When a debug meta key is set and a
 * breakpoint has been recorded, a breakpoint debug patch is merged in under that key.
 *
 * @returns The metadata for the piece, or undefined when the piece carries none
 */
const getSegmentMetaWithDebug = (isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint) => {
  if (!isFirstPiece && !debugMetaKey) return;

  const baseMeta = isFirstPiece ? originalMeta : void 0;
  if (!debugMetaKey || !lastBreakpoint) return baseMeta;

  const debugPatch = buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule);
  return mergeDebugIntoMeta(baseMeta, debugMetaKey, debugPatch);
};
|
|
1815
|
+
/**
 * Computes where the current breakpoint window ends inside `remainingContent`.
 *
 * Starts from the page-based window end and, when `maxContentLength` is set and smaller
 * than that position, uses `maxContentLength` instead.
 *
 * @returns The window end offset relative to `remainingContent`
 */
const getWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger) => {
  const pageWindowEnd = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
  const lengthCapApplies = maxContentLength && maxContentLength < pageWindowEnd;
  return lengthCapApplies ? maxContentLength : pageWindowEnd;
};
|
|
1823
|
+
/**
 * Computes the loop state for the next iteration after a piece has been emitted.
 *
 * The cursor is moved to the position `skipWhitespace$1` returns for `breakPos`, and the
 * next starting page index is computed from the content that remains after that position.
 *
 * @returns `{ currentFromIdx, cursorPos }` for the next iteration
 */
const advanceCursorAndIndex = (fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages) => {
  const cursorPos = skipWhitespace$1(fullContent, breakPos);
  const unconsumedContent = fullContent.slice(cursorPos);
  const currentFromIdx = computeNextFromIdx(unconsumedContent, actualEndIdx, toIdx, pageIds, normalizedPages);
  return { currentFromIdx, cursorPos };
};
|
|
1833
|
+
/**
|
|
1718
1834
|
* Applies breakpoints to oversized segments.
|
|
1719
1835
|
*
|
|
1720
1836
|
* Note: This is an internal engine used by `segmentPages()`.
|
|
1721
1837
|
*/
|
|
1722
|
-
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
|
|
1838
|
+
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey, maxContentLength) => {
|
|
1723
1839
|
const pageIds = pages.map((p) => p.id);
|
|
1724
1840
|
const pageIdToIndex = buildPageIdToIndexMap(pageIds);
|
|
1725
1841
|
const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
|
|
@@ -1743,11 +1859,13 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
1743
1859
|
const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
|
|
1744
1860
|
const segmentSpan = (segment.to ?? segment.from) - segment.from;
|
|
1745
1861
|
const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
|
|
1746
|
-
|
|
1862
|
+
const fitsInPages = segmentSpan <= maxPages;
|
|
1863
|
+
const fitsInLength = !maxContentLength || segment.content.length <= maxContentLength;
|
|
1864
|
+
if (fitsInPages && fitsInLength && !hasExclusions) {
|
|
1747
1865
|
result.push(segment);
|
|
1748
1866
|
continue;
|
|
1749
1867
|
}
|
|
1750
|
-
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
|
|
1868
|
+
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
|
|
1751
1869
|
result.push(...broken.map((s) => {
|
|
1752
1870
|
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
|
|
1753
1871
|
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
|
|
@@ -2210,6 +2328,47 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
|
2210
2328
|
return prevReq.test(lastChar);
|
|
2211
2329
|
};
|
|
2212
2330
|
};
|
|
2331
|
+
/**
 * Tells whether a rule applies to the given page.
 *
 * A rule applies when the page ID is not below `rule.min` (if set), not above `rule.max`
 * (if set), and not excluded by `rule.exclude`.
 *
 * @returns true when the page satisfies all of the rule's min/max/exclude constraints
 */
const passesRuleConstraints$1 = (rule, pageId) => {
  const { min, max } = rule;
  if (min !== void 0 && !(pageId >= min)) return false;
  if (max !== void 0 && !(pageId <= max)) return false;
  return !isPageExcluded(pageId, rule.exclude);
};
|
|
2337
|
+
/**
 * Appends a split point to the list kept for a rule, creating the list on first use.
 *
 * @param splitPointsByRule Map from rule index to that rule's array of split points (mutated)
 * @param ruleIndex Index of the rule that produced the split point
 * @param sp The split point to record
 */
const recordSplitPointAt = (splitPointsByRule, ruleIndex, sp) => {
  const existing = splitPointsByRule.get(ruleIndex);
  if (existing) existing.push(sp);
  else splitPointsByRule.set(ruleIndex, [sp]);
};
|
|
2348
|
+
/**
 * Runs every fast-fuzzy rule against one line start and records the resulting split points.
 *
 * A rule is skipped when the page fails its constraints, when the line is a page start and
 * the page-start guard rejects it, or when its compiled token does not match at `lineStart`.
 * The split index is `lineStart` when the rule splits "at" (the default), otherwise the match end.
 * For kinds other than "startsWith", an "at" split also records the marker length as
 * `contentStartOffset`.
 */
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart, splitPointsByRule) => {
  for (const entry of fastFuzzyRules) {
    const { compiled, kind, rule, ruleIndex } = entry;

    const applies = passesRuleConstraints$1(rule, pageId) && (!isPageStart || passesPageStartGuard(rule, ruleIndex, lineStart));
    if (!applies) continue;

    const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
    if (end === null) continue;

    const splitsAtMarker = (rule.split ?? "at") === "at";
    const splitIndex = splitsAtMarker ? lineStart : end;

    let splitPoint;
    if (kind === "startsWith") {
      splitPoint = {
        index: splitIndex,
        meta: rule.meta
      };
    } else {
      const markerLength = end - lineStart;
      splitPoint = {
        contentStartOffset: splitsAtMarker ? markerLength : void 0,
        index: splitIndex,
        meta: rule.meta
      };
    }
    recordSplitPointAt(splitPointsByRule, ruleIndex, splitPoint);
  }
};
|
|
2213
2372
|
const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
|
|
2214
2373
|
const splitPointsByRule = /* @__PURE__ */ new Map();
|
|
2215
2374
|
if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
|
|
@@ -2221,38 +2380,12 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
|
|
|
2221
2380
|
currentBoundary = pageMap.boundaries[boundaryIdx];
|
|
2222
2381
|
}
|
|
2223
2382
|
};
|
|
2224
|
-
const recordSplitPoint = (ruleIndex, sp) => {
|
|
2225
|
-
const arr = splitPointsByRule.get(ruleIndex);
|
|
2226
|
-
if (!arr) {
|
|
2227
|
-
splitPointsByRule.set(ruleIndex, [sp]);
|
|
2228
|
-
return;
|
|
2229
|
-
}
|
|
2230
|
-
arr.push(sp);
|
|
2231
|
-
};
|
|
2232
2383
|
const isPageStart = (offset) => offset === currentBoundary?.start;
|
|
2233
2384
|
for (let lineStart = 0; lineStart <= matchContent.length;) {
|
|
2234
2385
|
advanceBoundaryTo(lineStart);
|
|
2235
2386
|
const pageId = currentBoundary?.id ?? 0;
|
|
2236
2387
|
if (lineStart >= matchContent.length) break;
|
|
2237
|
-
|
|
2238
|
-
if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
|
|
2239
|
-
if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
|
|
2240
|
-
const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
|
|
2241
|
-
if (end === null) continue;
|
|
2242
|
-
const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
|
|
2243
|
-
if (kind === "startsWith") recordSplitPoint(ruleIndex, {
|
|
2244
|
-
index: splitIndex,
|
|
2245
|
-
meta: rule.meta
|
|
2246
|
-
});
|
|
2247
|
-
else {
|
|
2248
|
-
const markerLength = end - lineStart;
|
|
2249
|
-
recordSplitPoint(ruleIndex, {
|
|
2250
|
-
contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
|
|
2251
|
-
index: splitIndex,
|
|
2252
|
-
meta: rule.meta
|
|
2253
|
-
});
|
|
2254
|
-
}
|
|
2255
|
-
}
|
|
2388
|
+
processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart(lineStart), splitPointsByRule);
|
|
2256
2389
|
const nextNl = matchContent.indexOf("\n", lineStart);
|
|
2257
2390
|
if (nextNl === -1) break;
|
|
2258
2391
|
lineStart = nextNl + 1;
|
|
@@ -2577,20 +2710,6 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
2577
2710
|
return content.replace(/\n/g, (match, offset) => breakSet.has(offset) ? " " : match);
|
|
2578
2711
|
};
|
|
2579
2712
|
/**
|
|
2580
|
-
* Applies breakpoints to oversized segments.
|
|
2581
|
-
*
|
|
2582
|
-
* For each segment that spans more than maxPages, tries the breakpoint patterns
|
|
2583
|
-
* in order to find a suitable split point. Structural markers (from rules) are
|
|
2584
|
-
* always respected - segments are only broken within their boundaries.
|
|
2585
|
-
*
|
|
2586
|
-
* @param segments - Initial segments from rule processing
|
|
2587
|
-
* @param pages - Original pages for page lookup
|
|
2588
|
-
* @param maxPages - Maximum pages before breakpoints apply
|
|
2589
|
-
* @param breakpoints - Patterns to try in order (tokens supported)
|
|
2590
|
-
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
2591
|
-
* @returns Processed segments with oversized ones broken up
|
|
2592
|
-
*/
|
|
2593
|
-
/**
|
|
2594
2713
|
* Segments pages of content based on pattern-matching rules.
|
|
2595
2714
|
*
|
|
2596
2715
|
* This is the main entry point for the segmentation engine. It takes an array
|
|
@@ -2633,11 +2752,14 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
2633
2752
|
* });
|
|
2634
2753
|
*/
|
|
2635
2754
|
const segmentPages = (pages, options) => {
|
|
2636
|
-
const { rules = [],
|
|
2755
|
+
const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength } = options;
|
|
2756
|
+
if (maxContentLength && maxContentLength < 50) throw new Error(`maxContentLength must be at least 50 characters.`);
|
|
2757
|
+
const maxPages = options.maxPages ?? (maxContentLength ? Number.MAX_SAFE_INTEGER : 0);
|
|
2637
2758
|
const debug = resolveDebugConfig(options.debug);
|
|
2638
2759
|
const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
|
|
2639
2760
|
logger?.info?.("[segmenter] starting segmentation", {
|
|
2640
2761
|
breakpointCount: breakpoints.length,
|
|
2762
|
+
maxContentLength,
|
|
2641
2763
|
maxPages,
|
|
2642
2764
|
pageCount: pages.length,
|
|
2643
2765
|
prefer,
|
|
@@ -2665,10 +2787,10 @@ const segmentPages = (pages, options) => {
|
|
|
2665
2787
|
}))
|
|
2666
2788
|
});
|
|
2667
2789
|
segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
|
|
2668
|
-
if (maxPages >= 0 && breakpoints.length) {
|
|
2790
|
+
if ((maxPages >= 0 || maxContentLength && maxContentLength > 0) && breakpoints.length) {
|
|
2669
2791
|
logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
|
|
2670
2792
|
const patternProcessor = (p) => processPattern(p, false).pattern;
|
|
2671
|
-
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
|
|
2793
|
+
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0, maxContentLength);
|
|
2672
2794
|
logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
|
|
2673
2795
|
return result;
|
|
2674
2796
|
}
|