flappa-doormal 2.10.0 → 2.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -383,6 +383,8 @@ bunx biome lint .
383
383
 
384
384
  11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.
385
385
 
386
+ 12. **Prefix matching fails with duplicated content**: When using `indexOf()` to find page boundaries by matching prefixes, false positives occur when pages have identical prefixes AND content is duplicated within pages. Solution: use cumulative character offsets as the source of truth for expected boundaries, and only accept prefix matches within a strict deviation threshold (2000 chars). When content-based detection fails, fall back directly to the calculated offset rather than returning `remainingContent.length` (which merges all remaining pages).
387
+
386
388
  ### For Future AI Agents (Recovery + Repo gotchas)
387
389
 
388
390
  1. **`lineStartsAfter` vs `lineStartsWith` is not “cosmetic”**: `lineStartsAfter` changes output by stripping the matched marker via an internal `contentStartOffset` during segment construction. If a client used it by accident, you cannot reconstruct the exact stripped prefix from output alone without referencing the original pages and re-matching the marker.
@@ -538,4 +540,69 @@ Use analysis functions to discover patterns, then pass to `segmentPages()`:
538
540
 
539
541
  See README.md for complete examples.
540
542
 
543
+ ---
544
+
545
+ ## Debugging Page Boundary Detection (Added 2026-01-04)
546
+
547
+ ### The Problem: False Positives in Prefix Matching
548
+
549
+ When using `maxPages=0` with empty breakpoint `['']` (page boundary breaks), the segmenter can fail when:
550
+ 1. **Pages have identical prefixes** - All pages start with the same text
551
+ 2. **Duplicated content within pages** - The same phrase appears multiple times in a single page
552
+ 3. **Long content** - Pages are thousands of characters long, so a false match inside a page can land close to an expected boundary
553
+
554
+ **Root cause**: The `findPageStartNearExpectedBoundary` function in `breakpoint-utils.ts` uses prefix matching to find page boundaries. When content is duplicated, it finds matches at incorrect positions within the current page instead of at the actual page boundary.
555
+
556
+ ### Key Functions in the Breakpoint Chain
557
+
558
+ 1. **`applyBreakpoints()`** - Entry point for breakpoint processing
559
+ 2. **`processOversizedSegment()`** - Iteratively breaks segments exceeding `maxPages`
560
+ 3. **`computeWindowEndIdx()`** - Calculates max page index for current window
561
+ 4. **`findBreakpointWindowEndPosition()`** - Finds the character position (string index into `remainingContent`) where the window ends
562
+ 5. **`findPageStartNearExpectedBoundary()`** - Content-based search for page start position
563
+ 6. **`handlePageBoundaryBreak()`** - Handles empty pattern `''` (page boundary)
564
+ 7. **`buildCumulativeOffsets()`** - Pre-computes exact character offsets for each page
565
+
566
+ ### Debug Strategy
567
+
568
+ 1. **Check cumulative offsets first** - `buildCumulativeOffsets()` returns correct positions from `pages.join('\n')`
569
+ 2. **Trace `expectedBoundary`** - This is calculated correctly from cumulative offsets
570
+ 3. **Check `findPageStartNearExpectedBoundary` candidates** - The bug is usually here; it finds false matches
571
+ 4. **Verify the deviation check** - Matches must be within `MAX_DEVIATION` (2000 chars) of expected boundary
572
+
573
+ ### The Fix Applied
574
+
575
+ Two changes in `breakpoint-utils.ts`:
541
576
 
577
+ 1. **`findPageStartNearExpectedBoundary`** - Added `MAX_DEVIATION` check to reject matches too far from expected boundary:
578
+ ```typescript
579
+ const MAX_DEVIATION = 2000;
580
+ if (bestDistance <= MAX_DEVIATION) {
581
+ return bestCandidate.pos;
582
+ }
583
+ // Continue trying shorter prefixes or return -1
584
+ ```
585
+
586
+ 2. **`findBreakpointWindowEndPosition`** - Changed fallback from `remainingContent.length` to `bestExpectedBoundary` (clamped to `remainingContent.length`):
587
+ ```typescript
588
+ // Before (bug): return remainingContent.length; // Merges all remaining pages!
589
+ // After (fix): return Math.min(bestExpectedBoundary, remainingContent.length);
590
+ ```
591
+
592
+ ### Test Case Pattern for This Bug
593
+
594
+ ```typescript
595
+ it('should correctly split pages with identical prefixes and duplicated content', () => {
596
+ const sharedPrefix = 'SHARED PREFIX ';
597
+ const filler = 'Lorem ipsum. '.repeat(200); // ~6000 chars
598
+ const pages: Page[] = [
599
+ { content: sharedPrefix + 'start ' + filler + sharedPrefix + 'end', id: 0 },
600
+ { content: sharedPrefix + 'page1', id: 1 },
601
+ { content: sharedPrefix + 'page2', id: 2 },
602
+ ];
603
+ const result = segmentPages(pages, { breakpoints: [''], maxPages: 0 });
604
+ expect(result).toHaveLength(3); // Without fix: 2 or 1
605
+ });
606
+ ```
607
+
608
+ ---
package/dist/index.mjs CHANGED
@@ -1278,7 +1278,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
1278
1278
  * This is used to define breakpoint windows in terms of actual content being split, rather than
1279
1279
  * raw per-page offsets which can desync when structural rules strip markers.
1280
1280
  */
1281
- const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
1281
+ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
1282
1282
  const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
1283
1283
  if (!targetPageData) return -1;
1284
1284
  const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -1288,13 +1288,45 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
1288
1288
  for (const len of WINDOW_PREFIX_LENGTHS) {
1289
1289
  const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
1290
1290
  if (!prefix) continue;
1291
+ const candidates = [];
1291
1292
  let pos = remainingContent.indexOf(prefix, searchStart);
1292
1293
  while (pos !== -1 && pos <= searchEnd) {
1293
- if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
1294
+ if (pos > 0) {
1295
+ const charBefore = remainingContent[pos - 1];
1296
+ if (charBefore === "\n") candidates.push({
1297
+ isNewline: true,
1298
+ pos
1299
+ });
1300
+ else if (/\s/.test(charBefore)) candidates.push({
1301
+ isNewline: false,
1302
+ pos
1303
+ });
1304
+ }
1294
1305
  pos = remainingContent.indexOf(prefix, pos + 1);
1295
1306
  }
1296
- const last = remainingContent.lastIndexOf(prefix, approx);
1297
- if (last > 0) return last;
1307
+ if (candidates.length > 0) {
1308
+ const newlineCandidates = candidates.filter((c) => c.isNewline);
1309
+ const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
1310
+ let bestCandidate = pool[0];
1311
+ let bestDistance = Math.abs(pool[0].pos - expectedBoundary);
1312
+ for (let i = 1; i < pool.length; i++) {
1313
+ const dist = Math.abs(pool[i].pos - expectedBoundary);
1314
+ if (dist < bestDistance) {
1315
+ bestDistance = dist;
1316
+ bestCandidate = pool[i];
1317
+ }
1318
+ }
1319
+ const MAX_DEVIATION = 2e3;
1320
+ if (bestDistance <= MAX_DEVIATION) return bestCandidate.pos;
1321
+ logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
1322
+ targetPageIdx,
1323
+ expectedBoundary,
1324
+ bestDistance,
1325
+ maxDeviation: MAX_DEVIATION,
1326
+ matchPos: bestCandidate.pos,
1327
+ prefixLength: len
1328
+ });
1329
+ }
1298
1330
  }
1299
1331
  return -1;
1300
1332
  };
@@ -1314,6 +1346,7 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
1314
1346
  * @param pageIds - Array of all page IDs
1315
1347
  * @param normalizedPages - Map of page ID to normalized content
1316
1348
  * @param cumulativeOffsets - Cumulative character offsets (for estimates)
1349
+ * @param logger - Optional logger for debugging
1317
1350
  * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
1318
1351
  * with a sentinel boundary at segmentContent.length as the last element
1319
1352
  *
@@ -1322,12 +1355,12 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
1322
1355
  * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
1323
1356
  * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
1324
1357
  */
1325
- const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
1358
+ const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
1326
1359
  const boundaryPositions = [0];
1327
1360
  const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
1328
1361
  for (let i = fromIdx + 1; i <= toIdx; i++) {
1329
1362
  const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
1330
- const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
1363
+ const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages, logger);
1331
1364
  const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
1332
1365
  if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
1333
1366
  else {
@@ -1371,18 +1404,20 @@ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
1371
1404
  * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
1372
1405
  * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
1373
1406
  */
1374
- const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
1407
+ const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
1375
1408
  if (windowEndIdx >= toIdx) return remainingContent.length;
1376
1409
  const desiredNextIdx = windowEndIdx + 1;
1377
1410
  const minNextIdx = currentFromIdx + 1;
1378
1411
  const maxNextIdx = Math.min(desiredNextIdx, toIdx);
1379
1412
  const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
1413
+ let bestExpectedBoundary = remainingContent.length;
1380
1414
  for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
1381
1415
  const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
1382
- const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
1416
+ if (nextIdx === maxNextIdx) bestExpectedBoundary = expectedBoundary;
1417
+ const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
1383
1418
  if (pos > 0) return pos;
1384
1419
  }
1385
- return remainingContent.length;
1420
+ return Math.min(bestExpectedBoundary, remainingContent.length);
1386
1421
  };
1387
1422
  /**
1388
1423
  * Finds exclusion-based break position using raw cumulative offsets.
@@ -1460,7 +1495,8 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
1460
1495
  const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
1461
1496
  if (nextPageData) {
1462
1497
  const pos = findNextPagePosition(remainingContent, nextPageData);
1463
- if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
1498
+ const tolerance = Math.max(2e3, windowEndPosition * .5);
1499
+ if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) return Math.min(pos, windowEndPosition, remainingContent.length);
1464
1500
  }
1465
1501
  }
1466
1502
  return Math.min(windowEndPosition, remainingContent.length);
@@ -1484,14 +1520,14 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
1484
1520
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
1485
1521
  if (skipWhenRegex?.test(remainingContent)) continue;
1486
1522
  if (regex === null) return {
1487
- breakpointIndex: i,
1488
1523
  breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
1524
+ breakpointIndex: i,
1489
1525
  rule
1490
1526
  };
1491
1527
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
1492
1528
  if (breakPos > 0) return {
1493
- breakpointIndex: i,
1494
1529
  breakPos,
1530
+ breakpointIndex: i,
1495
1531
  rule
1496
1532
  };
1497
1533
  }
@@ -1662,7 +1698,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1662
1698
  let currentFromIdx = fromIdx;
1663
1699
  let isFirstPiece = true;
1664
1700
  let lastBreakpoint = null;
1665
- const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
1701
+ const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
1666
1702
  logger?.debug?.("[breakpoints] boundaryPositions built", {
1667
1703
  boundaryPositions,
1668
1704
  fromIdx,
@@ -1683,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1683
1719
  break;
1684
1720
  }
1685
1721
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
1686
- const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
1722
+ const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
1687
1723
  logger?.debug?.(`[breakpoints] iteration=${i}`, {
1688
1724
  currentFromIdx,
1689
1725
  cursorPos,