flappa-doormal 2.10.0 → 2.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +67 -0
- package/dist/index.mjs +50 -14
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/AGENTS.md
CHANGED
|
@@ -383,6 +383,8 @@ bunx biome lint .
|
|
|
383
383
|
|
|
384
384
|
11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.
|
|
385
385
|
|
|
386
|
+
12. **Prefix matching fails with duplicated content**: When using `indexOf()` to find page boundaries by matching prefixes, false positives occur when pages have identical prefixes AND content is duplicated within pages. Solution: use cumulative character offsets as the source of truth for expected boundaries, and only accept prefix matches within a strict deviation threshold (2000 chars). When content-based detection fails, fall back directly to the calculated offset rather than returning `remainingContent.length` (which merges all remaining pages).
|
|
387
|
+
|
|
386
388
|
### For Future AI Agents (Recovery + Repo gotchas)
|
|
387
389
|
|
|
388
390
|
1. **`lineStartsAfter` vs `lineStartsWith` is not “cosmetic”**: `lineStartsAfter` changes output by stripping the matched marker via an internal `contentStartOffset` during segment construction. If a client used it by accident, you cannot reconstruct the exact stripped prefix from output alone without referencing the original pages and re-matching the marker.
|
|
@@ -538,4 +540,69 @@ Use analysis functions to discover patterns, then pass to `segmentPages()`:
|
|
|
538
540
|
|
|
539
541
|
See README.md for complete examples.
|
|
540
542
|
|
|
543
|
+
---
|
|
544
|
+
|
|
545
|
+
## Debugging Page Boundary Detection (Added 2026-01-04)
|
|
546
|
+
|
|
547
|
+
### The Problem: False Positives in Prefix Matching
|
|
548
|
+
|
|
549
|
+
When using `maxPages=0` with empty breakpoint `['']` (page boundary breaks), the segmenter can fail when:
|
|
550
|
+
1. **Pages have identical prefixes** - All pages start with the same text
|
|
551
|
+
2. **Duplicated content within pages** - The same phrase appears multiple times in a single page
|
|
552
|
+
3. **Long content** - Pages are thousands of characters long, so false matches can fall close to the expected boundaries
|
|
553
|
+
|
|
554
|
+
**Root cause**: The `findPageStartNearExpectedBoundary` function in `breakpoint-utils.ts` uses prefix matching to find page boundaries. When content is duplicated, it finds matches at incorrect positions within the current page instead of at the actual page boundary.
|
|
555
|
+
|
|
556
|
+
### Key Functions in the Breakpoint Chain
|
|
557
|
+
|
|
558
|
+
1. **`applyBreakpoints()`** - Entry point for breakpoint processing
|
|
559
|
+
2. **`processOversizedSegment()`** - Iteratively breaks segments exceeding `maxPages`
|
|
560
|
+
3. **`computeWindowEndIdx()`** - Calculates max page index for current window
|
|
561
|
+
4. **`findBreakpointWindowEndPosition()`** - Finds the character position in `remainingContent` where the window ends
|
|
562
|
+
5. **`findPageStartNearExpectedBoundary()`** - Content-based search for page start position
|
|
563
|
+
6. **`handlePageBoundaryBreak()`** - Handles empty pattern `''` (page boundary)
|
|
564
|
+
7. **`buildCumulativeOffsets()`** - Pre-computes exact character positions for each page
|
|
565
|
+
|
|
566
|
+
### Debug Strategy
|
|
567
|
+
|
|
568
|
+
1. **Check cumulative offsets first** - `buildCumulativeOffsets()` returns correct positions from `pages.join('\n')`
|
|
569
|
+
2. **Trace `expectedBoundary`** - This is calculated correctly from cumulative offsets
|
|
570
|
+
3. **Check `findPageStartNearExpectedBoundary` candidates** - The bug is usually here; it finds false matches
|
|
571
|
+
4. **Verify the deviation check** - Matches must be within `MAX_DEVIATION` (2000 chars) of expected boundary
|
|
572
|
+
|
|
573
|
+
### The Fix Applied
|
|
574
|
+
|
|
575
|
+
Two changes in `breakpoint-utils.ts`:
|
|
541
576
|
|
|
577
|
+
1. **`findPageStartNearExpectedBoundary`** - Added `MAX_DEVIATION` check to reject matches too far from expected boundary:
|
|
578
|
+
```typescript
|
|
579
|
+
const MAX_DEVIATION = 2000;
|
|
580
|
+
if (bestDistance <= MAX_DEVIATION) {
|
|
581
|
+
return bestCandidate.pos;
|
|
582
|
+
}
|
|
583
|
+
// Continue trying shorter prefixes or return -1
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
2. **`findBreakpointWindowEndPosition`** - Changed fallback from `remainingContent.length` to `bestExpectedBoundary`:
|
|
587
|
+
```typescript
|
|
588
|
+
// Before (bug): return remainingContent.length; // Merges all remaining pages!
|
|
589
|
+
// After (fix): return Math.min(bestExpectedBoundary, remainingContent.length);
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
### Test Case Pattern for This Bug
|
|
593
|
+
|
|
594
|
+
```typescript
|
|
595
|
+
it('should correctly split pages with identical prefixes and duplicated content', () => {
|
|
596
|
+
const sharedPrefix = 'SHARED PREFIX ';
|
|
597
|
+
const filler = 'Lorem ipsum. '.repeat(200); // 2600 chars (13 × 200), beyond the 2000-char MAX_DEVIATION
|
|
598
|
+
const pages: Page[] = [
|
|
599
|
+
{ content: sharedPrefix + 'start ' + filler + sharedPrefix + 'end', id: 0 },
|
|
600
|
+
{ content: sharedPrefix + 'page1', id: 1 },
|
|
601
|
+
{ content: sharedPrefix + 'page2', id: 2 },
|
|
602
|
+
];
|
|
603
|
+
const result = segmentPages(pages, { breakpoints: [''], maxPages: 0 });
|
|
604
|
+
expect(result).toHaveLength(3); // Without fix: 2 or 1
|
|
605
|
+
});
|
|
606
|
+
```
|
|
607
|
+
|
|
608
|
+
---
|
package/dist/index.mjs
CHANGED
|
@@ -1278,7 +1278,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
|
|
|
1278
1278
|
* This is used to define breakpoint windows in terms of actual content being split, rather than
|
|
1279
1279
|
* raw per-page offsets which can desync when structural rules strip markers.
|
|
1280
1280
|
*/
|
|
1281
|
-
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
|
|
1281
|
+
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
|
|
1282
1282
|
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
|
|
1283
1283
|
if (!targetPageData) return -1;
|
|
1284
1284
|
const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
|
|
@@ -1288,13 +1288,45 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1288
1288
|
for (const len of WINDOW_PREFIX_LENGTHS) {
|
|
1289
1289
|
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
|
|
1290
1290
|
if (!prefix) continue;
|
|
1291
|
+
const candidates = [];
|
|
1291
1292
|
let pos = remainingContent.indexOf(prefix, searchStart);
|
|
1292
1293
|
while (pos !== -1 && pos <= searchEnd) {
|
|
1293
|
-
if (pos > 0
|
|
1294
|
+
if (pos > 0) {
|
|
1295
|
+
const charBefore = remainingContent[pos - 1];
|
|
1296
|
+
if (charBefore === "\n") candidates.push({
|
|
1297
|
+
isNewline: true,
|
|
1298
|
+
pos
|
|
1299
|
+
});
|
|
1300
|
+
else if (/\s/.test(charBefore)) candidates.push({
|
|
1301
|
+
isNewline: false,
|
|
1302
|
+
pos
|
|
1303
|
+
});
|
|
1304
|
+
}
|
|
1294
1305
|
pos = remainingContent.indexOf(prefix, pos + 1);
|
|
1295
1306
|
}
|
|
1296
|
-
|
|
1297
|
-
|
|
1307
|
+
if (candidates.length > 0) {
|
|
1308
|
+
const newlineCandidates = candidates.filter((c) => c.isNewline);
|
|
1309
|
+
const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
|
|
1310
|
+
let bestCandidate = pool[0];
|
|
1311
|
+
let bestDistance = Math.abs(pool[0].pos - expectedBoundary);
|
|
1312
|
+
for (let i = 1; i < pool.length; i++) {
|
|
1313
|
+
const dist = Math.abs(pool[i].pos - expectedBoundary);
|
|
1314
|
+
if (dist < bestDistance) {
|
|
1315
|
+
bestDistance = dist;
|
|
1316
|
+
bestCandidate = pool[i];
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
const MAX_DEVIATION = 2e3;
|
|
1320
|
+
if (bestDistance <= MAX_DEVIATION) return bestCandidate.pos;
|
|
1321
|
+
logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
|
|
1322
|
+
targetPageIdx,
|
|
1323
|
+
expectedBoundary,
|
|
1324
|
+
bestDistance,
|
|
1325
|
+
maxDeviation: MAX_DEVIATION,
|
|
1326
|
+
matchPos: bestCandidate.pos,
|
|
1327
|
+
prefixLength: len
|
|
1328
|
+
});
|
|
1329
|
+
}
|
|
1298
1330
|
}
|
|
1299
1331
|
return -1;
|
|
1300
1332
|
};
|
|
@@ -1314,6 +1346,7 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1314
1346
|
* @param pageIds - Array of all page IDs
|
|
1315
1347
|
* @param normalizedPages - Map of page ID to normalized content
|
|
1316
1348
|
* @param cumulativeOffsets - Cumulative character offsets (for estimates)
|
|
1349
|
+
* @param logger - Optional logger for debugging
|
|
1317
1350
|
* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
|
|
1318
1351
|
* with a sentinel boundary at segmentContent.length as the last element
|
|
1319
1352
|
*
|
|
@@ -1322,12 +1355,12 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
|
|
|
1322
1355
|
* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
|
|
1323
1356
|
* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
|
|
1324
1357
|
*/
|
|
1325
|
-
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
|
|
1358
|
+
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
|
|
1326
1359
|
const boundaryPositions = [0];
|
|
1327
1360
|
const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
|
|
1328
1361
|
for (let i = fromIdx + 1; i <= toIdx; i++) {
|
|
1329
1362
|
const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
|
|
1330
|
-
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
|
|
1363
|
+
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages, logger);
|
|
1331
1364
|
const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
|
|
1332
1365
|
if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
|
|
1333
1366
|
else {
|
|
@@ -1371,18 +1404,20 @@ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
|
|
|
1371
1404
|
* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
|
|
1372
1405
|
* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
|
|
1373
1406
|
*/
|
|
1374
|
-
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
|
|
1407
|
+
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
|
|
1375
1408
|
if (windowEndIdx >= toIdx) return remainingContent.length;
|
|
1376
1409
|
const desiredNextIdx = windowEndIdx + 1;
|
|
1377
1410
|
const minNextIdx = currentFromIdx + 1;
|
|
1378
1411
|
const maxNextIdx = Math.min(desiredNextIdx, toIdx);
|
|
1379
1412
|
const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
|
|
1413
|
+
let bestExpectedBoundary = remainingContent.length;
|
|
1380
1414
|
for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
|
|
1381
1415
|
const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
|
|
1382
|
-
|
|
1416
|
+
if (nextIdx === maxNextIdx) bestExpectedBoundary = expectedBoundary;
|
|
1417
|
+
const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
|
|
1383
1418
|
if (pos > 0) return pos;
|
|
1384
1419
|
}
|
|
1385
|
-
return remainingContent.length;
|
|
1420
|
+
return Math.min(bestExpectedBoundary, remainingContent.length);
|
|
1386
1421
|
};
|
|
1387
1422
|
/**
|
|
1388
1423
|
* Finds exclusion-based break position using raw cumulative offsets.
|
|
@@ -1460,7 +1495,8 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
|
|
|
1460
1495
|
const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
|
|
1461
1496
|
if (nextPageData) {
|
|
1462
1497
|
const pos = findNextPagePosition(remainingContent, nextPageData);
|
|
1463
|
-
|
|
1498
|
+
const tolerance = Math.max(2e3, windowEndPosition * .5);
|
|
1499
|
+
if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) return Math.min(pos, windowEndPosition, remainingContent.length);
|
|
1464
1500
|
}
|
|
1465
1501
|
}
|
|
1466
1502
|
return Math.min(windowEndPosition, remainingContent.length);
|
|
@@ -1484,14 +1520,14 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
|
|
|
1484
1520
|
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
1485
1521
|
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
1486
1522
|
if (regex === null) return {
|
|
1487
|
-
breakpointIndex: i,
|
|
1488
1523
|
breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
|
|
1524
|
+
breakpointIndex: i,
|
|
1489
1525
|
rule
|
|
1490
1526
|
};
|
|
1491
1527
|
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
1492
1528
|
if (breakPos > 0) return {
|
|
1493
|
-
breakpointIndex: i,
|
|
1494
1529
|
breakPos,
|
|
1530
|
+
breakpointIndex: i,
|
|
1495
1531
|
rule
|
|
1496
1532
|
};
|
|
1497
1533
|
}
|
|
@@ -1662,7 +1698,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1662
1698
|
let currentFromIdx = fromIdx;
|
|
1663
1699
|
let isFirstPiece = true;
|
|
1664
1700
|
let lastBreakpoint = null;
|
|
1665
|
-
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1701
|
+
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
|
|
1666
1702
|
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
1667
1703
|
boundaryPositions,
|
|
1668
1704
|
fromIdx,
|
|
@@ -1683,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1683
1719
|
break;
|
|
1684
1720
|
}
|
|
1685
1721
|
const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
|
|
1686
|
-
const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1722
|
+
const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
|
|
1687
1723
|
logger?.debug?.(`[breakpoints] iteration=${i}`, {
|
|
1688
1724
|
currentFromIdx,
|
|
1689
1725
|
cursorPos,
|