flappa-doormal 2.2.2 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +21 -6
- package/README.md +12 -3
- package/dist/index.d.mts +28 -0
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +443 -331
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -157,6 +157,26 @@ const makeDiacriticInsensitive = (text) => {
|
|
|
157
157
|
|
|
158
158
|
//#endregion
|
|
159
159
|
//#region src/segmentation/breakpoint-utils.ts
|
|
160
|
+
/**
 * Page-prefix lengths, longest first, that `findPageStartNearExpectedBoundary`
 * tries in order when looking for the start of a page inside the content being split.
 */
const WINDOW_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15];
|
|
168
|
+
/**
 * Page-prefix lengths, longest first, that `applyPageJoinerBetweenPages`
 * tries in order when looking for a page start inside a multi-page segment.
 */
const JOINER_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15, 12, 10, 8, 6];
|
|
160
180
|
/**
|
|
161
181
|
* Normalizes a breakpoint to the object form.
|
|
162
182
|
* Strings are converted to { pattern: str } with no constraints.
|
|
@@ -312,6 +332,120 @@ const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp
|
|
|
312
332
|
}
|
|
313
333
|
});
|
|
314
334
|
/**
 * Rewrites the newline that sits directly before each detected page start
 * inside a multi-page content chunk into a single space.
 *
 * Page starts are located one after another: for every page after `fromIdx`
 * (up to and including `toIdx`) progressively shorter prefixes of that page's
 * trimmed content are searched for, starting at the previously found boundary.
 * Only the one character immediately before a found page start is touched, and
 * only when it is a newline, so newlines elsewhere are left alone.
 *
 * @param content - Segment text to normalise
 * @param fromIdx - Index (into `pageIds`) of the segment's first page
 * @param toIdx - Index (into `pageIds`) of the segment's last page
 * @param pageIds - Page ids in order
 * @param normalizedPages - Map of page id → `{ content, ... }`
 * @param joiner - When `"newline"`, the content is returned untouched
 * @returns The content with page-boundary newlines replaced by spaces
 */
const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
  const nothingToDo = joiner === "newline" || fromIdx >= toIdx || !content.includes("\n");
  if (nothingToDo) {
    return content;
  }

  let text = content;
  let cursor = 0;
  for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx += 1) {
    const page = normalizedPages.get(pageIds[pageIdx]);
    if (!page) {
      continue;
    }

    const pageStart = page.content.trimStart();
    let boundary = -1;
    for (const len of JOINER_PREFIX_LENGTHS) {
      // `slice` clamps to the string length, so short pages simply yield their whole text.
      const probe = pageStart.slice(0, len).trim();
      if (!probe) {
        continue;
      }
      const hit = text.indexOf(probe, cursor);
      if (hit > 0) {
        boundary = hit;
        break;
      }
    }

    if (boundary <= 0) {
      continue;
    }
    if (text[boundary - 1] === "\n") {
      // One character is swapped for one character, so later offsets stay valid.
      text = `${text.slice(0, boundary - 1)} ${text.slice(boundary)}`;
    }
    cursor = boundary;
  }
  return text;
};
|
|
369
|
+
/**
 * Estimates how far into the current page `remainingContent` begins.
 *
 * The first 30 characters of the left-trimmed `remainingContent` are looked up
 * inside the current page's content; the index of the first occurrence is the
 * estimate. The result is 0 when the page is unknown, when there is nothing to
 * search for, or when the needle is not found (or is found at the page start).
 *
 * @param remainingContent - Text still to be split
 * @param currentFromIdx - Index (into `pageIds`) of the page the text starts on
 * @param pageIds - Page ids in order
 * @param normalizedPages - Map of page id → `{ content, ... }`
 * @returns A non-negative offset into the current page's content
 */
const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
  const NEEDLE_LENGTH = 30;
  const currentPage = normalizedPages.get(pageIds[currentFromIdx]);
  if (!currentPage) {
    return 0;
  }
  // `slice` clamps to the string length, so no explicit `Math.min` is needed.
  const needle = remainingContent.trimStart().slice(0, NEEDLE_LENGTH);
  if (!needle) {
    return 0;
  }
  // `indexOf` yields -1 when absent; both "absent" and "at start" map to 0.
  return Math.max(0, currentPage.content.indexOf(needle));
};
|
|
386
|
+
/**
 * Looks for the position where the target page begins inside `remainingContent`,
 * searching around an expected boundary position.
 *
 * For each prefix length in `WINDOW_PREFIX_LENGTHS` the trimmed page prefix is
 * searched for inside a range that reaches 10000 characters before and 2000
 * characters after the (clamped) expected boundary. An occurrence only counts
 * when it is not at position 0 and is preceded by whitespace. If no such
 * occurrence exists for that prefix, the last occurrence at or before the
 * expected boundary is accepted instead (again excluding position 0).
 *
 * @param remainingContent - Text currently being split
 * @param currentFromIdx - Unused by this function; kept for the call signature
 * @param targetPageIdx - Index (into `pageIds`) of the page whose start is wanted
 * @param expectedBoundary - Approximate position of the page start
 * @param pageIds - Page ids in order
 * @param normalizedPages - Map of page id → `{ content, ... }`
 * @returns The position of the page start, or -1 when none was found
 */
const findPageStartNearExpectedBoundary = (remainingContent, currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
  const targetPage = normalizedPages.get(pageIds[targetPageIdx]);
  if (!targetPage) {
    return -1;
  }

  const total = remainingContent.length;
  const anchor = Math.min(Math.max(0, expectedBoundary), total);
  const lowerBound = Math.max(0, anchor - 1e4);
  const upperBound = Math.min(total, anchor + 2e3);
  const pageStart = targetPage.content.trimStart();

  for (const len of WINDOW_PREFIX_LENGTHS) {
    const probe = pageStart.slice(0, len).trim();
    if (!probe) {
      continue;
    }

    for (
      let hit = remainingContent.indexOf(probe, lowerBound);
      hit !== -1 && hit <= upperBound;
      hit = remainingContent.indexOf(probe, hit + 1)
    ) {
      const precededByWhitespace = hit > 0 && /\s/.test(remainingContent[hit - 1] ?? "");
      if (precededByWhitespace) {
        return hit;
      }
    }

    const fallback = remainingContent.lastIndexOf(probe, anchor);
    if (fallback > 0) {
      return fallback;
    }
  }
  return -1;
};
|
|
413
|
+
/**
 * Determines where the current breakpoint window ends inside `remainingContent`.
 *
 * When the window already reaches the last page (`windowEndIdx >= toIdx`) the
 * whole remaining text is the window. Otherwise the start of the page after
 * `windowEndIdx` is searched for in the text; if it cannot be found, earlier
 * pages are tried in turn, down to the page right after `currentFromIdx`. The
 * expected position handed to the search is derived from `cumulativeOffsets`,
 * reduced by the estimated offset at which `remainingContent` starts within the
 * current page. If no page start is found the full length is returned.
 *
 * @param remainingContent - Text currently being split
 * @param currentFromIdx - Index of the page the text starts on
 * @param windowEndIdx - Index of the last page allowed in the window
 * @param toIdx - Index of the segment's last page
 * @param pageIds - Page ids in order
 * @param normalizedPages - Map of page id → page data
 * @param cumulativeOffsets - Running start offsets per page index
 * @returns End position of the window within `remainingContent`
 */
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
  if (windowEndIdx >= toIdx) {
    return remainingContent.length;
  }

  const firstCandidate = Math.min(windowEndIdx + 1, toIdx);
  const lastCandidate = currentFromIdx + 1;
  const startOffset = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);

  for (let candidate = firstCandidate; candidate >= lastCandidate; candidate -= 1) {
    let expected = remainingContent.length;
    if (cumulativeOffsets[candidate] !== undefined && cumulativeOffsets[currentFromIdx] !== undefined) {
      expected = Math.max(0, cumulativeOffsets[candidate] - cumulativeOffsets[currentFromIdx] - startOffset);
    }
    const position = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, candidate, expected, pageIds, normalizedPages);
    if (position > 0) {
      return position;
    }
  }
  return remainingContent.length;
};
|
|
433
|
+
/**
 * Computes a break position from raw cumulative offsets so that a page listed
 * in any breakpoint's `excludeSet` ends up on a segment boundary.
 *
 * - If the starting page itself is excluded and more pages follow, the break is
 *   placed at the start of the next page.
 * - Otherwise the break is placed at the start of the first excluded page found
 *   after the starting page, up to and including `windowEndIdx`.
 *
 * The returned value is the difference between two entries of
 * `cumulativeOffsets`, i.e. relative to the start of the `currentFromIdx` page.
 *
 * @returns The break position, or -1 when no excluded page is involved
 */
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
  const isExcluded = (pageId) => expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId));
  const offsetFromStart = (idx) => cumulativeOffsets[idx] - cumulativeOffsets[currentFromIdx];

  if (isExcluded(pageIds[currentFromIdx]) && currentFromIdx < toIdx) {
    return offsetFromStart(currentFromIdx + 1);
  }

  let idx = currentFromIdx + 1;
  while (idx <= windowEndIdx) {
    if (isExcluded(pageIds[idx])) {
      return offsetFromStart(idx);
    }
    idx += 1;
  }
  return -1;
};
|
|
448
|
+
/**
|
|
315
449
|
* Finds the actual ending page index by searching backwards for page content prefix.
|
|
316
450
|
* Used to determine which page a segment actually ends on based on content matching.
|
|
317
451
|
*
|
|
@@ -424,8 +558,8 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
|
|
|
424
558
|
* @param ctx - Breakpoint context with page data and patterns
|
|
425
559
|
* @returns Break position in the content, or -1 if no break found
|
|
426
560
|
*/
|
|
427
|
-
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, ctx) => {
|
|
428
|
-
const { pageIds, normalizedPages,
|
|
561
|
+
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
|
|
562
|
+
const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
|
|
429
563
|
for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
|
|
430
564
|
if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
|
|
431
565
|
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
@@ -436,18 +570,162 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
|
|
|
436
570
|
const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
|
|
437
571
|
if (nextPageData) {
|
|
438
572
|
const pos = findNextPagePosition(remainingContent, nextPageData);
|
|
439
|
-
if (pos > 0) return pos;
|
|
573
|
+
if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
|
|
440
574
|
}
|
|
441
575
|
}
|
|
442
|
-
return Math.min(
|
|
576
|
+
return Math.min(windowEndPosition, remainingContent.length);
|
|
443
577
|
}
|
|
444
|
-
const
|
|
445
|
-
const breakPos = findPatternBreakPosition(remainingContent.slice(0, windowEndPosition), regex, prefer);
|
|
578
|
+
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
446
579
|
if (breakPos > 0) return breakPos;
|
|
447
580
|
}
|
|
448
581
|
return -1;
|
|
449
582
|
};
|
|
450
583
|
|
|
584
|
+
//#endregion
|
|
585
|
+
//#region src/segmentation/breakpoint-processor.ts
|
|
586
|
+
/**
|
|
587
|
+
* Breakpoint post-processing engine extracted from segmenter.ts.
|
|
588
|
+
*
|
|
589
|
+
* This module is intentionally split into small helpers to reduce cognitive complexity
|
|
590
|
+
* and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
|
|
591
|
+
*/
|
|
592
|
+
/**
 * Builds a lookup from page id to its position in `pageIds`.
 * When an id occurs more than once, the last position is kept.
 */
const buildPageIdToIndexMap = (pageIds) => {
  const indexById = new Map();
  pageIds.forEach((id, position) => {
    indexById.set(id, position);
  });
  return indexById;
};
|
|
593
|
+
/**
 * Builds a map from page id to `{ content, index, length }`, pairing each page
 * with the entry at the same position in `normalizedContent`.
 */
const buildNormalizedPagesMap = (pages, normalizedContent) => {
  const byId = new Map();
  for (const [index, page] of pages.entries()) {
    const content = normalizedContent[index];
    byId.set(page.id, { content, index, length: content.length });
  }
  return byId;
};
|
|
605
|
+
/**
 * Builds running offsets for the pages in `pageIds`.
 *
 * The array starts with 0 and gets one entry per page: the running total of
 * page lengths so far, plus one extra unit after every page except the last.
 * Pages missing from `normalizedPages` contribute a length of 0.
 */
const buildCumulativeOffsets = (pageIds, normalizedPages) => {
  const lastIdx = pageIds.length - 1;
  const offsets = [0];
  let running = 0;
  for (let idx = 0; idx <= lastIdx; idx += 1) {
    const page = normalizedPages.get(pageIds[idx]);
    running += page ? page.length : 0;
    if (idx < lastIdx) {
      running += 1;
    }
    offsets.push(running);
  }
  return offsets;
};
|
|
616
|
+
/**
 * Reports whether `hasExcludedPageInRange` is true for at least one
 * breakpoint's `excludeSet` over the page index range `fromIdx`..`toIdx`.
 */
const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => {
  const excludesSomething = (bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx);
  return expandedBreakpoints.some(excludesSomething);
};
|
|
617
|
+
/**
 * Finds the last page index, starting at `currentFromIdx` and not going beyond
 * `toIdx`, whose page id is at most `maxPages` above the starting page's id.
 * Scanning stops at the first page id that exceeds that limit.
 */
const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
  const limit = pageIds[currentFromIdx] + maxPages;
  let endIdx = currentFromIdx;
  let idx = currentFromIdx;
  while (idx <= toIdx && pageIds[idx] <= limit) {
    endIdx = idx;
    idx += 1;
  }
  return endIdx;
};
|
|
624
|
+
/**
 * Returns the difference between the page id at `toIdx` and the page id at
 * `currentFromIdx`.
 */
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => {
  const firstPageId = pageIds[currentFromIdx];
  const lastPageId = pageIds[toIdx];
  return lastPageId - firstPageId;
};
|
|
625
|
+
/**
 * Creates the closing segment for whatever content is left, via `createSegment`.
 * The end page id is only passed when the range covers more than one index, and
 * `meta` is only passed when `includeMeta` is truthy.
 */
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => {
  const firstPageId = pageIds[currentFromIdx];
  const lastPageId = currentFromIdx !== toIdx ? pageIds[toIdx] : undefined;
  const segmentMeta = includeMeta ? meta : undefined;
  return createSegment(remainingContent, firstPageId, lastPageId, segmentMeta);
};
|
|
626
|
+
/**
 * Resolves the page indices a piece of content starts and ends on.
 *
 * For non-empty content the start comes from `findActualStartPage` and the end
 * from `findActualEndPage` (searched from that start up to `windowEndIdx`).
 * For empty content both indices are `currentFromIdx`.
 *
 * @returns `{ actualEndIdx, actualStartIdx }`
 */
const computePiecePages = (pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages) => {
  if (!pieceContent) {
    return { actualEndIdx: currentFromIdx, actualStartIdx: currentFromIdx };
  }
  const actualStartIdx = findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages);
  const actualEndIdx = findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages);
  return { actualEndIdx, actualStartIdx };
};
|
|
633
|
+
/**
 * Decides which page index the next piece starts on.
 *
 * The result is `actualEndIdx + 1` when that index is within `toIdx`, the page
 * is known, and `remainingContent` starts with the first (up to) 30 characters
 * of that page's content. In every other case the result is `actualEndIdx`.
 */
const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
  const candidateIdx = actualEndIdx + 1;
  if (!remainingContent || !(candidateIdx <= toIdx)) {
    return actualEndIdx;
  }

  const candidatePage = normalizedPages.get(pageIds[candidateIdx]);
  if (!candidatePage) {
    return actualEndIdx;
  }

  const candidatePrefix = candidatePage.content.slice(0, Math.min(30, candidatePage.length));
  const startsOnCandidate = candidatePrefix && remainingContent.startsWith(candidatePrefix);
  return startsOnCandidate ? candidateIdx : actualEndIdx;
};
|
|
644
|
+
/**
 * Creates a segment for one split-off piece via `createSegment`.
 * The end page id is only passed when `actualEndIdx` is greater than
 * `actualStartIdx`, and `meta` is only passed when `includeMeta` is truthy.
 */
const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => {
  const firstPageId = pageIds[actualStartIdx];
  const lastPageId = actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : undefined;
  const segmentMeta = includeMeta ? meta : undefined;
  return createSegment(pieceContent, firstPageId, lastPageId, segmentMeta);
};
|
|
645
|
+
/**
 * Splits one segment into pieces by repeatedly cutting content off the front.
 *
 * Each pass:
 * 1. If the remaining page span is within `maxPages` and no excluded pages are
 *    left, the rest becomes the final segment and processing stops.
 * 2. Otherwise a window (page index + text position) is computed, and a cut
 *    position is chosen: first from exclusions (only when the window contains
 *    excluded pages), then from `findBreakPosition`, and finally the window end.
 * 3. The text before the cut becomes a piece (if non-empty after trimming); the
 *    text after it is carried into the next pass.
 *
 * `segment.meta` is only attached to a segment produced during the first pass.
 * A hard cap of 10000 passes guards against a loop that never terminates; when
 * it is hit an error is logged and the pieces collected so far are returned.
 *
 * @returns The pieces in order
 */
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
  const ITERATION_LIMIT = 1e4;
  const pieces = [];
  let rest = segment.content;
  let cursorIdx = fromIdx;
  let firstPiece = true;

  for (let pass = 1; cursorIdx <= toIdx; pass += 1) {
    if (pass > ITERATION_LIMIT) {
      logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: ITERATION_LIMIT });
      break;
    }

    const restHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, cursorIdx, toIdx);
    const restFits = computeRemainingSpan(cursorIdx, toIdx, pageIds) <= maxPages;
    if (restFits && !restHasExclusions) {
      const tail = createFinalSegment(rest, cursorIdx, toIdx, pageIds, segment.meta, firstPiece);
      if (tail) {
        pieces.push(tail);
      }
      break;
    }

    const windowEndIdx = computeWindowEndIdx(cursorIdx, toIdx, pageIds, maxPages);
    const windowEndPosition = findBreakpointWindowEndPosition(rest, cursorIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
    const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, cursorIdx, windowEndIdx);

    // Cut position, in order of precedence: exclusion boundary, breakpoint match, window end.
    let cut = windowHasExclusions
      ? findExclusionBreakPosition(cursorIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets)
      : -1;
    if (cut <= 0) {
      cut = findBreakPosition(rest, cursorIdx, toIdx, windowEndIdx, windowEndPosition, {
        expandedBreakpoints,
        normalizedPages,
        pageIds,
        prefer,
      });
    }
    if (cut <= 0) {
      cut = windowEndPosition;
    }

    const piece = rest.slice(0, cut).trim();
    const { actualEndIdx, actualStartIdx } = computePiecePages(piece, cursorIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
    if (piece) {
      const pieceSegment = createPieceSegment(piece, actualStartIdx, actualEndIdx, pageIds, segment.meta, firstPiece);
      if (pieceSegment) {
        pieces.push(pieceSegment);
      }
    }

    rest = rest.slice(cut).trim();
    if (!rest) {
      break;
    }
    cursorIdx = computeNextFromIdx(rest, actualEndIdx, toIdx, pageIds, normalizedPages);
    firstPiece = false;
  }
  return pieces;
};
|
|
689
|
+
/**
 * Runs breakpoint processing over a list of segments.
 *
 * Segments whose page-id span is within `maxPages` and that touch no excluded
 * pages are passed through unchanged. Every other segment is handed to
 * `processOversizedSegment`; each resulting piece that spans more than one page
 * index then gets its content passed through `applyPageJoinerBetweenPages`.
 *
 * Note: This is an internal engine used by `segmentPages()`.
 *
 * @param pageJoiner - Joiner forwarded to `applyPageJoinerBetweenPages` (default `"space"`)
 * @returns The resulting segments in order
 */
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
  const pageIds = pages.map((page) => page.id);
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
  const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
  const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);

  // Maps a segment's `from`/`to` page ids to indices; an unknown `from` becomes -1,
  // a missing or unknown `to` falls back to the start index.
  const resolveIndexRange = ({ from, to }) => {
    const startIdx = pageIdToIndex.get(from) ?? -1;
    const endIdx = to === undefined ? startIdx : pageIdToIndex.get(to) ?? startIdx;
    return [startIdx, endIdx];
  };

  const output = [];
  logger?.info?.("Starting breakpoint processing", {
    maxPages,
    segmentCount: segments.length,
  });

  for (const segment of segments) {
    const [fromIdx, toIdx] = resolveIndexRange(segment);
    const pageSpan = (segment.to ?? segment.from) - segment.from;
    const touchesExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
    if (pageSpan <= maxPages && !touchesExclusions) {
      output.push(segment);
      continue;
    }

    const pieces = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
    for (const piece of pieces) {
      const [pieceFromIdx, pieceToIdx] = resolveIndexRange(piece);
      const crossesPages = pieceFromIdx >= 0 && pieceToIdx > pieceFromIdx;
      if (crossesPages) {
        output.push({
          ...piece,
          content: applyPageJoinerBetweenPages(piece.content, pieceFromIdx, pieceToIdx, pageIds, normalizedPages, pageJoiner),
        });
      } else {
        output.push(piece);
      }
    }
  }

  logger?.info?.("Breakpoint processing completed", { resultCount: output.length });
  return output;
};
|
|
728
|
+
|
|
451
729
|
//#endregion
|
|
452
730
|
//#region src/segmentation/match-utils.ts
|
|
453
731
|
/**
|
|
@@ -613,28 +891,6 @@ const anyRuleAllowsId = (rules, pageId) => {
|
|
|
613
891
|
});
|
|
614
892
|
};
|
|
615
893
|
|
|
616
|
-
//#endregion
|
|
617
|
-
//#region src/segmentation/textUtils.ts
|
|
618
|
-
/**
|
|
619
|
-
* Strip all HTML tags from content, keeping only text.
|
|
620
|
-
*
|
|
621
|
-
* @param html - HTML content
|
|
622
|
-
* @returns Plain text content
|
|
623
|
-
*/
|
|
624
|
-
const stripHtmlTags = (html) => {
|
|
625
|
-
return html.replace(/<[^>]*>/g, "");
|
|
626
|
-
};
|
|
627
|
-
/**
|
|
628
|
-
* Normalizes line endings to Unix-style (`\n`).
|
|
629
|
-
*
|
|
630
|
-
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
|
|
631
|
-
* for consistent pattern matching across platforms.
|
|
632
|
-
*
|
|
633
|
-
* @param content - Raw content with potentially mixed line endings
|
|
634
|
-
* @returns Content with all line endings normalized to `\n`
|
|
635
|
-
*/
|
|
636
|
-
const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
|
|
637
|
-
|
|
638
894
|
//#endregion
|
|
639
895
|
//#region src/segmentation/tokens.ts
|
|
640
896
|
/**
|
|
@@ -1004,15 +1260,12 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
|
1004
1260
|
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
1005
1261
|
|
|
1006
1262
|
//#endregion
|
|
1007
|
-
//#region src/segmentation/
|
|
1263
|
+
//#region src/segmentation/rule-regex.ts
|
|
1008
1264
|
/**
|
|
1009
|
-
*
|
|
1265
|
+
* Split rule → compiled regex builder.
|
|
1010
1266
|
*
|
|
1011
|
-
*
|
|
1012
|
-
*
|
|
1013
|
-
* and optional metadata.
|
|
1014
|
-
*
|
|
1015
|
-
* @module segmenter
|
|
1267
|
+
* Extracted from `segmenter.ts` to reduce cognitive complexity and enable
|
|
1268
|
+
* independent unit testing of regex compilation and token expansion behavior.
|
|
1016
1269
|
*/
|
|
1017
1270
|
/**
|
|
1018
1271
|
* Checks if a regex pattern contains standard (anonymous) capturing groups.
|
|
@@ -1023,35 +1276,26 @@ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
|
1023
1276
|
* - Lookbehind assertions `(?<=...)` and `(?<!...)`
|
|
1024
1277
|
* - Named groups `(?<name>...)` (start with `(?` so excluded here)
|
|
1025
1278
|
*
|
|
1026
|
-
*
|
|
1027
|
-
* excluded by this check because they are tracked separately via the
|
|
1028
|
-
* `captureNames` array from token expansion. This function only detects
|
|
1029
|
-
* anonymous capturing groups like `(.*)`.
|
|
1030
|
-
*
|
|
1031
|
-
* @param pattern - Regex pattern string to analyze
|
|
1032
|
-
* @returns `true` if the pattern contains at least one anonymous capturing group
|
|
1279
|
+
* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
|
|
1033
1280
|
*/
|
|
1034
1281
|
const hasCapturingGroup = (pattern) => {
  // Scan the source instead of using a single regex test so that an escaped
  // parenthesis (`\(`) or a parenthesis inside a character class (`[(]`) is
  // not mistaken for a capturing group.
  let inCharClass = false;
  for (let i = 0; i < pattern.length; i += 1) {
    const ch = pattern[i];
    if (ch === "\\") {
      // Skip the escaped character entirely.
      i += 1;
      continue;
    }
    if (inCharClass) {
      if (ch === "]") {
        inCharClass = false;
      }
      continue;
    }
    if (ch === "[") {
      inCharClass = true;
      continue;
    }
    // `(?...` opens a non-capturing group, an assertion, or a named group.
    if (ch === "(" && pattern[i + 1] !== "?") {
      return true;
    }
  }
  return false;
};
|
|
1037
1284
|
/**
 * Compiles a regex source string with the `g`, `m` and `u` flags.
 *
 * If the `RegExp` constructor rejects the pattern, a new `Error` is thrown whose
 * message contains the offending pattern and the underlying message; the
 * original error is attached as `cause` so its stack is not lost.
 *
 * This only guards against construction failures; it performs no analysis of
 * how the pattern behaves at match time.
 *
 * @param pattern - Regex source to compile
 * @returns The compiled regular expression
 * @throws {Error} When the pattern cannot be compiled
 */
const compileRuleRegex = (pattern) => {
  try {
    return new RegExp(pattern, "gmu");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`, { cause: error });
  }
};
|
|
1295
|
+
/**
|
|
1038
1296
|
* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
|
|
1039
1297
|
*
|
|
1040
|
-
*
|
|
1041
|
-
* transform is applied to token patterns BEFORE wrapping with capture groups,
|
|
1042
|
-
* ensuring regex metacharacters (`(`, `)`, `|`, etc.) are not corrupted.
|
|
1043
|
-
*
|
|
1044
|
-
* @param pattern - Pattern string potentially containing `{{token}}` placeholders
|
|
1045
|
-
* @param fuzzy - Whether to apply diacritic-insensitive transformation
|
|
1046
|
-
* @returns Processed pattern with expanded tokens and capture names
|
|
1047
|
-
*
|
|
1048
|
-
* @example
|
|
1049
|
-
* processPattern('{{raqms:num}} {{dash}}', false)
|
|
1050
|
-
* // → { pattern: '(?<num>[٠-٩]+) [-–—ـ]', captureNames: ['num'] }
|
|
1051
|
-
*
|
|
1052
|
-
* @example
|
|
1053
|
-
* processPattern('{{naql}}', true)
|
|
1054
|
-
* // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
|
|
1298
|
+
* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
|
|
1055
1299
|
*/
|
|
1056
1300
|
const processPattern = (pattern, fuzzy) => {
|
|
1057
1301
|
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
|
|
@@ -1060,77 +1304,115 @@ const processPattern = (pattern, fuzzy) => {
|
|
|
1060
1304
|
pattern: expanded
|
|
1061
1305
|
};
|
|
1062
1306
|
};
|
|
1307
|
+
/**
 * Runs every pattern through `processPattern` and combines the results.
 * Shared by the three line-anchored regex source builders below.
 *
 * @returns `{ captureNames, union }` where `union` is the processed patterns
 *   joined with `|` and `captureNames` is the concatenation of all capture names
 */
const buildProcessedPatternUnion = (patterns, fuzzy) => {
  const processed = patterns.map((p) => processPattern(p, fuzzy));
  return {
    captureNames: processed.flatMap((p) => p.captureNames),
    union: processed.map((p) => p.pattern).join("|"),
  };
};
/**
 * Builds the regex source for `lineStartsAfter` rules: `^(?:a|b|...)(.*)`.
 * The trailing `(.*)` group captures the rest of the line after the marker.
 */
const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
  const { captureNames, union } = buildProcessedPatternUnion(patterns, fuzzy);
  return {
    captureNames,
    regex: `^(?:${union})(.*)`,
  };
};
/**
 * Builds the regex source for `lineStartsWith` rules: `^(?:a|b|...)`.
 */
const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
  const { captureNames, union } = buildProcessedPatternUnion(patterns, fuzzy);
  return {
    captureNames,
    regex: `^(?:${union})`,
  };
};
/**
 * Builds the regex source for `lineEndsWith` rules: `(?:a|b|...)$`.
 */
const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
  const { captureNames, union } = buildProcessedPatternUnion(patterns, fuzzy);
  return {
    captureNames,
    regex: `(?:${union})$`,
  };
};
|
|
1331
|
+
/**
 * Builds the regex source for `template` rules by passing the template through
 * `escapeTemplateBrackets` and then `expandTokensWithCaptures`.
 *
 * @returns `{ captureNames, regex }` taken from the expansion result
 */
const buildTemplateRegexSource = (template) => {
  const escaped = escapeTemplateBrackets(template);
  const expansion = expandTokensWithCaptures(escaped);
  return {
    captureNames: expansion.captureNames,
    regex: expansion.pattern,
  };
};
|
|
1338
|
+
/**
 * A rule uses captures when its regex source contains a capturing group
 * (per `hasCapturingGroup`) or when any capture names were collected.
 */
const determineUsesCapture = (regexSource, captureNames) => {
  return hasCapturingGroup(regexSource) || captureNames.length > 0;
};
|
|
1063
1339
|
/**
 * Builds a compiled regex plus capture metadata from a split rule.
 *
 * Pattern sources are resolved in this order:
 * - `lineStartsAfter` (non-empty): returns immediately with `usesCapture` and
 *   `usesLineStartsAfter` both `true`.
 * - `lineStartsWith`, `lineEndsWith`, `template`: each one that is present
 *   overwrites the regex source set before it; `template` appends its capture
 *   names to those already collected instead of replacing them.
 * - `regex`: used as given when none of the above replaced it.
 *
 * @param rule - Split rule; `rule.fuzzy` (default `false`) is forwarded to the line-based builders
 * @returns `{ captureNames, regex, usesCapture, usesLineStartsAfter }`
 * @throws {Error} When no regex source could be determined from the rule
 */
const buildRuleRegex = (rule) => {
  const spec = { ...rule };
  const fuzzy = rule.fuzzy ?? false;

  if (spec.lineStartsAfter?.length) {
    const source = buildLineStartsAfterRegexSource(spec.lineStartsAfter, fuzzy);
    return {
      captureNames: source.captureNames,
      regex: compileRuleRegex(source.regex),
      usesCapture: true,
      usesLineStartsAfter: true,
    };
  }

  let collectedNames = [];
  if (spec.lineStartsWith?.length) {
    const source = buildLineStartsWithRegexSource(spec.lineStartsWith, fuzzy);
    spec.regex = source.regex;
    collectedNames = source.captureNames;
  }
  if (spec.lineEndsWith?.length) {
    const source = buildLineEndsWithRegexSource(spec.lineEndsWith, fuzzy);
    spec.regex = source.regex;
    collectedNames = source.captureNames;
  }
  if (spec.template) {
    const source = buildTemplateRegexSource(spec.template);
    spec.regex = source.regex;
    collectedNames = [...collectedNames, ...source.captureNames];
  }

  if (!spec.regex) {
    throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
  }

  const usesCapture = determineUsesCapture(spec.regex, collectedNames);
  return {
    captureNames: collectedNames,
    regex: compileRuleRegex(spec.regex),
    usesCapture,
    usesLineStartsAfter: false,
  };
};
|
|
1382
|
+
|
|
1383
|
+
//#endregion
|
|
1384
|
+
//#region src/segmentation/textUtils.ts
|
|
1385
|
+
/**
 * Removes everything that looks like an HTML tag — a `<`, any run of
 * characters other than `>`, then a `>` — and keeps the remaining text.
 *
 * @param html - HTML content
 * @returns The content with all such tags removed
 */
const stripHtmlTags = (html) => html.replace(/<[^>]*>/g, "");
|
|
1394
|
+
/**
 * Converts `\r\n` and lone `\r` line endings to `\n`.
 *
 * Content without any `\r` is returned as-is without running the replacement.
 *
 * @param content - Text with possibly mixed line endings
 * @returns The text with every line ending normalized to `\n`
 */
const normalizeLineEndings = (content) => {
  if (!content.includes("\r")) {
    return content;
  }
  return content.replace(/\r\n?/g, "\n");
};
|
|
1404
|
+
|
|
1405
|
+
//#endregion
|
|
1406
|
+
//#region src/segmentation/segmenter.ts
|
|
1407
|
+
/**
|
|
1408
|
+
* Core segmentation engine for splitting Arabic text pages into logical segments.
|
|
1409
|
+
*
|
|
1410
|
+
* The segmenter takes an array of pages and applies pattern-based rules to
|
|
1411
|
+
* identify split points, producing segments with content, page references,
|
|
1412
|
+
* and optional metadata.
|
|
1413
|
+
*
|
|
1414
|
+
* @module segmenter
|
|
1415
|
+
*/
|
|
1134
1416
|
/**
|
|
1135
1417
|
* Builds a concatenated content string and page mapping from input pages.
|
|
1136
1418
|
*
|
|
@@ -1200,6 +1482,62 @@ const buildPageMap = (pages) => {
|
|
|
1200
1482
|
};
|
|
1201
1483
|
};
|
|
1202
1484
|
/**
 * Deduplicates split points by `index`, keeping the most informative one, and
 * returns the survivors sorted by ascending `index`.
 *
 * Preference rules when two points share an index, in priority order:
 * 1. A point with `contentStartOffset` beats one without it (the offset is
 *    needed for `lineStartsAfter` marker stripping), regardless of `meta`.
 * 2. If both or neither have `contentStartOffset`, a point with `meta` beats
 *    one without.
 * 3. Otherwise the point seen first is kept.
 *
 * @param splitPoints - Split points, possibly with repeated indices
 * @returns A new array with one split point per index, sorted by index
 */
const dedupeSplitPoints = (splitPoints) => {
  const isMoreInformative = (candidate, existing) => {
    const candidateHasOffset = candidate.contentStartOffset !== undefined;
    const existingHasOffset = existing.contentStartOffset !== undefined;
    // The offset decides first, so a meta-only point can never displace a
    // point that carries `contentStartOffset`.
    if (candidateHasOffset !== existingHasOffset) {
      return candidateHasOffset;
    }
    return candidate.meta !== undefined && existing.meta === undefined;
  };

  const byIndex = new Map();
  for (const point of splitPoints) {
    const existing = byIndex.get(point.index);
    if (!existing || isMoreInformative(point, existing)) {
      byIndex.set(point.index, point);
    }
  }
  return [...byIndex.values()].sort((a, b) => a.index - b.index);
};
|
|
1505
|
+
/**
 * Supplies a single catch-all segment when the structural rules yielded none,
 * so that breakpoint processing still has input to work on.
 *
 * The fallback joins every page's normalized content with a newline when
 * `pageJoiner` is `"newline"` and with a single space for any other value,
 * then trims the result.
 *
 * @param segments - Segments produced so far
 * @param pages - Input pages; only the first and last `id` are read
 * @param normalizedContent - Per-page content strings, in page order
 * @param pageJoiner - `"newline"` to join pages with `\n`; anything else joins with a space
 * @returns `segments` unchanged when it is non-empty, when there are no pages, or when the
 *          joined content is empty after trimming; otherwise a one-element array holding the fallback
 */
const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
	if (segments.length > 0) return segments;
	if (pages.length === 0) return segments;
	const separator = pageJoiner === "newline" ? "\n" : " ";
	const mergedContent = normalizedContent.join(separator).trim();
	if (!mergedContent) return segments;
	const { id: firstId } = pages[0];
	const { id: lastId } = pages[pages.length - 1];
	const fallback = {
		content: mergedContent,
		from: firstId
	};
	// `to` is only set when the segment actually spans more than one page id.
	if (lastId !== firstId) fallback.to = lastId;
	return [fallback];
};
|
|
1523
|
+
/**
 * Gathers the split points produced by every rule into one flat array.
 *
 * For each rule: compile it with `buildRuleRegex`, run it over `matchContent` with
 * `findMatches`, narrow the matches with `filterByConstraints` and then
 * `filterByOccurrence`, and convert each surviving match into a split point.
 *
 * @param rules - Rules to evaluate, in order
 * @param matchContent - Content string the rule regexes are run against
 * @param pageMap - Page map; its `getId` is handed to `filterByConstraints`
 * @returns Split points of all rules, concatenated in rule order (not deduplicated or sorted)
 */
const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
	return rules.flatMap((rule) => {
		const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
		const allMatches = findMatches(matchContent, regex, usesCapture, captureNames);
		const constrained = filterByConstraints(allMatches, rule, pageMap.getId);
		const selected = filterByOccurrence(constrained, rule.occurrence);
		return selected.map((m) => {
			// Marker stripping only applies when the rule uses lineStartsAfter AND something was captured.
			const stripsMarker = usesLineStartsAfter && m.captured !== void 0;
			let capturedContent = m.captured;
			let contentStartOffset = void 0;
			if (stripsMarker) {
				// Offset = match length minus captured length.
				capturedContent = void 0;
				contentStartOffset = m.end - m.captured.length - m.start;
			}
			// `split` defaults to "at": split at the match start; any other value splits at the match end.
			const splitsAtStart = (rule.split ?? "at") === "at";
			return {
				capturedContent,
				contentStartOffset,
				index: splitsAtStart ? m.start : m.end,
				meta: rule.meta,
				namedCaptures: m.namedCaptures
			};
		});
	});
};
|
|
1540
|
+
/**
|
|
1203
1541
|
* Executes a regex against content and extracts match results with capture information.
|
|
1204
1542
|
*
|
|
1205
1543
|
* @param content - Full content string to search
|
|
@@ -1282,202 +1620,6 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
1282
1620
|
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
1283
1621
|
* @returns Processed segments with oversized ones broken up
|
|
1284
1622
|
*/
|
|
1285
|
-
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger) => {
|
|
1286
|
-
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
|
|
1287
|
-
const startingPageId = pageIds$1[currentFromIdx];
|
|
1288
|
-
if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
|
|
1289
|
-
for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
|
|
1290
|
-
const pageId = pageIds$1[pageIdx];
|
|
1291
|
-
if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets$1[pageIdx] - cumulativeOffsets$1[currentFromIdx];
|
|
1292
|
-
}
|
|
1293
|
-
return -1;
|
|
1294
|
-
};
|
|
1295
|
-
const pageIds = pages.map((p) => p.id);
|
|
1296
|
-
const pageIdToIndex = new Map(pageIds.map((id, i) => [id, i]));
|
|
1297
|
-
const normalizedPages = /* @__PURE__ */ new Map();
|
|
1298
|
-
for (let i = 0; i < pages.length; i++) {
|
|
1299
|
-
const content = normalizedContent[i];
|
|
1300
|
-
normalizedPages.set(pages[i].id, {
|
|
1301
|
-
content,
|
|
1302
|
-
index: i,
|
|
1303
|
-
length: content.length
|
|
1304
|
-
});
|
|
1305
|
-
}
|
|
1306
|
-
const cumulativeOffsets = [0];
|
|
1307
|
-
let totalOffset = 0;
|
|
1308
|
-
for (let i = 0; i < pageIds.length; i++) {
|
|
1309
|
-
const pageData = normalizedPages.get(pageIds[i]);
|
|
1310
|
-
totalOffset += pageData ? pageData.length : 0;
|
|
1311
|
-
if (i < pageIds.length - 1) totalOffset += 1;
|
|
1312
|
-
cumulativeOffsets.push(totalOffset);
|
|
1313
|
-
}
|
|
1314
|
-
const patternProcessor = (p) => processPattern(p, false).pattern;
|
|
1315
|
-
const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
|
|
1316
|
-
const result = [];
|
|
1317
|
-
logger?.info?.("Starting breakpoint processing", {
|
|
1318
|
-
maxPages,
|
|
1319
|
-
segmentCount: segments.length
|
|
1320
|
-
});
|
|
1321
|
-
for (const segment of segments) {
|
|
1322
|
-
const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
|
|
1323
|
-
const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
|
|
1324
|
-
logger?.debug?.("Processing segment", {
|
|
1325
|
-
contentLength: segment.content.length,
|
|
1326
|
-
contentPreview: segment.content.slice(0, 100),
|
|
1327
|
-
from: segment.from,
|
|
1328
|
-
fromIdx,
|
|
1329
|
-
to: segment.to,
|
|
1330
|
-
toIdx
|
|
1331
|
-
});
|
|
1332
|
-
const segmentSpan = (segment.to ?? segment.from) - segment.from;
|
|
1333
|
-
const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
|
|
1334
|
-
if (segmentSpan <= maxPages && !hasExclusions) {
|
|
1335
|
-
logger?.trace?.("Segment within limit, keeping as-is");
|
|
1336
|
-
result.push(segment);
|
|
1337
|
-
continue;
|
|
1338
|
-
}
|
|
1339
|
-
logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
|
|
1340
|
-
let remainingContent = segment.content;
|
|
1341
|
-
let currentFromIdx = fromIdx;
|
|
1342
|
-
let isFirstPiece = true;
|
|
1343
|
-
let iterationCount = 0;
|
|
1344
|
-
const maxIterations = 1e4;
|
|
1345
|
-
while (currentFromIdx <= toIdx) {
|
|
1346
|
-
iterationCount++;
|
|
1347
|
-
if (iterationCount > maxIterations) {
|
|
1348
|
-
logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
|
|
1349
|
-
logger?.error?.("Loop state", {
|
|
1350
|
-
currentFromIdx,
|
|
1351
|
-
remainingContentLength: remainingContent.length,
|
|
1352
|
-
toIdx
|
|
1353
|
-
});
|
|
1354
|
-
break;
|
|
1355
|
-
}
|
|
1356
|
-
const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
|
|
1357
|
-
logger?.trace?.("Loop iteration", {
|
|
1358
|
-
currentFromIdx,
|
|
1359
|
-
currentPageId: pageIds[currentFromIdx],
|
|
1360
|
-
iterationCount,
|
|
1361
|
-
remainingContentLength: remainingContent.length,
|
|
1362
|
-
remainingContentPreview: remainingContent.slice(0, 80),
|
|
1363
|
-
remainingSpan,
|
|
1364
|
-
toIdx,
|
|
1365
|
-
toPageId: pageIds[toIdx]
|
|
1366
|
-
});
|
|
1367
|
-
const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
|
|
1368
|
-
if (remainingSpan <= maxPages && !remainingHasExclusions) {
|
|
1369
|
-
logger?.debug?.("Remaining span within limit, outputting final segment");
|
|
1370
|
-
const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
|
|
1371
|
-
if (finalSeg) result.push(finalSeg);
|
|
1372
|
-
break;
|
|
1373
|
-
}
|
|
1374
|
-
const currentPageId = pageIds[currentFromIdx];
|
|
1375
|
-
const maxWindowPageId = currentPageId + maxPages;
|
|
1376
|
-
let windowEndIdx = currentFromIdx;
|
|
1377
|
-
for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
|
|
1378
|
-
else break;
|
|
1379
|
-
logger?.trace?.("Window calculation", {
|
|
1380
|
-
currentPageId,
|
|
1381
|
-
maxWindowPageId,
|
|
1382
|
-
windowEndIdx,
|
|
1383
|
-
windowEndPageId: pageIds[windowEndIdx]
|
|
1384
|
-
});
|
|
1385
|
-
const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
|
|
1386
|
-
let breakPosition = -1;
|
|
1387
|
-
if (windowHasExclusions) {
|
|
1388
|
-
logger?.trace?.("Window has exclusions, finding exclusion break position");
|
|
1389
|
-
breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
|
|
1390
|
-
logger?.trace?.("Exclusion break position", { breakPosition });
|
|
1391
|
-
}
|
|
1392
|
-
if (breakPosition <= 0) {
|
|
1393
|
-
const breakpointCtx = {
|
|
1394
|
-
cumulativeOffsets,
|
|
1395
|
-
expandedBreakpoints,
|
|
1396
|
-
normalizedPages,
|
|
1397
|
-
pageIds,
|
|
1398
|
-
prefer
|
|
1399
|
-
};
|
|
1400
|
-
logger?.trace?.("Finding break position using patterns...");
|
|
1401
|
-
breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
|
|
1402
|
-
logger?.trace?.("Pattern break position", { breakPosition });
|
|
1403
|
-
}
|
|
1404
|
-
if (breakPosition <= 0) {
|
|
1405
|
-
logger?.debug?.("No pattern matched, falling back to page boundary");
|
|
1406
|
-
if (windowEndIdx === currentFromIdx) {
|
|
1407
|
-
logger?.trace?.("Single page window, outputting page and advancing");
|
|
1408
|
-
const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
|
|
1409
|
-
const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
|
|
1410
|
-
if (pageSeg) result.push(pageSeg);
|
|
1411
|
-
remainingContent = remainingContent.slice(pageContent.length).trim();
|
|
1412
|
-
currentFromIdx++;
|
|
1413
|
-
isFirstPiece = false;
|
|
1414
|
-
logger?.trace?.("After single page", {
|
|
1415
|
-
currentFromIdx,
|
|
1416
|
-
remainingContentLength: remainingContent.length
|
|
1417
|
-
});
|
|
1418
|
-
continue;
|
|
1419
|
-
}
|
|
1420
|
-
breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
|
|
1421
|
-
logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
|
|
1422
|
-
}
|
|
1423
|
-
const pieceContent = remainingContent.slice(0, breakPosition).trim();
|
|
1424
|
-
logger?.trace?.("Piece extracted", {
|
|
1425
|
-
breakPosition,
|
|
1426
|
-
pieceContentLength: pieceContent.length,
|
|
1427
|
-
pieceContentPreview: pieceContent.slice(0, 80)
|
|
1428
|
-
});
|
|
1429
|
-
const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
|
|
1430
|
-
const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
|
|
1431
|
-
logger?.trace?.("Actual page indices", {
|
|
1432
|
-
actualEndIdx,
|
|
1433
|
-
actualStartIdx,
|
|
1434
|
-
pieceHasContent: !!pieceContent
|
|
1435
|
-
});
|
|
1436
|
-
if (pieceContent) {
|
|
1437
|
-
const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
|
|
1438
|
-
if (pieceSeg) {
|
|
1439
|
-
result.push(pieceSeg);
|
|
1440
|
-
logger?.debug?.("Created segment", {
|
|
1441
|
-
contentLength: pieceSeg.content.length,
|
|
1442
|
-
from: pieceSeg.from,
|
|
1443
|
-
to: pieceSeg.to
|
|
1444
|
-
});
|
|
1445
|
-
}
|
|
1446
|
-
}
|
|
1447
|
-
const prevRemainingLength = remainingContent.length;
|
|
1448
|
-
remainingContent = remainingContent.slice(breakPosition).trim();
|
|
1449
|
-
logger?.trace?.("After slicing remainingContent", {
|
|
1450
|
-
newLength: remainingContent.length,
|
|
1451
|
-
prevLength: prevRemainingLength,
|
|
1452
|
-
slicedAmount: breakPosition
|
|
1453
|
-
});
|
|
1454
|
-
if (!remainingContent) {
|
|
1455
|
-
logger?.debug?.("No remaining content, breaking out of loop");
|
|
1456
|
-
break;
|
|
1457
|
-
}
|
|
1458
|
-
let nextFromIdx = actualEndIdx;
|
|
1459
|
-
if (remainingContent && actualEndIdx + 1 <= toIdx) {
|
|
1460
|
-
const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
|
|
1461
|
-
if (nextPageData) {
|
|
1462
|
-
const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
|
|
1463
|
-
if (nextPrefix && remainingContent.startsWith(nextPrefix)) {
|
|
1464
|
-
nextFromIdx = actualEndIdx + 1;
|
|
1465
|
-
logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
|
|
1466
|
-
}
|
|
1467
|
-
}
|
|
1468
|
-
}
|
|
1469
|
-
logger?.trace?.("End of iteration", {
|
|
1470
|
-
nextFromIdx,
|
|
1471
|
-
prevCurrentFromIdx: currentFromIdx,
|
|
1472
|
-
willAdvance: nextFromIdx !== currentFromIdx
|
|
1473
|
-
});
|
|
1474
|
-
currentFromIdx = nextFromIdx;
|
|
1475
|
-
isFirstPiece = false;
|
|
1476
|
-
}
|
|
1477
|
-
}
|
|
1478
|
-
logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
|
|
1479
|
-
return result;
|
|
1480
|
-
};
|
|
1481
1623
|
/**
|
|
1482
1624
|
* Segments pages of content based on pattern-matching rules.
|
|
1483
1625
|
*
|
|
@@ -1521,45 +1663,15 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
1521
1663
|
* });
|
|
1522
1664
|
*/
|
|
1523
1665
|
const segmentPages = (pages, options = {}) => {
	// `options` defaults to an empty object: every field below is optional or has its
	// own default, so omitting the argument must not throw during destructuring.
	const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
	if (!pages.length) return [];
	const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
	// Rule phase: collect split points from all rules, keep one per index, build segments.
	const splitPoints = dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap));
	let segments = buildSegments(splitPoints, matchContent, pageMap, rules);
	// If the rules produced nothing, fall back to one segment spanning all pages so
	// that breakpoint processing below still has input.
	segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
	// Breakpoint phase runs only with a non-negative `maxPages` (0 is accepted) and at least one breakpoint.
	if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) {
		const patternProcessor = (p) => processPattern(p, false).pattern;
		return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
	}
	return segments;
};
|
|
1565
1677
|
/**
|