flappa-doormal 2.2.3 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +17 -0
- package/README.md +22 -0
- package/dist/index.d.mts +1 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +425 -219
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
@@ -348,25 +348,25 @@ const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalize
 	for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
 		const pageData = normalizedPages.get(pageIds[pi]);
 		if (!pageData) continue;
-		const trimmed = pageData.content.trimStart();
-		let found = -1;
-		for (const len of JOINER_PREFIX_LENGTHS) {
-			const prefix = trimmed.slice(0, Math.min(len, trimmed.length)).trim();
-			if (!prefix) continue;
-			const pos = updated.indexOf(prefix, searchFrom);
-			if (pos > 0) {
-				found = pos;
-				break;
-			}
-		}
-		if (found > 0) {
-			if (updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
-			searchFrom = found;
-		}
+		const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
+		if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
+		if (found > 0) searchFrom = found;
 	}
 	return updated;
 };
 /**
+ * Finds the position of a page prefix in content, trying multiple prefix lengths.
+ */
+const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
+	for (const len of JOINER_PREFIX_LENGTHS) {
+		const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
+		if (!prefix) continue;
+		const pos = content.indexOf(prefix, searchFrom);
+		if (pos > 0) return pos;
+	}
+	return -1;
+};
+/**
  * Estimates how far into the current page `remainingContent` begins.
  *
  * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
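The refactor above is a pure extraction: the inline prefix-search loop becomes `findPrefixPositionInContent`, with an early `return pos` replacing the `found`/`break` bookkeeping. A small sketch of the helper's contract — the lengths here are hypothetical; the real `JOINER_PREFIX_LENGTHS` constant is defined elsewhere in the bundle:

```js
// Hypothetical lengths; the bundle defines the real JOINER_PREFIX_LENGTHS elsewhere.
const JOINER_PREFIX_LENGTHS = [40, 20, 10];

const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
	for (const len of JOINER_PREFIX_LENGTHS) {
		const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
		if (!prefix) continue;
		const pos = content.indexOf(prefix, searchFrom);
		if (pos > 0) return pos;
	}
	return -1;
};

// Progressively shorter prefixes give the joiner several chances to anchor
// the start of the next page inside the joined content.
const joined = "first page text\nsecond page text";
console.log(findPrefixPositionInContent(joined, "second page text", 0)); // 16
```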
@@ -390,7 +390,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
  * This is used to define breakpoint windows in terms of actual content being split, rather than
  * raw per-page offsets which can desync when structural rules strip markers.
  */
-const findPageStartNearExpectedBoundary = (remainingContent, currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
 	const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
 	if (!targetPageData) return -1;
 	const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -548,6 +548,21 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
 	return selected.index + selected.length;
 };
 /**
+ * Handles page boundary breakpoint (empty pattern).
+ * Returns break position or -1 if no valid position found.
+ */
+const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
+	const nextPageIdx = windowEndIdx + 1;
+	if (nextPageIdx <= toIdx) {
+		const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
+		if (nextPageData) {
+			const pos = findNextPagePosition(remainingContent, nextPageData);
+			if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
+		}
+	}
+	return Math.min(windowEndPosition, remainingContent.length);
+};
+/**
  * Tries to find a break position within the current window using breakpoint patterns.
  * Returns the break position or -1 if no suitable break was found.
  *
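`handlePageBoundaryBreak` is the same page-boundary fallback that previously lived inline in `findBreakPosition` (removed in the next hunk): for an empty pattern, prefer the start of the next page but never return a position past the window or the remaining content. A minimal sketch of that preference, with `findNextPagePosition` stubbed here (in the bundle it locates the next page's prefix inside `remainingContent`):

```js
// Stub standing in for the bundle's findNextPagePosition.
const findNextPagePosition = (remaining, nextPage) => remaining.indexOf(nextPage.content.slice(0, 10));

const remaining = "tail of page one\npage two body";
const windowEndPosition = 25;
const pos = findNextPagePosition(remaining, { content: "page two body" });
console.log(pos); // 17 — the break lands where page two begins
console.log(Math.min(pos, windowEndPosition, remaining.length)); // 17, capped by the window
```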
@@ -564,17 +579,7 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
 		if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
 		if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
 		if (skipWhenRegex?.test(remainingContent)) continue;
-		if (regex === null) {
-			const nextPageIdx = windowEndIdx + 1;
-			if (nextPageIdx <= toIdx) {
-				const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
-				if (nextPageData) {
-					const pos = findNextPagePosition(remainingContent, nextPageData);
-					if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
-				}
-			}
-			return Math.min(windowEndPosition, remainingContent.length);
-		}
+		if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
 		const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
 		if (breakPos > 0) return breakPos;
 	}
@@ -636,7 +641,8 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
 		const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
 		if (nextPageData) {
 			const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
-			if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
+			const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
+			if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
 		}
 	}
 	return nextFromIdx;
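The added `remainingPrefix` makes the page-advance comparison bidirectional. The removed line appears to have tested only `remainingContent.startsWith(nextPrefix)`, which fails whenever fewer than 30 characters remain after a split even though the split landed exactly on the page boundary:

```js
const nextPage = { content: "chapter heading and more text here" };
const remaining = "chapter"; // only 7 chars left after the previous split

const nextPrefix = nextPage.content.slice(0, 30);
console.log(remaining.startsWith(nextPrefix)); // false — the one-way check misses

const remainingPrefix = remaining.trimStart().slice(0, Math.min(30, remaining.length));
console.log(nextPage.content.startsWith(remainingPrefix)); // true — the page index advances
```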
@@ -726,171 +732,6 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 	return result;
 };
 
-//#endregion
-//#region src/segmentation/match-utils.ts
-/**
- * Utility functions for regex matching and result processing.
- *
- * These functions were extracted from `segmenter.ts` to reduce complexity
- * and enable independent testing. They handle match filtering, capture
- * extraction, and occurrence-based selection.
- *
- * @module match-utils
- */
-/**
- * Extracts named capture groups from a regex match.
- *
- * Only includes groups that are in the `captureNames` list and have
- * defined values. This filters out positional captures and ensures
- * only explicitly requested named captures are returned.
- *
- * @param groups - The `match.groups` object from `RegExp.exec()`
- * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
- * @returns Object with capture name → value pairs, or `undefined` if none found
- *
- * @example
- * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
- * extractNamedCaptures(match.groups, ['num'])
- * // → { num: '٦٦٩٦' }
- *
- * @example
- * // No matching captures
- * extractNamedCaptures({}, ['num'])
- * // → undefined
- *
- * @example
- * // Undefined groups
- * extractNamedCaptures(undefined, ['num'])
- * // → undefined
- */
-const extractNamedCaptures = (groups, captureNames) => {
-	if (!groups || captureNames.length === 0) return;
-	const namedCaptures = {};
-	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
-	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
-};
-/**
- * Gets the last defined positional capture group from a match array.
- *
- * Used for `lineStartsAfter` patterns where the content capture (`.*`)
- * is always at the end of the pattern. Named captures may shift the
- * positional indices, so we iterate backward to find the actual content.
- *
- * @param match - RegExp exec result array
- * @returns The last defined capture group value, or `undefined` if none
- *
- * @example
- * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
- * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
- * getLastPositionalCapture(match)
- * // → 'content'
- *
- * @example
- * // No captures
- * getLastPositionalCapture(['full match'])
- * // → undefined
- */
-const getLastPositionalCapture = (match) => {
-	if (match.length <= 1) return;
-	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
-};
-/**
- * Filters matches to only include those within page ID constraints.
- *
- * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
- * matches that occur on pages outside the allowed range or explicitly excluded.
- *
- * @param matches - Array of match results to filter
- * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
- * @param getId - Function that returns the page ID for a given offset
- * @returns Filtered array containing only matches within constraints
- *
- * @example
- * const matches = [
- *   { start: 0, end: 10 },    // Page 1
- *   { start: 100, end: 110 }, // Page 5
- *   { start: 200, end: 210 }, // Page 10
- * ];
- * filterByConstraints(matches, { min: 3, max: 8 }, getId)
- * // → [{ start: 100, end: 110 }] (only page 5 match)
- */
-const filterByConstraints = (matches, rule, getId) => {
-	return matches.filter((m) => {
-		const id = getId(m.start);
-		if (rule.min !== void 0 && id < rule.min) return false;
-		if (rule.max !== void 0 && id > rule.max) return false;
-		if (isPageExcluded(id, rule.exclude)) return false;
-		return true;
-	});
-};
-/**
- * Filters matches based on occurrence setting (first, last, or all).
- *
- * Applies occurrence-based selection to a list of matches:
- * - `'all'` or `undefined`: Return all matches (default)
- * - `'first'`: Return only the first match
- * - `'last'`: Return only the last match
- *
- * @param matches - Array of match results to filter
- * @param occurrence - Which occurrence(s) to keep
- * @returns Filtered array based on occurrence setting
- *
- * @example
- * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
- *
- * filterByOccurrence(matches, 'first')
- * // → [{ start: 0 }]
- *
- * filterByOccurrence(matches, 'last')
- * // → [{ start: 20 }]
- *
- * filterByOccurrence(matches, 'all')
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
- *
- * filterByOccurrence(matches, undefined)
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
- */
-const filterByOccurrence = (matches, occurrence) => {
-	if (!matches.length) return [];
-	if (occurrence === "first") return [matches[0]];
-	if (occurrence === "last") return [matches[matches.length - 1]];
-	return matches;
-};
-/**
- * Checks if any rule in the list allows the given page ID.
- *
- * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
- * Rules without constraints allow all page IDs.
- *
- * This is used to determine whether to create a segment for content
- * that appears before any split points (the "first segment").
- *
- * @param rules - Array of rules with optional `min` and `max` constraints
- * @param pageId - Page ID to check
- * @returns `true` if at least one rule allows the page ID
- *
- * @example
- * const rules = [
- *   { min: 5, max: 10 }, // Allows pages 5-10
- *   { min: 20 },         // Allows pages 20+
- * ];
- *
- * anyRuleAllowsId(rules, 7)  // → true (first rule allows)
- * anyRuleAllowsId(rules, 3)  // → false (no rule allows)
- * anyRuleAllowsId(rules, 25) // → true (second rule allows)
- *
- * @example
- * // Rules without constraints allow everything
- * anyRuleAllowsId([{}], 999) // → true
- */
-const anyRuleAllowsId = (rules, pageId) => {
-	return rules.some((r) => {
-		const minOk = r.min === void 0 || pageId >= r.min;
-		const maxOk = r.max === void 0 || pageId <= r.max;
-		return minOk && maxOk;
-	});
-};
-
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
@@ -977,6 +818,7 @@ const BASE_TOKENS = {
 	dash: "[-–—ـ]",
 	fasl: ["مسألة", "فصل"].join("|"),
 	harf: "[أ-ي]",
+	harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
 	kitab: "كتاب",
 	naql: [
 		"حدثني",
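The new `harfs` token extends `harf` (a single Arabic letter) to a run of letters that may contain internal whitespace but must start and end on a letter:

```js
const harfs = /^[أ-ي](?:[أ-ي\s]*[أ-ي])?$/;
console.log(harfs.test("ب"));   // true  — single letter, like harf
console.log(harfs.test("ب ت")); // true  — letters with an internal space
console.log(harfs.test("ب "));  // false — may not end on whitespace
```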
@@ -1120,7 +962,7 @@ const containsTokens = (query) => {
  * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
  * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
  */
-const expandTokensWithCaptures = (query, fuzzyTransform) => {
+const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
 	const captureNames = [];
 	const captureNameCounts = /* @__PURE__ */ new Map();
 	/**
@@ -1162,16 +1004,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
 		const [, tokenName, captureName] = tokenMatch;
 		if (!tokenName && captureName) {
 			const uniqueName = getUniqueCaptureName(captureName);
-			captureNames.push(uniqueName);
-			return `(?<${uniqueName}>.+)`;
+			const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+			captureNames.push(prefixedName);
+			return `(?<${prefixedName}>.+)`;
 		}
 		let tokenPattern = TOKEN_PATTERNS[tokenName];
 		if (!tokenPattern) return segment.value;
 		if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
 		if (captureName) {
 			const uniqueName = getUniqueCaptureName(captureName);
-			captureNames.push(uniqueName);
-			return `(?<${uniqueName}>${tokenPattern})`;
+			const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+			captureNames.push(prefixedName);
+			return `(?<${prefixedName}>${tokenPattern})`;
 		}
 		return tokenPattern;
 	});
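The new `capturePrefix` parameter exists because JavaScript rejects a regex containing two groups with the same name; prefixing each rule's captures (e.g. `r0_`, `r1_`) is what later allows many rules to be OR-ed into one combined regex:

```js
try {
	new RegExp("(?<num>\\d+)|(?<num>[٠-٩]+)");
} catch (e) {
	console.log(e.message); // duplicate capture group name — why unprefixed rules can't be combined
}
const combined = /(?<r0_num>\d+)|(?<r1_num>[٠-٩]+)/;
console.log(combined.exec("٤٢").groups.r1_num); // "٤٢" — the r1_ prefix maps back to rule 1
```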
@@ -1259,6 +1103,224 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  */
 const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 
+//#endregion
+//#region src/segmentation/fast-fuzzy-prefix.ts
+/**
+ * Fast-path fuzzy prefix matching for common Arabic line-start markers.
+ *
+ * This exists to avoid running expensive fuzzy-expanded regex alternations over
+ * a giant concatenated string. Instead, we match only at known line-start
+ * offsets and perform a small deterministic comparison:
+ * - Skip Arabic diacritics in the CONTENT
+ * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+ *
+ * This module is intentionally conservative: it only supports "literal"
+ * token patterns (plain text alternation via `|`), not general regex.
+ */
+const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+const equivKey = (ch) => {
+	switch (ch) {
+		case "آ":
+		case "أ":
+		case "إ": return "ا";
+		case "ه": return "ة";
+		case "ي": return "ى";
+		default: return ch;
+	}
+};
+/**
+ * Match a fuzzy literal prefix at a given offset.
+ *
+ * - Skips diacritics in the content
+ * - Applies equivalence groups on both content and literal
+ *
+ * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
+ */
+const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+	let i = offset;
+	while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+	for (let j = 0; j < literal.length; j++) {
+		const litCh = literal[j];
+		while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+		if (i >= content.length) return null;
+		const cCh = content[i];
+		if (equivKey(cCh) !== equivKey(litCh)) return null;
+		i++;
+	}
+	while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+	return i;
+};
+const isLiteralOnly = (s) => {
+	return !/[\\[\]{}()^$.*+?]/.test(s);
+};
+const compileLiteralAlternation = (pattern) => {
+	if (!pattern) return null;
+	if (!isLiteralOnly(pattern)) return null;
+	const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+	if (!alternatives.length) return null;
+	return { alternatives };
+};
+/**
+ * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+ * Returns null if not eligible.
+ */
+const compileFastFuzzyTokenRule = (tokenTemplate) => {
+	const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+	if (!m) return null;
+	const token = m[1];
+	const tokenPattern = getTokenPattern(token);
+	if (!tokenPattern) return null;
+	const compiled = compileLiteralAlternation(tokenPattern);
+	if (!compiled) return null;
+	return {
+		alternatives: compiled.alternatives,
+		token
+	};
+};
+/**
+ * Try matching any alternative for a compiled token at a line-start offset.
+ * Returns endOffset (exclusive) on match, else null.
+ */
+const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+	for (const alt of compiled.alternatives) {
+		const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+		if (end !== null) return end;
+	}
+	return null;
+};
+
+//#endregion
+//#region src/segmentation/match-utils.ts
+/**
+ * Utility functions for regex matching and result processing.
+ *
+ * These functions were extracted from `segmenter.ts` to reduce complexity
+ * and enable independent testing. They handle match filtering, capture
+ * extraction, and occurrence-based selection.
+ *
+ * @module match-utils
+ */
+/**
+ * Extracts named capture groups from a regex match.
+ *
+ * Only includes groups that are in the `captureNames` list and have
+ * defined values. This filters out positional captures and ensures
+ * only explicitly requested named captures are returned.
+ *
+ * @param groups - The `match.groups` object from `RegExp.exec()`
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+ * @returns Object with capture name → value pairs, or `undefined` if none found
+ *
+ * @example
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+ * extractNamedCaptures(match.groups, ['num'])
+ * // → { num: '٦٦٩٦' }
+ *
+ * @example
+ * // No matching captures
+ * extractNamedCaptures({}, ['num'])
+ * // → undefined
+ *
+ * @example
+ * // Undefined groups
+ * extractNamedCaptures(undefined, ['num'])
+ * // → undefined
+ */
+const extractNamedCaptures = (groups, captureNames) => {
+	if (!groups || captureNames.length === 0) return;
+	const namedCaptures = {};
+	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+};
+/**
+ * Gets the last defined positional capture group from a match array.
+ *
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
+ * is always at the end of the pattern. Named captures may shift the
+ * positional indices, so we iterate backward to find the actual content.
+ *
+ * @param match - RegExp exec result array
+ * @returns The last defined capture group value, or `undefined` if none
+ *
+ * @example
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+ * getLastPositionalCapture(match)
+ * // → 'content'
+ *
+ * @example
+ * // No captures
+ * getLastPositionalCapture(['full match'])
+ * // → undefined
+ */
+const getLastPositionalCapture = (match) => {
+	if (match.length <= 1) return;
+	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+};
+/**
+ * Filters matches to only include those within page ID constraints.
+ *
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+ * matches that occur on pages outside the allowed range or explicitly excluded.
+ *
+ * @param matches - Array of match results to filter
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+ * @param getId - Function that returns the page ID for a given offset
+ * @returns Filtered array containing only matches within constraints
+ *
+ * @example
+ * const matches = [
+ *   { start: 0, end: 10 },    // Page 1
+ *   { start: 100, end: 110 }, // Page 5
+ *   { start: 200, end: 210 }, // Page 10
+ * ];
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
+ */
+const filterByConstraints = (matches, rule, getId) => {
+	return matches.filter((m) => {
+		const id = getId(m.start);
+		if (rule.min !== void 0 && id < rule.min) return false;
+		if (rule.max !== void 0 && id > rule.max) return false;
+		if (isPageExcluded(id, rule.exclude)) return false;
+		return true;
+	});
+};
+/**
+ * Checks if any rule in the list allows the given page ID.
+ *
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+ * Rules without constraints allow all page IDs.
+ *
+ * This is used to determine whether to create a segment for content
+ * that appears before any split points (the "first segment").
+ *
+ * @param rules - Array of rules with optional `min` and `max` constraints
+ * @param pageId - Page ID to check
+ * @returns `true` if at least one rule allows the page ID
+ *
+ * @example
+ * const rules = [
+ *   { min: 5, max: 10 }, // Allows pages 5-10
+ *   { min: 20 },         // Allows pages 20+
+ * ];
+ *
+ * anyRuleAllowsId(rules, 7)  // → true (first rule allows)
+ * anyRuleAllowsId(rules, 3)  // → false (no rule allows)
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
+ *
+ * @example
+ * // Rules without constraints allow everything
+ * anyRuleAllowsId([{}], 999) // → true
+ */
+const anyRuleAllowsId = (rules, pageId) => {
+	return rules.some((r) => {
+		const minOk = r.min === void 0 || pageId >= r.min;
+		const maxOk = r.max === void 0 || pageId <= r.max;
+		return minOk && maxOk;
+	});
+};
+
 //#endregion
 //#region src/segmentation/rule-regex.ts
 /**
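The fast path added above replaces regex work with a plain character walk. Assuming the module's `matchFuzzyLiteralPrefixAt` is in scope, a vocalized heading matches its plain literal because diacritics (char codes 1611–1618, i.e. U+064B–U+0652) are skipped and variant letters are folded by `equivKey`:

```js
// "كِتَابُ" matches the literal "كتاب": the kasra/fatha/damma are skipped.
console.log(matchFuzzyLiteralPrefixAt("كِتَابُ الصلاة", 0, "كتاب")); // 7 — end offset past the vocalized word
// Variant folding applies to both sides: أ/إ/آ → ا, ه → ة, ي → ى.
console.log(matchFuzzyLiteralPrefixAt("إلى", 0, "الى"));             // 3
console.log(matchFuzzyLiteralPrefixAt("باب الطهارة", 0, "كتاب"));    // null — first letters differ
```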
@@ -1282,6 +1344,21 @@ const hasCapturingGroup = (pattern) => {
 	return /\((?!\?)/.test(pattern);
 };
 /**
+ * Extracts named capture group names from a regex pattern.
+ *
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
+ *
+ * @example
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
+ * extractNamedCaptureNames('^\\d+') // []
+ */
+const extractNamedCaptureNames = (pattern) => {
+	const names = [];
+	for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
+	return names;
+};
+/**
  * Safely compiles a regex pattern, throwing a helpful error if invalid.
  */
 const compileRuleRegex = (pattern) => {
@@ -1297,56 +1374,59 @@ const compileRuleRegex = (pattern) => {
  *
  * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
  */
-const processPattern = (pattern, fuzzy) => {
-	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
+const processPattern = (pattern, fuzzy, capturePrefix) => {
+	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
 	return {
 		captureNames,
 		pattern: expanded
 	};
 };
-const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
+	const captureNames = processed.flatMap((p) => p.captureNames);
+	const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
+	if (capturePrefix) captureNames.push(`${capturePrefix}content`);
 	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^(?:${union})(.*)`
+		captureNames,
+		regex: `^(?:${union})${contentCapture}`
 	};
 };
-const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
 	return {
 		captureNames: processed.flatMap((p) => p.captureNames),
 		regex: `^(?:${union})`
 	};
 };
-const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
 	return {
 		captureNames: processed.flatMap((p) => p.captureNames),
 		regex: `(?:${union})$`
 	};
 };
-const buildTemplateRegexSource = (template) => {
-	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+const buildTemplateRegexSource = (template, capturePrefix) => {
+	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
 	return {
 		captureNames,
 		regex: pattern
 	};
 };
-const determineUsesCapture = (regexSource, …
+const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(regexSource);
 /**
  * Builds a compiled regex and metadata from a split rule.
  *
  * Behavior mirrors the previous implementation in `segmenter.ts`.
  */
-const buildRuleRegex = (rule) => {
+const buildRuleRegex = (rule, capturePrefix) => {
 	const s = { ...rule };
 	const fuzzy = rule.fuzzy ?? false;
 	let allCaptureNames = [];
 	if (s.lineStartsAfter?.length) {
-		const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+		const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
 		allCaptureNames = captureNames;
 		return {
 			captureNames: allCaptureNames,
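For `lineStartsAfter` rules the trailing content capture also has to survive combination, so with a prefix the anonymous `(.*)` becomes a named `(?<${capturePrefix}content>.*)` group. A hand-written stand-in for one expanded rule shows the shape:

```js
// Stand-in for an expanded lineStartsAfter pattern built with capturePrefix "r0_".
const re = /^(?:الحديث \d+ )(?<r0_content>.*)/m;
const m = re.exec("الحديث 12 متن الحديث");
console.log(m.groups.r0_content); // "متن الحديث" — the per-rule content capture
```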
@@ -1356,21 +1436,22 @@ const buildRuleRegex = (rule) => {
 		};
 	}
 	if (s.lineStartsWith?.length) {
-		const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+		const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = captureNames;
 	}
 	if (s.lineEndsWith?.length) {
-		const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+		const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = captureNames;
 	}
 	if (s.template) {
-		const { regex, captureNames } = buildTemplateRegexSource(s.template);
+		const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = [...allCaptureNames, ...captureNames];
 	}
 	if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
+	if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(s.regex);
 	const usesCapture = determineUsesCapture(s.regex, allCaptureNames);
 	return {
 		captureNames: allCaptureNames,
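The new fallback line matters for rules supplied as a raw `regex` string: those never pass through token expansion, so before this change their named groups were invisible to `captureNames`. The added `extractNamedCaptureNames` scan recovers them from the pattern source:

```js
const extractNamedCaptureNames = (pattern) => {
	const names = [];
	for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
	return names;
};
console.log(extractNamedCaptureNames("^(?<num>[٠-٩]+)\\s+(?<title>.+)$")); // ["num", "title"]
```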
@@ -1521,9 +1602,120 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 	return [initialSeg];
 };
 const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
-	const …
+	const combinableRules = [];
+	const standaloneRules = [];
+	const fastFuzzyRules = [];
+	rules.forEach((rule, index) => {
+		let isCombinable = true;
+		if (rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith)) {
+			const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+			if (compiled) {
+				fastFuzzyRules.push({
+					compiled,
+					rule,
+					ruleIndex: index
+				});
+				return;
+			}
+		}
+		if ("regex" in rule && rule.regex) {
+			const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+			const hasBackreferences = /\\[1-9]/.test(rule.regex);
+			const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+			if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+		}
+		if (isCombinable) combinableRules.push({
+			index,
+			prefix: `r${index}_`,
+			rule
+		});
+		else standaloneRules.push(rule);
+	});
+	const splitPointsByRule = /* @__PURE__ */ new Map();
+	if (fastFuzzyRules.length > 0) {
+		let boundaryIdx = 0;
+		let currentBoundary = pageMap.boundaries[boundaryIdx];
+		const advanceBoundaryTo = (offset) => {
+			while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+				boundaryIdx++;
+				currentBoundary = pageMap.boundaries[boundaryIdx];
+			}
+		};
+		const recordSplitPoint = (ruleIndex, sp) => {
+			if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+			splitPointsByRule.get(ruleIndex).push(sp);
+		};
+		for (let lineStart = 0; lineStart <= matchContent.length;) {
+			advanceBoundaryTo(lineStart);
+			const pageId = currentBoundary?.id ?? 0;
+			if (lineStart >= matchContent.length) break;
+			for (const { compiled, rule, ruleIndex } of fastFuzzyRules) {
+				if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+				const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+				if (end === null) continue;
+				recordSplitPoint(ruleIndex, {
+					index: (rule.split ?? "at") === "at" ? lineStart : end,
+					meta: rule.meta
+				});
+			}
+			const nextNl = matchContent.indexOf("\n", lineStart);
+			if (nextNl === -1) break;
+			lineStart = nextNl + 1;
+		}
+	}
+	if (combinableRules.length > 0) {
+		const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
+			const built = buildRuleRegex(rule, prefix);
+			return {
+				prefix,
+				source: `(?<${prefix}>${built.regex.source})`,
+				...built
+			};
+		});
+		const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+		const combinedRegex = new RegExp(combinedSource, "gm");
+		combinedRegex.lastIndex = 0;
+		let m = combinedRegex.exec(matchContent);
+		while (m !== null) {
+			const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+			if (matchedRuleIndex !== -1) {
+				const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
+				const ruleInfo = ruleRegexes[matchedRuleIndex];
+				const namedCaptures = {};
+				if (m.groups) {
+					for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
+						const cleanName = prefixedName.slice(prefix.length);
+						namedCaptures[cleanName] = m.groups[prefixedName];
+					}
+				}
+				let capturedContent;
+				let contentStartOffset;
+				if (ruleInfo.usesLineStartsAfter) {
+					capturedContent = m.groups?.[`${prefix}content`];
+					if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
+				}
+				const start = m.index;
+				const end = m.index + m[0].length;
+				const pageId = pageMap.getId(start);
+				if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+					const sp = {
+						capturedContent: void 0,
+						contentStartOffset,
+						index: (rule.split ?? "at") === "at" ? start : end,
+						meta: rule.meta,
+						namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+					};
+					if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+					splitPointsByRule.get(originalIndex).push(sp);
+				}
+			}
+			if (m[0].length === 0) combinedRegex.lastIndex++;
+			m = combinedRegex.exec(matchContent);
+		}
+	}
+	const collectSplitPointsFromRule = (rule, ruleIndex) => {
 		const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-		…
+		const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
 			const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
 			const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
 			return {
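The rewritten collector above makes three passes: a character-walk pass for fast-fuzzy rules, one combined regex for all capture-free (or prefixable) rules, and per-rule scans for the rest. The combined pass wraps each rule's source in a named group keyed by its index (`r${index}_`), so a single `exec` both finds a match and identifies the owning rule. A reduced sketch of that dispatch:

```js
const sources = ["^كتاب .*$", "^باب .*$"]; // stand-ins for two compiled rule regexes
const combined = new RegExp(sources.map((s, i) => `(?<r${i}_>${s})`).join("|"), "gm");
for (const m of "باب الطهارة\nكتاب الصلاة".matchAll(combined)) {
	const ruleIndex = sources.findIndex((_, i) => m.groups[`r${i}_`] !== void 0);
	console.log(ruleIndex, m[0]); // 1 "باب الطهارة", then 0 "كتاب الصلاة"
}
```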
@@ -1534,8 +1726,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 				namedCaptures: m.namedCaptures
 			};
 		});
+		if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+		splitPointsByRule.get(ruleIndex).push(...points);
 	};
-	…
+	standaloneRules.forEach((rule) => {
+		collectSplitPointsFromRule(rule, rules.indexOf(rule));
+	});
+	const finalSplitPoints = [];
+	rules.forEach((rule, index) => {
+		const points = splitPointsByRule.get(index);
+		if (!points || points.length === 0) return;
+		let filtered = points;
+		if (rule.occurrence === "first") filtered = [points[0]];
+		else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
+		finalSplitPoints.push(...filtered);
+	});
+	return finalSplitPoints;
 };
 /**
  * Executes a regex against content and extracts match results with capture information.