flappa-doormal 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +373 -190
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -967,7 +967,7 @@ type ExpandResult = {
|
|
|
967
967
|
* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
|
|
968
968
|
* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
|
|
969
969
|
*/
|
|
970
|
-
declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
|
|
970
|
+
declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string, capturePrefix?: string) => ExpandResult;
|
|
971
971
|
/**
|
|
972
972
|
* Expands template tokens in a query string to their regex equivalents.
|
|
973
973
|
*
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA,KA9VK,YAAA,GA8VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA,KHvOK,eAAA,GG2OJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAwCC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;;;;AEhtBX;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cHgaa,sBAAuB,iBAAiB,wBAAsB;;;;AFxhB3E;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8B
M;AAiCC;AAwBH;;;;;AAoBlB,cEjJO,oBFiJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;cAngBa;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cDuGa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,mHAIV;;;;;;;;;;;;;;;;;;;;cAqHU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;;AJrgBb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cIlmBN,mBJkmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIlmBgC,eJkmBhC,EAAA;AAiBnB;;;;AC/FA;;;;;;;;ACplBA;AAca,cE8GA,wBF7GyD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE6GL,eF7GK,EAAA,EAAA,GAAA,MAAA;;;;AC+CtE;AA6NA;AA2CA;AAWY,cC9LC,oBD8LW,EAAA,CAAA,QAAA,EC7LV,eD6LU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBAoGZ,GAAA,iBAhGE;EAqHU,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;AC3jBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
|
package/dist/index.mjs
CHANGED
|
@@ -732,171 +732,6 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
732
732
|
return result;
|
|
733
733
|
};
|
|
734
734
|
|
|
735
|
-
//#endregion
|
|
736
|
-
//#region src/segmentation/match-utils.ts
|
|
737
|
-
/**
|
|
738
|
-
* Utility functions for regex matching and result processing.
|
|
739
|
-
*
|
|
740
|
-
* These functions were extracted from `segmenter.ts` to reduce complexity
|
|
741
|
-
* and enable independent testing. They handle match filtering, capture
|
|
742
|
-
* extraction, and occurrence-based selection.
|
|
743
|
-
*
|
|
744
|
-
* @module match-utils
|
|
745
|
-
*/
|
|
746
|
-
/**
|
|
747
|
-
* Extracts named capture groups from a regex match.
|
|
748
|
-
*
|
|
749
|
-
* Only includes groups that are in the `captureNames` list and have
|
|
750
|
-
* defined values. This filters out positional captures and ensures
|
|
751
|
-
* only explicitly requested named captures are returned.
|
|
752
|
-
*
|
|
753
|
-
* @param groups - The `match.groups` object from `RegExp.exec()`
|
|
754
|
-
* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
|
|
755
|
-
* @returns Object with capture name → value pairs, or `undefined` if none found
|
|
756
|
-
*
|
|
757
|
-
* @example
|
|
758
|
-
* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
|
|
759
|
-
* extractNamedCaptures(match.groups, ['num'])
|
|
760
|
-
* // → { num: '٦٦٩٦' }
|
|
761
|
-
*
|
|
762
|
-
* @example
|
|
763
|
-
* // No matching captures
|
|
764
|
-
* extractNamedCaptures({}, ['num'])
|
|
765
|
-
* // → undefined
|
|
766
|
-
*
|
|
767
|
-
* @example
|
|
768
|
-
* // Undefined groups
|
|
769
|
-
* extractNamedCaptures(undefined, ['num'])
|
|
770
|
-
* // → undefined
|
|
771
|
-
*/
|
|
772
|
-
const extractNamedCaptures = (groups, captureNames) => {
|
|
773
|
-
if (!groups || captureNames.length === 0) return;
|
|
774
|
-
const namedCaptures = {};
|
|
775
|
-
for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
|
|
776
|
-
return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
|
|
777
|
-
};
|
|
778
|
-
/**
|
|
779
|
-
* Gets the last defined positional capture group from a match array.
|
|
780
|
-
*
|
|
781
|
-
* Used for `lineStartsAfter` patterns where the content capture (`.*`)
|
|
782
|
-
* is always at the end of the pattern. Named captures may shift the
|
|
783
|
-
* positional indices, so we iterate backward to find the actual content.
|
|
784
|
-
*
|
|
785
|
-
* @param match - RegExp exec result array
|
|
786
|
-
* @returns The last defined capture group value, or `undefined` if none
|
|
787
|
-
*
|
|
788
|
-
* @example
|
|
789
|
-
* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
|
|
790
|
-
* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
|
|
791
|
-
* getLastPositionalCapture(match)
|
|
792
|
-
* // → 'content'
|
|
793
|
-
*
|
|
794
|
-
* @example
|
|
795
|
-
* // No captures
|
|
796
|
-
* getLastPositionalCapture(['full match'])
|
|
797
|
-
* // → undefined
|
|
798
|
-
*/
|
|
799
|
-
const getLastPositionalCapture = (match) => {
|
|
800
|
-
if (match.length <= 1) return;
|
|
801
|
-
for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
|
|
802
|
-
};
|
|
803
|
-
/**
|
|
804
|
-
* Filters matches to only include those within page ID constraints.
|
|
805
|
-
*
|
|
806
|
-
* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
|
|
807
|
-
* matches that occur on pages outside the allowed range or explicitly excluded.
|
|
808
|
-
*
|
|
809
|
-
* @param matches - Array of match results to filter
|
|
810
|
-
* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
|
|
811
|
-
* @param getId - Function that returns the page ID for a given offset
|
|
812
|
-
* @returns Filtered array containing only matches within constraints
|
|
813
|
-
*
|
|
814
|
-
* @example
|
|
815
|
-
* const matches = [
|
|
816
|
-
* { start: 0, end: 10 }, // Page 1
|
|
817
|
-
* { start: 100, end: 110 }, // Page 5
|
|
818
|
-
* { start: 200, end: 210 }, // Page 10
|
|
819
|
-
* ];
|
|
820
|
-
* filterByConstraints(matches, { min: 3, max: 8 }, getId)
|
|
821
|
-
* // → [{ start: 100, end: 110 }] (only page 5 match)
|
|
822
|
-
*/
|
|
823
|
-
const filterByConstraints = (matches, rule, getId) => {
|
|
824
|
-
return matches.filter((m) => {
|
|
825
|
-
const id = getId(m.start);
|
|
826
|
-
if (rule.min !== void 0 && id < rule.min) return false;
|
|
827
|
-
if (rule.max !== void 0 && id > rule.max) return false;
|
|
828
|
-
if (isPageExcluded(id, rule.exclude)) return false;
|
|
829
|
-
return true;
|
|
830
|
-
});
|
|
831
|
-
};
|
|
832
|
-
/**
|
|
833
|
-
* Filters matches based on occurrence setting (first, last, or all).
|
|
834
|
-
*
|
|
835
|
-
* Applies occurrence-based selection to a list of matches:
|
|
836
|
-
* - `'all'` or `undefined`: Return all matches (default)
|
|
837
|
-
* - `'first'`: Return only the first match
|
|
838
|
-
* - `'last'`: Return only the last match
|
|
839
|
-
*
|
|
840
|
-
* @param matches - Array of match results to filter
|
|
841
|
-
* @param occurrence - Which occurrence(s) to keep
|
|
842
|
-
* @returns Filtered array based on occurrence setting
|
|
843
|
-
*
|
|
844
|
-
* @example
|
|
845
|
-
* const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
|
|
846
|
-
*
|
|
847
|
-
* filterByOccurrence(matches, 'first')
|
|
848
|
-
* // → [{ start: 0 }]
|
|
849
|
-
*
|
|
850
|
-
* filterByOccurrence(matches, 'last')
|
|
851
|
-
* // → [{ start: 20 }]
|
|
852
|
-
*
|
|
853
|
-
* filterByOccurrence(matches, 'all')
|
|
854
|
-
* // → [{ start: 0 }, { start: 10 }, { start: 20 }]
|
|
855
|
-
*
|
|
856
|
-
* filterByOccurrence(matches, undefined)
|
|
857
|
-
* // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
|
|
858
|
-
*/
|
|
859
|
-
const filterByOccurrence = (matches, occurrence) => {
|
|
860
|
-
if (!matches.length) return [];
|
|
861
|
-
if (occurrence === "first") return [matches[0]];
|
|
862
|
-
if (occurrence === "last") return [matches[matches.length - 1]];
|
|
863
|
-
return matches;
|
|
864
|
-
};
|
|
865
|
-
/**
|
|
866
|
-
* Checks if any rule in the list allows the given page ID.
|
|
867
|
-
*
|
|
868
|
-
* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
|
|
869
|
-
* Rules without constraints allow all page IDs.
|
|
870
|
-
*
|
|
871
|
-
* This is used to determine whether to create a segment for content
|
|
872
|
-
* that appears before any split points (the "first segment").
|
|
873
|
-
*
|
|
874
|
-
* @param rules - Array of rules with optional `min` and `max` constraints
|
|
875
|
-
* @param pageId - Page ID to check
|
|
876
|
-
* @returns `true` if at least one rule allows the page ID
|
|
877
|
-
*
|
|
878
|
-
* @example
|
|
879
|
-
* const rules = [
|
|
880
|
-
* { min: 5, max: 10 }, // Allows pages 5-10
|
|
881
|
-
* { min: 20 }, // Allows pages 20+
|
|
882
|
-
* ];
|
|
883
|
-
*
|
|
884
|
-
* anyRuleAllowsId(rules, 7) // → true (first rule allows)
|
|
885
|
-
* anyRuleAllowsId(rules, 3) // → false (no rule allows)
|
|
886
|
-
* anyRuleAllowsId(rules, 25) // → true (second rule allows)
|
|
887
|
-
*
|
|
888
|
-
* @example
|
|
889
|
-
* // Rules without constraints allow everything
|
|
890
|
-
* anyRuleAllowsId([{}], 999) // → true
|
|
891
|
-
*/
|
|
892
|
-
const anyRuleAllowsId = (rules, pageId) => {
|
|
893
|
-
return rules.some((r) => {
|
|
894
|
-
const minOk = r.min === void 0 || pageId >= r.min;
|
|
895
|
-
const maxOk = r.max === void 0 || pageId <= r.max;
|
|
896
|
-
return minOk && maxOk;
|
|
897
|
-
});
|
|
898
|
-
};
|
|
899
|
-
|
|
900
735
|
//#endregion
|
|
901
736
|
//#region src/segmentation/tokens.ts
|
|
902
737
|
/**
|
|
@@ -1127,7 +962,7 @@ const containsTokens = (query) => {
|
|
|
1127
962
|
* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
|
|
1128
963
|
* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
|
|
1129
964
|
*/
|
|
1130
|
-
const expandTokensWithCaptures = (query, fuzzyTransform) => {
|
|
965
|
+
const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
|
|
1131
966
|
const captureNames = [];
|
|
1132
967
|
const captureNameCounts = /* @__PURE__ */ new Map();
|
|
1133
968
|
/**
|
|
@@ -1169,16 +1004,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
|
|
|
1169
1004
|
const [, tokenName, captureName] = tokenMatch;
|
|
1170
1005
|
if (!tokenName && captureName) {
|
|
1171
1006
|
const uniqueName = getUniqueCaptureName(captureName);
|
|
1172
|
-
|
|
1173
|
-
|
|
1007
|
+
const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
|
|
1008
|
+
captureNames.push(prefixedName);
|
|
1009
|
+
return `(?<${prefixedName}>.+)`;
|
|
1174
1010
|
}
|
|
1175
1011
|
let tokenPattern = TOKEN_PATTERNS[tokenName];
|
|
1176
1012
|
if (!tokenPattern) return segment.value;
|
|
1177
1013
|
if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
|
|
1178
1014
|
if (captureName) {
|
|
1179
1015
|
const uniqueName = getUniqueCaptureName(captureName);
|
|
1180
|
-
|
|
1181
|
-
|
|
1016
|
+
const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
|
|
1017
|
+
captureNames.push(prefixedName);
|
|
1018
|
+
return `(?<${prefixedName}>${tokenPattern})`;
|
|
1182
1019
|
}
|
|
1183
1020
|
return tokenPattern;
|
|
1184
1021
|
});
|
|
@@ -1266,6 +1103,224 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
|
1266
1103
|
*/
|
|
1267
1104
|
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
1268
1105
|
|
|
1106
|
+
//#endregion
|
|
1107
|
+
//#region src/segmentation/fast-fuzzy-prefix.ts
|
|
1108
|
+
/**
|
|
1109
|
+
* Fast-path fuzzy prefix matching for common Arabic line-start markers.
|
|
1110
|
+
*
|
|
1111
|
+
* This exists to avoid running expensive fuzzy-expanded regex alternations over
|
|
1112
|
+
* a giant concatenated string. Instead, we match only at known line-start
|
|
1113
|
+
* offsets and perform a small deterministic comparison:
|
|
1114
|
+
* - Skip Arabic diacritics in the CONTENT
|
|
1115
|
+
* - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
|
|
1116
|
+
*
|
|
1117
|
+
* This module is intentionally conservative: it only supports "literal"
|
|
1118
|
+
* token patterns (plain text alternation via `|`), not general regex.
|
|
1119
|
+
*/
|
|
1120
|
+
/**
 * Tells whether a UTF-16 code unit lies in the inclusive range
 * U+064B..U+0652 (decimal 1611..1618), the range this module treats as
 * Arabic diacritics.
 *
 * @param code - Code unit, typically obtained from `String.prototype.charCodeAt`
 * @returns `true` when `code` is inside the range; `false` otherwise (including `NaN`)
 */
const isArabicDiacriticCode = (code) => {
    const FIRST_DIACRITIC = 0x064b;
    const LAST_DIACRITIC = 0x0652;
    return FIRST_DIACRITIC <= code && code <= LAST_DIACRITIC;
};
|
|
1121
|
+
/**
 * Collapses a character onto the representative of its equivalence group so
 * that two characters can be compared loosely:
 * - آ / أ / إ are folded onto ا
 * - ه is folded onto ة
 * - ي is folded onto ى
 *
 * Any other value is returned unchanged.
 *
 * @param ch - Single character to normalise
 * @returns The group representative, or `ch` itself when it belongs to no group
 */
const equivKey = (ch) => {
    if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
    if (ch === "ه") return "ة";
    if (ch === "ي") return "ى";
    return ch;
};
|
|
1131
|
+
/**
 * Match a fuzzy literal prefix at a given offset.
 *
 * Comparison rules:
 * - Diacritics (as decided by `isArabicDiacriticCode`) are ignored on BOTH
 *   sides: they are skipped in the content and in the literal. Skipping them
 *   only in the content would make any literal that carries a diacritic
 *   impossible to match, because the content-side mark is consumed before the
 *   literal-side mark is compared.
 * - Remaining characters are compared through `equivKey`, so members of the
 *   same equivalence group are treated as equal.
 * - Diacritics that trail the last matched character in the content are
 *   consumed, so the returned offset points past them.
 *
 * An empty literal matches at any offset (after skipping leading diacritics).
 * A non-empty literal made up solely of diacritics never matches.
 *
 * @param content - Text to match against
 * @param offset - Index in `content` at which the literal must start
 * @param literal - Plain-text literal to look for (not a regex)
 * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
 */
const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
    let i = offset;
    // Number of non-diacritic literal characters that were actually compared.
    let compared = 0;
    for (let j = 0; j < literal.length; j++) {
        // Diacritics in the literal carry no weight for a fuzzy comparison.
        if (isArabicDiacriticCode(literal.charCodeAt(j))) continue;
        while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
        if (i >= content.length) return null;
        if (equivKey(content[i]) !== equivKey(literal[j])) return null;
        compared++;
        i++;
    }
    // A literal with no base characters must not degrade into "match anything".
    if (literal.length > 0 && compared === 0) return null;
    while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
    return i;
};
|
|
1153
|
+
/**
 * Reports whether a string is free of regex syntax characters.
 *
 * The characters checked are `\ [ ] { } ( ) ^ $ . * + ?`. The pipe `|` is
 * deliberately not in the set, so plain-text alternations count as literal.
 *
 * @param s - Candidate pattern text
 * @returns `true` when none of the listed characters occur in `s`
 */
const isLiteralOnly = (s) => {
    const containsRegexSyntax = /[\\[\]{}()^$.*+?]/.test(s);
    return containsRegexSyntax === false;
};
|
|
1156
|
+
/**
 * Breaks a plain-text alternation (`a|b|c`) into its individual alternatives.
 *
 * Each alternative is trimmed and empty ones are discarded.
 *
 * @param pattern - Token pattern text
 * @returns `{ alternatives }` when the pattern is non-empty, passes
 *   `isLiteralOnly`, and yields at least one alternative; otherwise `null`
 */
const compileLiteralAlternation = (pattern) => {
    if (!pattern || !isLiteralOnly(pattern)) return null;
    const alternatives = [];
    for (const piece of pattern.split("|")) {
        const trimmed = piece.trim();
        if (trimmed) alternatives.push(trimmed);
    }
    return alternatives.length > 0 ? { alternatives } : null;
};
|
|
1163
|
+
/**
 * Tries to build a fast fuzzy rule from a template that consists of exactly
 * one token, e.g. `{{kitab}}`.
 *
 * The template is eligible only when all of the following hold:
 * - it is a single `{{name}}` token with nothing before or after it,
 * - `getTokenPattern` returns a pattern for that name,
 * - `compileLiteralAlternation` accepts that pattern.
 *
 * @param tokenTemplate - Template string to inspect
 * @returns `{ alternatives, token }` when eligible; otherwise `null`
 */
const compileFastFuzzyTokenRule = (tokenTemplate) => {
    const parsed = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
    const tokenPattern = parsed ? getTokenPattern(parsed[1]) : void 0;
    const compiled = tokenPattern ? compileLiteralAlternation(tokenPattern) : null;
    if (!compiled) return null;
    return {
        alternatives: compiled.alternatives,
        token: parsed[1]
    };
};
|
|
1180
|
+
/**
 * Tests the alternatives of a compiled token, in order, against the content
 * at a line-start offset and stops at the first one that matches.
 *
 * @param content - Text to match against
 * @param offset - Line-start index in `content`
 * @param compiled - Object carrying an `alternatives` array of literals
 * @returns endOffset (exclusive) of the first matching alternative, else `null`
 */
const matchFastFuzzyTokenAt = (content, offset, compiled) => {
    const { alternatives } = compiled;
    for (let k = 0; k < alternatives.length; k++) {
        const matchedEnd = matchFuzzyLiteralPrefixAt(content, offset, alternatives[k]);
        if (matchedEnd !== null) return matchedEnd;
    }
    return null;
};
|
|
1191
|
+
|
|
1192
|
+
//#endregion
|
|
1193
|
+
//#region src/segmentation/match-utils.ts
|
|
1194
|
+
/**
|
|
1195
|
+
* Utility functions for regex matching and result processing.
|
|
1196
|
+
*
|
|
1197
|
+
* These functions were extracted from `segmenter.ts` to reduce complexity
|
|
1198
|
+
* and enable independent testing. They handle match filtering, capture
|
|
1199
|
+
* extraction, and occurrence-based selection.
|
|
1200
|
+
*
|
|
1201
|
+
* @module match-utils
|
|
1202
|
+
*/
|
|
1203
|
+
/**
|
|
1204
|
+
* Extracts named capture groups from a regex match.
|
|
1205
|
+
*
|
|
1206
|
+
* Only includes groups that are in the `captureNames` list and have
|
|
1207
|
+
* defined values. This filters out positional captures and ensures
|
|
1208
|
+
* only explicitly requested named captures are returned.
|
|
1209
|
+
*
|
|
1210
|
+
* @param groups - The `match.groups` object from `RegExp.exec()`
|
|
1211
|
+
* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
|
|
1212
|
+
* @returns Object with capture name → value pairs, or `undefined` if none found
|
|
1213
|
+
*
|
|
1214
|
+
* @example
|
|
1215
|
+
* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
|
|
1216
|
+
* extractNamedCaptures(match.groups, ['num'])
|
|
1217
|
+
* // → { num: '٦٦٩٦' }
|
|
1218
|
+
*
|
|
1219
|
+
* @example
|
|
1220
|
+
* // No matching captures
|
|
1221
|
+
* extractNamedCaptures({}, ['num'])
|
|
1222
|
+
* // → undefined
|
|
1223
|
+
*
|
|
1224
|
+
* @example
|
|
1225
|
+
* // Undefined groups
|
|
1226
|
+
* extractNamedCaptures(undefined, ['num'])
|
|
1227
|
+
* // → undefined
|
|
1228
|
+
*/
|
|
1229
|
+
const extractNamedCaptures = (groups, captureNames) => {
|
|
1230
|
+
if (!groups || captureNames.length === 0) return;
|
|
1231
|
+
const namedCaptures = {};
|
|
1232
|
+
for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
|
|
1233
|
+
return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
|
|
1234
|
+
};
|
|
1235
|
+
/**
 * Returns the last positional capture group of a match that has a value.
 *
 * The match array is scanned from its final entry back to index 1 (index 0,
 * the full match, is never considered) and the first entry that is not
 * `undefined` is returned.
 *
 * @param match - RegExp exec result array
 * @returns The last defined capture value, or `undefined` when the match has
 *   no capture groups or none of them is defined
 *
 * @example
 * getLastPositionalCapture(['6696 - content', '6696', 'content'])
 * // → 'content'
 *
 * @example
 * getLastPositionalCapture(['full match'])
 * // → undefined
 */
const getLastPositionalCapture = (match) => {
    let idx = match.length;
    while (--idx >= 1) {
        const candidate = match[idx];
        if (candidate !== void 0) return candidate;
    }
    return void 0;
};
|
|
1260
|
+
/**
 * Keeps only the matches whose page satisfies a rule's page constraints.
 *
 * For every match the page ID is looked up from the match's `start` offset.
 * The match is dropped when that ID is below `rule.min`, above `rule.max`, or
 * reported as excluded by `isPageExcluded(id, rule.exclude)`. Constraints that
 * are `undefined` are not applied.
 *
 * @param matches - Array of match results to filter (each exposing `start`)
 * @param rule - Rule carrying optional `min`, `max`, and `exclude` constraints
 * @param getId - Function that returns the page ID for a given offset
 * @returns New array containing only the matches that satisfy the constraints
 *
 * @example
 * // With getId mapping offsets 0 / 100 / 200 to pages 1 / 5 / 10:
 * filterByConstraints(matches, { min: 3, max: 8 }, getId)
 * // → only the match that starts at offset 100
 */
const filterByConstraints = (matches, rule, getId) => {
    const isWithinConstraints = (candidate) => {
        const pageId = getId(candidate.start);
        return (
            !(rule.min !== void 0 && pageId < rule.min) &&
            !(rule.max !== void 0 && pageId > rule.max) &&
            !isPageExcluded(pageId, rule.exclude)
        );
    };
    return matches.filter((candidate) => isWithinConstraints(candidate));
};
|
|
1289
|
+
/**
 * Checks whether at least one rule permits the given page ID.
 *
 * A rule permits the ID when the ID is not below the rule's `min` and not
 * above its `max`; a bound that is `undefined` is treated as absent, so a rule
 * with neither bound permits every page.
 *
 * NOTE(review): only `min` and `max` are consulted here. A rule's `exclude`
 * list is not checked, unlike in `filterByConstraints` — confirm that this is
 * intended.
 *
 * @param rules - Array of rules with optional `min` and `max` constraints
 * @param pageId - Page ID to check
 * @returns `true` if at least one rule allows the page ID
 *
 * @example
 * const rules = [{ min: 5, max: 10 }, { min: 20 }];
 * anyRuleAllowsId(rules, 7)   // → true  (first rule)
 * anyRuleAllowsId(rules, 3)   // → false (no rule)
 * anyRuleAllowsId(rules, 25)  // → true  (second rule)
 * anyRuleAllowsId([{}], 999)  // → true  (unconstrained rule)
 */
const anyRuleAllowsId = (rules, pageId) => {
    const allows = ({ min, max }) => {
        if (min !== void 0 && !(pageId >= min)) return false;
        return max === void 0 || pageId <= max;
    };
    return rules.some((candidate) => allows(candidate));
};
|
|
1323
|
+
|
|
1269
1324
|
//#endregion
|
|
1270
1325
|
//#region src/segmentation/rule-regex.ts
|
|
1271
1326
|
/**
|
|
@@ -1319,39 +1374,42 @@ const compileRuleRegex = (pattern) => {
|
|
|
1319
1374
|
*
|
|
1320
1375
|
* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
|
|
1321
1376
|
*/
|
|
1322
|
-
const processPattern = (pattern, fuzzy) => {
|
|
1323
|
-
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
|
|
1377
|
+
const processPattern = (pattern, fuzzy, capturePrefix) => {
|
|
1378
|
+
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
|
|
1324
1379
|
return {
|
|
1325
1380
|
captureNames,
|
|
1326
1381
|
pattern: expanded
|
|
1327
1382
|
};
|
|
1328
1383
|
};
|
|
1329
|
-
const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
|
|
1330
|
-
const processed = patterns.map((p) => processPattern(p, fuzzy));
|
|
1384
|
+
const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
1385
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
1331
1386
|
const union = processed.map((p) => p.pattern).join("|");
|
|
1387
|
+
const captureNames = processed.flatMap((p) => p.captureNames);
|
|
1388
|
+
const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
|
|
1389
|
+
if (capturePrefix) captureNames.push(`${capturePrefix}content`);
|
|
1332
1390
|
return {
|
|
1333
|
-
captureNames
|
|
1334
|
-
regex: `^(?:${union})
|
|
1391
|
+
captureNames,
|
|
1392
|
+
regex: `^(?:${union})${contentCapture}`
|
|
1335
1393
|
};
|
|
1336
1394
|
};
|
|
1337
|
-
const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
|
|
1338
|
-
const processed = patterns.map((p) => processPattern(p, fuzzy));
|
|
1395
|
+
const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
1396
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
1339
1397
|
const union = processed.map((p) => p.pattern).join("|");
|
|
1340
1398
|
return {
|
|
1341
1399
|
captureNames: processed.flatMap((p) => p.captureNames),
|
|
1342
1400
|
regex: `^(?:${union})`
|
|
1343
1401
|
};
|
|
1344
1402
|
};
|
|
1345
|
-
const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
|
|
1346
|
-
const processed = patterns.map((p) => processPattern(p, fuzzy));
|
|
1403
|
+
const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
1404
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
1347
1405
|
const union = processed.map((p) => p.pattern).join("|");
|
|
1348
1406
|
return {
|
|
1349
1407
|
captureNames: processed.flatMap((p) => p.captureNames),
|
|
1350
1408
|
regex: `(?:${union})$`
|
|
1351
1409
|
};
|
|
1352
1410
|
};
|
|
1353
|
-
const buildTemplateRegexSource = (template) => {
|
|
1354
|
-
const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
|
|
1411
|
+
const buildTemplateRegexSource = (template, capturePrefix) => {
|
|
1412
|
+
const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
|
|
1355
1413
|
return {
|
|
1356
1414
|
captureNames,
|
|
1357
1415
|
regex: pattern
|
|
@@ -1363,12 +1421,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
|
|
|
1363
1421
|
*
|
|
1364
1422
|
* Behavior mirrors the previous implementation in `segmenter.ts`.
|
|
1365
1423
|
*/
|
|
1366
|
-
const buildRuleRegex = (rule) => {
|
|
1424
|
+
const buildRuleRegex = (rule, capturePrefix) => {
|
|
1367
1425
|
const s = { ...rule };
|
|
1368
1426
|
const fuzzy = rule.fuzzy ?? false;
|
|
1369
1427
|
let allCaptureNames = [];
|
|
1370
1428
|
if (s.lineStartsAfter?.length) {
|
|
1371
|
-
const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
|
|
1429
|
+
const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
|
|
1372
1430
|
allCaptureNames = captureNames;
|
|
1373
1431
|
return {
|
|
1374
1432
|
captureNames: allCaptureNames,
|
|
@@ -1378,17 +1436,17 @@ const buildRuleRegex = (rule) => {
|
|
|
1378
1436
|
};
|
|
1379
1437
|
}
|
|
1380
1438
|
if (s.lineStartsWith?.length) {
|
|
1381
|
-
const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
|
|
1439
|
+
const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
|
|
1382
1440
|
s.regex = regex;
|
|
1383
1441
|
allCaptureNames = captureNames;
|
|
1384
1442
|
}
|
|
1385
1443
|
if (s.lineEndsWith?.length) {
|
|
1386
|
-
const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
|
|
1444
|
+
const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
|
|
1387
1445
|
s.regex = regex;
|
|
1388
1446
|
allCaptureNames = captureNames;
|
|
1389
1447
|
}
|
|
1390
1448
|
if (s.template) {
|
|
1391
|
-
const { regex, captureNames } = buildTemplateRegexSource(s.template);
|
|
1449
|
+
const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
|
|
1392
1450
|
s.regex = regex;
|
|
1393
1451
|
allCaptureNames = [...allCaptureNames, ...captureNames];
|
|
1394
1452
|
}
|
|
@@ -1544,9 +1602,120 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
|
|
|
1544
1602
|
return [initialSeg];
|
|
1545
1603
|
};
|
|
1546
1604
|
const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
|
|
1547
|
-
const
|
|
1605
|
+
const combinableRules = [];
|
|
1606
|
+
const standaloneRules = [];
|
|
1607
|
+
const fastFuzzyRules = [];
|
|
1608
|
+
rules.forEach((rule, index) => {
|
|
1609
|
+
let isCombinable = true;
|
|
1610
|
+
if (rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith)) {
|
|
1611
|
+
const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
|
|
1612
|
+
if (compiled) {
|
|
1613
|
+
fastFuzzyRules.push({
|
|
1614
|
+
compiled,
|
|
1615
|
+
rule,
|
|
1616
|
+
ruleIndex: index
|
|
1617
|
+
});
|
|
1618
|
+
return;
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
if ("regex" in rule && rule.regex) {
|
|
1622
|
+
const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
|
|
1623
|
+
const hasBackreferences = /\\[1-9]/.test(rule.regex);
|
|
1624
|
+
const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
|
|
1625
|
+
if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
|
|
1626
|
+
}
|
|
1627
|
+
if (isCombinable) combinableRules.push({
|
|
1628
|
+
index,
|
|
1629
|
+
prefix: `r${index}_`,
|
|
1630
|
+
rule
|
|
1631
|
+
});
|
|
1632
|
+
else standaloneRules.push(rule);
|
|
1633
|
+
});
|
|
1634
|
+
const splitPointsByRule = /* @__PURE__ */ new Map();
|
|
1635
|
+
if (fastFuzzyRules.length > 0) {
|
|
1636
|
+
let boundaryIdx = 0;
|
|
1637
|
+
let currentBoundary = pageMap.boundaries[boundaryIdx];
|
|
1638
|
+
const advanceBoundaryTo = (offset) => {
|
|
1639
|
+
while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
|
|
1640
|
+
boundaryIdx++;
|
|
1641
|
+
currentBoundary = pageMap.boundaries[boundaryIdx];
|
|
1642
|
+
}
|
|
1643
|
+
};
|
|
1644
|
+
const recordSplitPoint = (ruleIndex, sp) => {
|
|
1645
|
+
if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
|
|
1646
|
+
splitPointsByRule.get(ruleIndex).push(sp);
|
|
1647
|
+
};
|
|
1648
|
+
for (let lineStart = 0; lineStart <= matchContent.length;) {
|
|
1649
|
+
advanceBoundaryTo(lineStart);
|
|
1650
|
+
const pageId = currentBoundary?.id ?? 0;
|
|
1651
|
+
if (lineStart >= matchContent.length) break;
|
|
1652
|
+
for (const { compiled, rule, ruleIndex } of fastFuzzyRules) {
|
|
1653
|
+
if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
|
|
1654
|
+
const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
|
|
1655
|
+
if (end === null) continue;
|
|
1656
|
+
recordSplitPoint(ruleIndex, {
|
|
1657
|
+
index: (rule.split ?? "at") === "at" ? lineStart : end,
|
|
1658
|
+
meta: rule.meta
|
|
1659
|
+
});
|
|
1660
|
+
}
|
|
1661
|
+
const nextNl = matchContent.indexOf("\n", lineStart);
|
|
1662
|
+
if (nextNl === -1) break;
|
|
1663
|
+
lineStart = nextNl + 1;
|
|
1664
|
+
}
|
|
1665
|
+
}
|
|
1666
|
+
if (combinableRules.length > 0) {
|
|
1667
|
+
const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
|
|
1668
|
+
const built = buildRuleRegex(rule, prefix);
|
|
1669
|
+
return {
|
|
1670
|
+
prefix,
|
|
1671
|
+
source: `(?<${prefix}>${built.regex.source})`,
|
|
1672
|
+
...built
|
|
1673
|
+
};
|
|
1674
|
+
});
|
|
1675
|
+
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
1676
|
+
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
1677
|
+
combinedRegex.lastIndex = 0;
|
|
1678
|
+
let m = combinedRegex.exec(matchContent);
|
|
1679
|
+
while (m !== null) {
|
|
1680
|
+
const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
|
|
1681
|
+
if (matchedRuleIndex !== -1) {
|
|
1682
|
+
const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
|
|
1683
|
+
const ruleInfo = ruleRegexes[matchedRuleIndex];
|
|
1684
|
+
const namedCaptures = {};
|
|
1685
|
+
if (m.groups) {
|
|
1686
|
+
for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
|
|
1687
|
+
const cleanName = prefixedName.slice(prefix.length);
|
|
1688
|
+
namedCaptures[cleanName] = m.groups[prefixedName];
|
|
1689
|
+
}
|
|
1690
|
+
}
|
|
1691
|
+
let capturedContent;
|
|
1692
|
+
let contentStartOffset;
|
|
1693
|
+
if (ruleInfo.usesLineStartsAfter) {
|
|
1694
|
+
capturedContent = m.groups?.[`${prefix}content`];
|
|
1695
|
+
if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
|
|
1696
|
+
}
|
|
1697
|
+
const start = m.index;
|
|
1698
|
+
const end = m.index + m[0].length;
|
|
1699
|
+
const pageId = pageMap.getId(start);
|
|
1700
|
+
if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
|
|
1701
|
+
const sp = {
|
|
1702
|
+
capturedContent: void 0,
|
|
1703
|
+
contentStartOffset,
|
|
1704
|
+
index: (rule.split ?? "at") === "at" ? start : end,
|
|
1705
|
+
meta: rule.meta,
|
|
1706
|
+
namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
|
|
1707
|
+
};
|
|
1708
|
+
if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
|
|
1709
|
+
splitPointsByRule.get(originalIndex).push(sp);
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1712
|
+
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
1713
|
+
m = combinedRegex.exec(matchContent);
|
|
1714
|
+
}
|
|
1715
|
+
}
|
|
1716
|
+
const collectSplitPointsFromRule = (rule, ruleIndex) => {
|
|
1548
1717
|
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
1549
|
-
|
|
1718
|
+
const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
|
|
1550
1719
|
const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
|
|
1551
1720
|
const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
|
|
1552
1721
|
return {
|
|
@@ -1557,8 +1726,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
|
|
|
1557
1726
|
namedCaptures: m.namedCaptures
|
|
1558
1727
|
};
|
|
1559
1728
|
});
|
|
1729
|
+
if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
|
|
1730
|
+
splitPointsByRule.get(ruleIndex).push(...points);
|
|
1560
1731
|
};
|
|
1561
|
-
|
|
1732
|
+
standaloneRules.forEach((rule) => {
|
|
1733
|
+
collectSplitPointsFromRule(rule, rules.indexOf(rule));
|
|
1734
|
+
});
|
|
1735
|
+
const finalSplitPoints = [];
|
|
1736
|
+
rules.forEach((rule, index) => {
|
|
1737
|
+
const points = splitPointsByRule.get(index);
|
|
1738
|
+
if (!points || points.length === 0) return;
|
|
1739
|
+
let filtered = points;
|
|
1740
|
+
if (rule.occurrence === "first") filtered = [points[0]];
|
|
1741
|
+
else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
|
|
1742
|
+
finalSplitPoints.push(...filtered);
|
|
1743
|
+
});
|
|
1744
|
+
return finalSplitPoints;
|
|
1562
1745
|
};
|
|
1563
1746
|
/**
|
|
1564
1747
|
* Executes a regex against content and extracts match results with capture information.
|