flappa-doormal 2.3.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +55 -0
- package/README.md +260 -5
- package/dist/index.d.mts +106 -22
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +657 -312
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -5
package/dist/index.mjs
CHANGED
@@ -662,12 +662,24 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 break;
 }
 const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
-
+const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+if (remainingSpan <= maxPages && !remainingHasExclusions) {
 const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
 if (finalSeg) result.push(finalSeg);
 break;
 }
 const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx],
+remainingContentStart: remainingContent.slice(0, 50),
+remainingContentLength: remainingContent.length,
+remainingSpan,
+toIdx,
+toPageId: pageIds[toIdx],
+windowEndIdx,
+windowEndPageId: pageIds[windowEndIdx]
+});
 const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
 const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
 let breakPosition = -1;
@@ -680,16 +692,35 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 });
 if (breakPosition <= 0) breakPosition = windowEndPosition;
 const pieceContent = remainingContent.slice(0, breakPosition).trim();
+logger?.debug?.("[breakpoints] selectedBreak", {
+breakPosition,
+pieceContentEnd: pieceContent.slice(-50),
+pieceContentLength: pieceContent.length,
+windowEndPosition
+});
 const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
 if (pieceContent) {
 const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
 if (pieceSeg) result.push(pieceSeg);
 }
 remainingContent = remainingContent.slice(breakPosition).trim();
-
+logger?.debug?.("[breakpoints] afterSlice", {
+actualEndIdx,
+remainingContentLength: remainingContent.length,
+remainingContentStart: remainingContent.slice(0, 60)
+});
+if (!remainingContent) {
+logger?.debug?.("[breakpoints] done: no remaining content");
+break;
+}
 currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+logger?.debug?.("[breakpoints] nextIteration", {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx]
+});
 isFirstPiece = false;
 }
+logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
 return result;
 };
 /**
@@ -708,6 +739,14 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 maxPages,
 segmentCount: segments.length
 });
+logger?.debug?.("[breakpoints] inputSegments", {
+segmentCount: segments.length,
+segments: segments.map((s) => ({
+contentLength: s.content.length,
+from: s.from,
+to: s.to
+}))
+});
 for (const segment of segments) {
 const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
 const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
@@ -732,6 +771,138 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 return result;
 };
 
+//#endregion
+//#region src/segmentation/match-utils.ts
+/**
+* Utility functions for regex matching and result processing.
+*
+* These functions were extracted from `segmenter.ts` to reduce complexity
+* and enable independent testing. They handle match filtering, capture
+* extraction, and occurrence-based selection.
+*
+* @module match-utils
+*/
+/**
+* Extracts named capture groups from a regex match.
+*
+* Only includes groups that are in the `captureNames` list and have
+* defined values. This filters out positional captures and ensures
+* only explicitly requested named captures are returned.
+*
+* @param groups - The `match.groups` object from `RegExp.exec()`
+* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+* @returns Object with capture name → value pairs, or `undefined` if none found
+*
+* @example
+* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+* extractNamedCaptures(match.groups, ['num'])
+* // → { num: '٦٦٩٦' }
+*
+* @example
+* // No matching captures
+* extractNamedCaptures({}, ['num'])
+* // → undefined
+*
+* @example
+* // Undefined groups
+* extractNamedCaptures(undefined, ['num'])
+* // → undefined
+*/
+const extractNamedCaptures = (groups, captureNames) => {
+if (!groups || captureNames.length === 0) return;
+const namedCaptures = {};
+for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+};
+/**
+* Gets the last defined positional capture group from a match array.
+*
+* Used for `lineStartsAfter` patterns where the content capture (`.*`)
+* is always at the end of the pattern. Named captures may shift the
+* positional indices, so we iterate backward to find the actual content.
+*
+* @param match - RegExp exec result array
+* @returns The last defined capture group value, or `undefined` if none
+*
+* @example
+* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+* getLastPositionalCapture(match)
+* // → 'content'
+*
+* @example
+* // No captures
+* getLastPositionalCapture(['full match'])
+* // → undefined
+*/
+const getLastPositionalCapture = (match) => {
+if (match.length <= 1) return;
+for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+};
+/**
+* Filters matches to only include those within page ID constraints.
+*
+* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+* matches that occur on pages outside the allowed range or explicitly excluded.
+*
+* @param matches - Array of match results to filter
+* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+* @param getId - Function that returns the page ID for a given offset
+* @returns Filtered array containing only matches within constraints
+*
+* @example
+* const matches = [
+* { start: 0, end: 10 }, // Page 1
+* { start: 100, end: 110 }, // Page 5
+* { start: 200, end: 210 }, // Page 10
+* ];
+* filterByConstraints(matches, { min: 3, max: 8 }, getId)
+* // → [{ start: 100, end: 110 }] (only page 5 match)
+*/
+const filterByConstraints = (matches, rule, getId) => {
+return matches.filter((m) => {
+const id = getId(m.start);
+if (rule.min !== void 0 && id < rule.min) return false;
+if (rule.max !== void 0 && id > rule.max) return false;
+if (isPageExcluded(id, rule.exclude)) return false;
+return true;
+});
+};
+/**
+* Checks if any rule in the list allows the given page ID.
+*
+* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+* Rules without constraints allow all page IDs.
+*
+* This is used to determine whether to create a segment for content
+* that appears before any split points (the "first segment").
+*
+* @param rules - Array of rules with optional `min` and `max` constraints
+* @param pageId - Page ID to check
+* @returns `true` if at least one rule allows the page ID
+*
+* @example
+* const rules = [
+* { min: 5, max: 10 }, // Allows pages 5-10
+* { min: 20 }, // Allows pages 20+
+* ];
+*
+* anyRuleAllowsId(rules, 7) // → true (first rule allows)
+* anyRuleAllowsId(rules, 3) // → false (no rule allows)
+* anyRuleAllowsId(rules, 25) // → true (second rule allows)
+*
+* @example
+* // Rules without constraints allow everything
+* anyRuleAllowsId([{}], 999) // → true
+*/
+const anyRuleAllowsId = (rules, pageId) => {
+return rules.some((r) => {
+const minOk = r.min === void 0 || pageId >= r.min;
+const maxOk = r.max === void 0 || pageId <= r.max;
+return minOk && maxOk;
+});
+};
+
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
@@ -798,19 +969,13 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 * // → '{{harf}}' (unchanged - no brackets outside tokens)
 */
 const escapeTemplateBrackets = (pattern) => {
-return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (
+return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
 if (token) return token;
 return `\\${bracket}`;
 });
 };
-
-
-*
-* These tokens contain raw regex patterns and do not reference other tokens.
-* For composite tokens that build on these, see `COMPOSITE_TOKENS`.
-*
-* @internal
-*/
+const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصسدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
+const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
 const BASE_TOKENS = {
 bab: "باب",
 basmalah: ["بسم الله", "﷽"].join("|"),
@@ -818,7 +983,7 @@ const BASE_TOKENS = {
 dash: "[-–—ـ]",
 fasl: ["مسألة", "فصل"].join("|"),
 harf: "[أ-ي]",
-harfs: "[أ-ي](
+harfs: "[أ-ي](?:\\s+[أ-ي])*",
 kitab: "كتاب",
 naql: [
 "حدثني",
@@ -831,6 +996,7 @@ const BASE_TOKENS = {
 ].join("|"),
 raqm: "[\\u0660-\\u0669]",
 raqms: "[\\u0660-\\u0669]+",
+rumuz: RUMUZ_BLOCK,
 tarqim: "[.!?؟؛]"
 };
 /**
@@ -1090,236 +1256,18 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
 /**
 * Gets the regex pattern for a specific token name.
 *
-* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
-* without any expansion or capture group wrapping.
-*
-* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
-* @returns The regex pattern string, or `undefined` if token doesn't exist
-*
-* @example
-* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
-* getTokenPattern('dash') // → '[-–—ـ]'
-* getTokenPattern('unknown') // → undefined
-*/
-const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
-
-//#endregion
-//#region src/segmentation/fast-fuzzy-prefix.ts
-/**
-* Fast-path fuzzy prefix matching for common Arabic line-start markers.
-*
-* This exists to avoid running expensive fuzzy-expanded regex alternations over
-* a giant concatenated string. Instead, we match only at known line-start
-* offsets and perform a small deterministic comparison:
-* - Skip Arabic diacritics in the CONTENT
-* - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
-*
-* This module is intentionally conservative: it only supports "literal"
-* token patterns (plain text alternation via `|`), not general regex.
-*/
-const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
-const equivKey = (ch) => {
-switch (ch) {
-case "آ":
-case "أ":
-case "إ": return "ا";
-case "ه": return "ة";
-case "ي": return "ى";
-default: return ch;
-}
-};
-/**
-* Match a fuzzy literal prefix at a given offset.
-*
-* - Skips diacritics in the content
-* - Applies equivalence groups on both content and literal
-*
-* @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
-*/
-const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
-let i = offset;
-while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
-for (let j = 0; j < literal.length; j++) {
-const litCh = literal[j];
-while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
-if (i >= content.length) return null;
-const cCh = content[i];
-if (equivKey(cCh) !== equivKey(litCh)) return null;
-i++;
-}
-while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
-return i;
-};
-const isLiteralOnly = (s) => {
-return !/[\\[\]{}()^$.*+?]/.test(s);
-};
-const compileLiteralAlternation = (pattern) => {
-if (!pattern) return null;
-if (!isLiteralOnly(pattern)) return null;
-const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
-if (!alternatives.length) return null;
-return { alternatives };
-};
-/**
-* Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
-* Returns null if not eligible.
-*/
-const compileFastFuzzyTokenRule = (tokenTemplate) => {
-const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
-if (!m) return null;
-const token = m[1];
-const tokenPattern = getTokenPattern(token);
-if (!tokenPattern) return null;
-const compiled = compileLiteralAlternation(tokenPattern);
-if (!compiled) return null;
-return {
-alternatives: compiled.alternatives,
-token
-};
-};
-/**
-* Try matching any alternative for a compiled token at a line-start offset.
-* Returns endOffset (exclusive) on match, else null.
-*/
-const matchFastFuzzyTokenAt = (content, offset, compiled) => {
-for (const alt of compiled.alternatives) {
-const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
-if (end !== null) return end;
-}
-return null;
-};
-
-//#endregion
-//#region src/segmentation/match-utils.ts
-/**
-* Utility functions for regex matching and result processing.
-*
-* These functions were extracted from `segmenter.ts` to reduce complexity
-* and enable independent testing. They handle match filtering, capture
-* extraction, and occurrence-based selection.
-*
-* @module match-utils
-*/
-/**
-* Extracts named capture groups from a regex match.
-*
-* Only includes groups that are in the `captureNames` list and have
-* defined values. This filters out positional captures and ensures
-* only explicitly requested named captures are returned.
-*
-* @param groups - The `match.groups` object from `RegExp.exec()`
-* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
-* @returns Object with capture name → value pairs, or `undefined` if none found
-*
-* @example
-* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
-* extractNamedCaptures(match.groups, ['num'])
-* // → { num: '٦٦٩٦' }
-*
-* @example
-* // No matching captures
-* extractNamedCaptures({}, ['num'])
-* // → undefined
-*
-* @example
-* // Undefined groups
-* extractNamedCaptures(undefined, ['num'])
-* // → undefined
-*/
-const extractNamedCaptures = (groups, captureNames) => {
-if (!groups || captureNames.length === 0) return;
-const namedCaptures = {};
-for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
-return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
-};
-/**
-* Gets the last defined positional capture group from a match array.
-*
-* Used for `lineStartsAfter` patterns where the content capture (`.*`)
-* is always at the end of the pattern. Named captures may shift the
-* positional indices, so we iterate backward to find the actual content.
-*
-* @param match - RegExp exec result array
-* @returns The last defined capture group value, or `undefined` if none
-*
-* @example
-* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
-* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
-* getLastPositionalCapture(match)
-* // → 'content'
-*
-* @example
-* // No captures
-* getLastPositionalCapture(['full match'])
-* // → undefined
-*/
-const getLastPositionalCapture = (match) => {
-if (match.length <= 1) return;
-for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
-};
-/**
-* Filters matches to only include those within page ID constraints.
-*
-* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
-* matches that occur on pages outside the allowed range or explicitly excluded.
-*
-* @param matches - Array of match results to filter
-* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
-* @param getId - Function that returns the page ID for a given offset
-* @returns Filtered array containing only matches within constraints
-*
-* @example
-* const matches = [
-* { start: 0, end: 10 }, // Page 1
-* { start: 100, end: 110 }, // Page 5
-* { start: 200, end: 210 }, // Page 10
-* ];
-* filterByConstraints(matches, { min: 3, max: 8 }, getId)
-* // → [{ start: 100, end: 110 }] (only page 5 match)
-*/
-const filterByConstraints = (matches, rule, getId) => {
-return matches.filter((m) => {
-const id = getId(m.start);
-if (rule.min !== void 0 && id < rule.min) return false;
-if (rule.max !== void 0 && id > rule.max) return false;
-if (isPageExcluded(id, rule.exclude)) return false;
-return true;
-});
-};
-/**
-* Checks if any rule in the list allows the given page ID.
-*
-* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
-* Rules without constraints allow all page IDs.
-*
-* This is used to determine whether to create a segment for content
-* that appears before any split points (the "first segment").
-*
-* @param rules - Array of rules with optional `min` and `max` constraints
-* @param pageId - Page ID to check
-* @returns `true` if at least one rule allows the page ID
-*
-* @example
-* const rules = [
-* { min: 5, max: 10 }, // Allows pages 5-10
-* { min: 20 }, // Allows pages 20+
-* ];
+* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+* without any expansion or capture group wrapping.
 *
-*
-*
-* anyRuleAllowsId(rules, 25) // → true (second rule allows)
+* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+* @returns The regex pattern string, or `undefined` if token doesn't exist
 *
 * @example
-* //
-*
+* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
+* getTokenPattern('dash') // → '[-–—ـ]'
+* getTokenPattern('unknown') // → undefined
 */
-const
-return rules.some((r) => {
-const minOk = r.min === void 0 || pageId >= r.min;
-const maxOk = r.max === void 0 || pageId <= r.max;
-return minOk && maxOk;
-});
-};
+const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 
 //#endregion
 //#region src/segmentation/rule-regex.ts
@@ -1462,16 +1410,231 @@ const buildRuleRegex = (rule, capturePrefix) => {
 };
 
 //#endregion
-//#region src/segmentation/
+//#region src/segmentation/fast-fuzzy-prefix.ts
+/**
+* Fast-path fuzzy prefix matching for common Arabic line-start markers.
+*
+* This exists to avoid running expensive fuzzy-expanded regex alternations over
+* a giant concatenated string. Instead, we match only at known line-start
+* offsets and perform a small deterministic comparison:
+* - Skip Arabic diacritics in the CONTENT
+* - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+*
+* This module is intentionally conservative: it only supports "literal"
+* token patterns (plain text alternation via `|`), not general regex.
+*/
+const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+const equivKey = (ch) => {
+switch (ch) {
+case "آ":
+case "أ":
+case "إ": return "ا";
+case "ه": return "ة";
+case "ي": return "ى";
+default: return ch;
+}
+};
 /**
-*
+* Match a fuzzy literal prefix at a given offset.
+*
+* - Skips diacritics in the content
+* - Applies equivalence groups on both content and literal
 *
-* @
-
+* @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
+*/
+const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+let i = offset;
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+for (let j = 0; j < literal.length; j++) {
+const litCh = literal[j];
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+if (i >= content.length) return null;
+const cCh = content[i];
+if (equivKey(cCh) !== equivKey(litCh)) return null;
+i++;
+}
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+return i;
+};
+const isLiteralOnly = (s) => {
+return !/[\\[\]{}()^$.*+?]/.test(s);
+};
+const compileLiteralAlternation = (pattern) => {
+if (!pattern) return null;
+if (!isLiteralOnly(pattern)) return null;
+const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+if (!alternatives.length) return null;
+return { alternatives };
+};
+/**
+* Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+* Returns null if not eligible.
+*/
+const compileFastFuzzyTokenRule = (tokenTemplate) => {
+const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+if (!m) return null;
+const token = m[1];
+const tokenPattern = getTokenPattern(token);
+if (!tokenPattern) return null;
+const compiled = compileLiteralAlternation(tokenPattern);
+if (!compiled) return null;
+return {
+alternatives: compiled.alternatives,
+token
+};
+};
+/**
+* Try matching any alternative for a compiled token at a line-start offset.
+* Returns endOffset (exclusive) on match, else null.
 */
-const
-
+const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+for (const alt of compiled.alternatives) {
+const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+if (end !== null) return end;
+}
+return null;
+};
+
+//#endregion
+//#region src/segmentation/segmenter-rule-utils.ts
+const partitionRulesForMatching = (rules) => {
+const combinableRules = [];
+const standaloneRules = [];
+const fastFuzzyRules = [];
+rules.forEach((rule, index) => {
+if (rule.fuzzy && "lineStartsWith" in rule) {
+const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsWith",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+if (rule.fuzzy && "lineStartsAfter" in rule) {
+const compiled = rule.lineStartsAfter.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsAfter[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsAfter",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+let isCombinable = true;
+if ("regex" in rule && rule.regex) {
+const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+const hasBackreferences = /\\[1-9]/.test(rule.regex);
+const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+}
+if (isCombinable) combinableRules.push({
+index,
+prefix: `r${index}_`,
+rule
+});
+else standaloneRules.push(rule);
+});
+return {
+combinableRules,
+fastFuzzyRules,
+standaloneRules
+};
+};
+const createPageStartGuardChecker = (matchContent, pageMap) => {
+const pageStartToBoundaryIndex = /* @__PURE__ */ new Map();
+for (let i = 0; i < pageMap.boundaries.length; i++) pageStartToBoundaryIndex.set(pageMap.boundaries[i].start, i);
+const compiledPageStartPrev = /* @__PURE__ */ new Map();
+const getPageStartPrevRegex = (rule, ruleIndex) => {
+if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
+const pattern = rule.pageStartGuard;
+if (!pattern) {
+compiledPageStartPrev.set(ruleIndex, null);
+return null;
+}
+const expanded = processPattern(pattern, false).pattern;
+const re = new RegExp(`(?:${expanded})$`, "u");
+compiledPageStartPrev.set(ruleIndex, re);
+return re;
+};
+const getPrevPageLastNonWsChar = (boundaryIndex) => {
+if (boundaryIndex <= 0) return "";
+const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
+const ch = matchContent[i];
+if (!ch) continue;
+if (/\s/u.test(ch)) continue;
+return ch;
+}
+return "";
+};
+return (rule, ruleIndex, matchStart) => {
+const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
+if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
+const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+if (!prevReq) return true;
+const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+if (!lastChar) return false;
+return prevReq.test(lastChar);
+};
+};
+const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
+const splitPointsByRule = /* @__PURE__ */ new Map();
+if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
+let boundaryIdx = 0;
+let currentBoundary = pageMap.boundaries[boundaryIdx];
+const advanceBoundaryTo = (offset) => {
+while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+boundaryIdx++;
+currentBoundary = pageMap.boundaries[boundaryIdx];
+}
+};
+const recordSplitPoint = (ruleIndex, sp) => {
+const arr = splitPointsByRule.get(ruleIndex);
+if (!arr) {
+splitPointsByRule.set(ruleIndex, [sp]);
+return;
+}
+arr.push(sp);
+};
+const isPageStart = (offset) => offset === currentBoundary?.start;
+for (let lineStart = 0; lineStart <= matchContent.length;) {
+advanceBoundaryTo(lineStart);
+const pageId = currentBoundary?.id ?? 0;
+if (lineStart >= matchContent.length) break;
+for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+if (end === null) continue;
+const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+if (kind === "startsWith") recordSplitPoint(ruleIndex, {
+index: splitIndex,
+meta: rule.meta
+});
+else {
+const markerLength = end - lineStart;
+recordSplitPoint(ruleIndex, {
+contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+index: splitIndex,
+meta: rule.meta
+});
+}
+}
+const nextNl = matchContent.indexOf("\n", lineStart);
+if (nextNl === -1) break;
+lineStart = nextNl + 1;
+}
+return splitPointsByRule;
 };
+
+//#endregion
+//#region src/segmentation/textUtils.ts
 /**
 * Normalizes line endings to Unix-style (`\n`).
 *
@@ -1481,7 +1644,9 @@ const stripHtmlTags = (html) => {
 * @param content - Raw content with potentially mixed line endings
 * @returns Content with all line endings normalized to `\n`
 */
-const normalizeLineEndings = (content) =>
+const normalizeLineEndings = (content) => {
+return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+};
 
 //#endregion
 //#region src/segmentation/segmenter.ts
@@ -1602,67 +1767,9 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 return [initialSeg];
 };
 const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
-const
-const standaloneRules =
-const
-rules.forEach((rule, index) => {
-let isCombinable = true;
-if (rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith)) {
-const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
-if (compiled) {
-fastFuzzyRules.push({
-compiled,
-rule,
-ruleIndex: index
-});
-return;
-}
-}
-if ("regex" in rule && rule.regex) {
-const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
-const hasBackreferences = /\\[1-9]/.test(rule.regex);
-const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
-if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
-}
-if (isCombinable) combinableRules.push({
-index,
-prefix: `r${index}_`,
-rule
-});
-else standaloneRules.push(rule);
-});
-const splitPointsByRule = /* @__PURE__ */ new Map();
-if (fastFuzzyRules.length > 0) {
-let boundaryIdx = 0;
-let currentBoundary = pageMap.boundaries[boundaryIdx];
-const advanceBoundaryTo = (offset) => {
-while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
-boundaryIdx++;
-currentBoundary = pageMap.boundaries[boundaryIdx];
-}
-};
-const recordSplitPoint = (ruleIndex, sp) => {
-if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
-splitPointsByRule.get(ruleIndex).push(sp);
-};
-for (let lineStart = 0; lineStart <= matchContent.length;) {
-advanceBoundaryTo(lineStart);
-const pageId = currentBoundary?.id ?? 0;
-if (lineStart >= matchContent.length) break;
-for (const { compiled, rule, ruleIndex } of fastFuzzyRules) {
-if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
-const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
-if (end === null) continue;
-recordSplitPoint(ruleIndex, {
-index: (rule.split ?? "at") === "at" ? lineStart : end,
-meta: rule.meta
-});
-}
-const nextNl = matchContent.indexOf("\n", lineStart);
-if (nextNl === -1) break;
-lineStart = nextNl + 1;
-}
-}
+const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
+const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
 if (combinableRules.length > 0) {
 const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
 const built = buildRuleRegex(rule, prefix);
@@ -1698,6 +1805,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 const end = m.index + m[0].length;
 const pageId = pageMap.getId(start);
 if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+if (!passesPageStartGuard(rule, originalIndex, start)) continue;
 const sp = {
 capturedContent: void 0,
 contentStartOffset,
@@ -1715,7 +1823,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 }
 const collectSplitPointsFromRule = (rule, ruleIndex) => {
 const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
+const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
 const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
 const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
 return {
@@ -1869,12 +1977,11 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
 * });
 */
 const segmentPages = (pages, options) => {
-const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
-if (!pages.length) return [];
+const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
 const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
 let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
 segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
-if (maxPages
+if (maxPages >= 0 && breakpoints.length) {
 const patternProcessor = (p) => processPattern(p, false).pattern;
 return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
 }
@@ -1949,7 +2056,232 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 
 //#endregion
-//#region src/
+//#region src/analysis.ts
+const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
+const computeSpecificity = (pattern) => {
+const tokenCount = countTokenMarkers(pattern);
+return {
+literalLen: stripWhitespacePlaceholders(pattern).length,
+tokenCount
+};
+};
+const DEFAULT_OPTIONS = {
+includeFirstWordFallback: true,
+lineFilter: void 0,
+maxExamples: 1,
+minCount: 3,
+minLineLength: 6,
+normalizeArabicDiacritics: true,
+prefixChars: 60,
+prefixMatchers: [/^#+/u],
+sortBy: "specificity",
+topK: 40,
+whitespace: "regex"
+};
+const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+const TOKEN_PRIORITY_ORDER$1 = [
+"basmalah",
+"kitab",
+"bab",
+"fasl",
+"naql",
+"rumuz",
+"numbered",
+"raqms",
+"raqm",
+"dash",
+"bullet",
+"tarqim"
+];
+const buildTokenPriority = () => {
+const allTokens = new Set(getAvailableTokens());
+return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
+};
+const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+const compileTokenRegexes = (tokenNames) => {
+const compiled = [];
+for (const token of tokenNames) {
+const pat = TOKEN_PATTERNS[token];
+if (!pat) continue;
+try {
+compiled.push({
+re: new RegExp(pat, "uy"),
+token
+});
+} catch {}
+}
+return compiled;
+};
+const appendWs = (out, mode) => {
+if (!out) return out;
+if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
+return out.endsWith("\\s*") ? out : `${out}\\s*`;
+};
+const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
+let matchedAny = false;
+let currentPos = pos;
+let currentOut = out;
+for (const re of prefixMatchers) {
+if (currentPos >= s.length) break;
+const m = re.exec(s.slice(currentPos));
+if (!m || m.index !== 0 || !m[0]) continue;
+currentOut += escapeRegexLiteral(m[0]);
+currentPos += m[0].length;
+matchedAny = true;
+const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
+if (wsAfter) {
+currentPos += wsAfter[0].length;
+currentOut = appendWs(currentOut, whitespace);
+}
+}
+return {
+matchedAny,
+out: currentOut,
+pos: currentPos
+};
+};
+const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+let best = null;
+for (const { token, re } of compiled) {
+re.lastIndex = pos;
+const m = re.exec(s);
+if (!m || m.index !== pos) continue;
+if (!best || m[0].length > best.text.length) best = {
+text: m[0],
+token
+};
+}
+if (best?.token === "rumuz") {
+const end = pos + best.text.length;
+const next = end < s.length ? s[end] : "";
+if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+}
+return best;
+};
+const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers, whitespace) => {
+const trimmed = collapseWhitespace(line);
+if (!trimmed) return null;
+const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
+let pos = 0;
+let out = "";
+let matchedAny = false;
+let matchedToken = false;
+const compiled = compileTokenRegexes(tokenNames);
+const isArabicLetter = (ch) => /[\u0600-\u06FF]/u.test(ch);
+const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+{
+const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers, whitespace);
+pos = consumed.pos;
+out = consumed.out;
+matchedAny = consumed.matchedAny;
+}
+for (let steps = 0; steps < 6 && pos < s.length; steps++) {
+const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
+if (wsMatch) {
+pos += wsMatch[0].length;
+out = appendWs(out, whitespace);
+continue;
+}
+const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+if (best) {
+if (out && !out.endsWith("\\s*")) {}
+out += `{{${best.token}}}`;
+matchedAny = true;
+matchedToken = true;
+pos += best.text.length;
+continue;
+}
+if (matchedAny) {
+const ch = s[pos];
+if (ch && isCommonDelimiter(ch)) {
+out += escapeRegexLiteral(ch);
+pos += 1;
+continue;
+}
+}
+if (matchedAny) {
+if (includeFirstWordFallback && !matchedToken) {
+const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord$1) break;
+out += escapeRegexLiteral(firstWord$1);
+}
+break;
+}
+if (!includeFirstWordFallback) return null;
+const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord) return null;
+out += escapeRegexLiteral(firstWord);
+return out;
+}
+if (!matchedAny) return null;
+if (whitespace === "regex") while (out.endsWith("\\s*")) out = out.slice(0, -3);
+else while (out.endsWith(" ")) out = out.slice(0, -1);
+return out;
+};
+/**
+* Analyze pages and return the most common line-start patterns (top K).
+*
+* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
+* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
+*/
+const analyzeCommonLineStarts = (pages, options = {}) => {
+const o = {
+...DEFAULT_OPTIONS,
+...options,
+lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
+prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
+whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
+};
+const tokenPriority = buildTokenPriority();
+const counts = /* @__PURE__ */ new Map();
+for (const page of pages) {
+const lines = normalizeLineEndings(page.content ?? "").split("\n");
+for (const line of lines) {
+const trimmed = collapseWhitespace(line);
+if (trimmed.length < o.minLineLength) continue;
+if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
+const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers, o.whitespace);
+if (!sig) continue;
+const existing = counts.get(sig);
+if (!existing) counts.set(sig, {
+count: 1,
+examples: [{
+line: trimmed,
+pageId: page.id
+}]
+});
+else {
+existing.count++;
+if (existing.examples.length < o.maxExamples) existing.examples.push({
+line: trimmed,
+pageId: page.id
+});
+}
+}
+}
+const compareSpecificityThenCount = (a, b) => {
+const sa = computeSpecificity(a.pattern);
+const sb = computeSpecificity(b.pattern);
+if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
+if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
+if (b.count !== a.count) return b.count - a.count;
+return a.pattern.localeCompare(b.pattern);
+};
+const compareCountThenSpecificity = (a, b) => {
+if (b.count !== a.count) return b.count - a.count;
+return compareSpecificityThenCount(a, b);
+};
+return [...counts.entries()].map(([pattern, v]) => ({
+count: v.count,
+examples: v.examples,
+pattern
+})).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+};
+
+//#endregion
+//#region src/detection.ts
 /**
 * Pattern detection utilities for recognizing template tokens in Arabic text.
 * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
@@ -1968,6 +2300,7 @@ const TOKEN_PRIORITY_ORDER = [
 "bab",
 "fasl",
 "naql",
+"rumuz",
 "numbered",
 "raqms",
 "raqm",
@@ -1986,6 +2319,17 @@ const getTokenPriority = () => {
 const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
 return [...prioritized, ...remaining];
 };
+const isRumuzStandalone = (text, startIndex, endIndex) => {
+const before = startIndex > 0 ? text[startIndex - 1] : "";
+const after = endIndex < text.length ? text[endIndex] : "";
+const isWhitespace = (ch) => !!ch && /\s/u.test(ch);
+const isOpenBracket = (ch) => !!ch && /[([{]/u.test(ch);
+const isRightDelimiter = (ch) => !!ch && /[::\-–—ـ،؛.?!؟)\]}]/u.test(ch);
+const isArabicWordy = (ch) => !!ch && /[\u0600-\u06FF]/u.test(ch);
+const leftOk = !before || isWhitespace(before) || isOpenBracket(before) || !isArabicWordy(before);
+const rightOk = !after || isWhitespace(after) || isRightDelimiter(after) || !isArabicWordy(after);
+return leftOk && rightOk;
+};
 /**
 * Analyzes text and returns all detected token patterns with their positions.
 * Patterns are detected in priority order to avoid partial matches.
@@ -2017,6 +2361,7 @@ const detectTokenPatterns = (text) => {
 while ((match = regex.exec(text)) !== null) {
 const startIndex = match.index;
 const endIndex = startIndex + match[0].length;
+if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
 if (isPositionCovered(startIndex, endIndex)) continue;
 results.push({
 endIndex,
@@ -2104,5 +2449,5 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive,
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.mjs.map