flappa-doormal 2.3.1 → 2.5.0

This diff shows the changes between publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
package/dist/index.mjs CHANGED
@@ -662,12 +662,24 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  break;
  }
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
- if (computeRemainingSpan(currentFromIdx, toIdx, pageIds) <= maxPages && !remainingHasExclusions) {
+ const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+ if (remainingSpan <= maxPages && !remainingHasExclusions) {
  const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
  if (finalSeg) result.push(finalSeg);
  break;
  }
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+ logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
+ currentFromIdx,
+ currentFromPageId: pageIds[currentFromIdx],
+ remainingContentStart: remainingContent.slice(0, 50),
+ remainingContentLength: remainingContent.length,
+ remainingSpan,
+ toIdx,
+ toPageId: pageIds[toIdx],
+ windowEndIdx,
+ windowEndPageId: pageIds[windowEndIdx]
+ });
  const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
  const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
  let breakPosition = -1;
@@ -680,16 +692,35 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  });
  if (breakPosition <= 0) breakPosition = windowEndPosition;
  const pieceContent = remainingContent.slice(0, breakPosition).trim();
+ logger?.debug?.("[breakpoints] selectedBreak", {
+ breakPosition,
+ pieceContentEnd: pieceContent.slice(-50),
+ pieceContentLength: pieceContent.length,
+ windowEndPosition
+ });
  const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
  if (pieceContent) {
  const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
  if (pieceSeg) result.push(pieceSeg);
  }
  remainingContent = remainingContent.slice(breakPosition).trim();
- if (!remainingContent) break;
+ logger?.debug?.("[breakpoints] afterSlice", {
+ actualEndIdx,
+ remainingContentLength: remainingContent.length,
+ remainingContentStart: remainingContent.slice(0, 60)
+ });
+ if (!remainingContent) {
+ logger?.debug?.("[breakpoints] done: no remaining content");
+ break;
+ }
  currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+ logger?.debug?.("[breakpoints] nextIteration", {
+ currentFromIdx,
+ currentFromPageId: pageIds[currentFromIdx]
+ });
  isFirstPiece = false;
  }
+ logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
  return result;
  };
  /**
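The `[breakpoints]` tracing added above fires through optional chaining (`logger?.debug?.(…)`), so it is a no-op unless the caller passes a logger exposing a `debug` method. A minimal sketch of enabling it, assuming the `segmentPages` options destructured later in this diff (the pages and rule here are illustrative, not from the package):

```js
import { segmentPages } from 'flappa-doormal';

// Illustrative two-page input; pages carry `content` and `id`.
const pages = [
  { content: 'باب الطهارة\nنص الصفحة الأولى', id: 1 },
  { content: 'نص الصفحة الثانية', id: 2 },
];

// `console` satisfies the logger shape: each oversized-segment iteration
// then prints its window indices, page IDs, and a content preview.
const segments = segmentPages(pages, {
  breakpoints: ['{{tarqim}}'], // hypothetical breakpoint pattern
  logger: console,
  maxPages: 1,
  rules: [{ lineStartsWith: ['{{bab}}'] }],
});
```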
@@ -708,6 +739,14 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  maxPages,
  segmentCount: segments.length
  });
+ logger?.debug?.("[breakpoints] inputSegments", {
+ segmentCount: segments.length,
+ segments: segments.map((s) => ({
+ contentLength: s.content.length,
+ from: s.from,
+ to: s.to
+ }))
+ });
  for (const segment of segments) {
  const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
  const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
@@ -732,6 +771,138 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  return result;
  };
 
+ //#endregion
+ //#region src/segmentation/match-utils.ts
+ /**
+ * Utility functions for regex matching and result processing.
+ *
+ * These functions were extracted from `segmenter.ts` to reduce complexity
+ * and enable independent testing. They handle match filtering, capture
+ * extraction, and occurrence-based selection.
+ *
+ * @module match-utils
+ */
+ /**
+ * Extracts named capture groups from a regex match.
+ *
+ * Only includes groups that are in the `captureNames` list and have
+ * defined values. This filters out positional captures and ensures
+ * only explicitly requested named captures are returned.
+ *
+ * @param groups - The `match.groups` object from `RegExp.exec()`
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+ * @returns Object with capture name → value pairs, or `undefined` if none found
+ *
+ * @example
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+ * extractNamedCaptures(match.groups, ['num'])
+ * // → { num: '٦٦٩٦' }
+ *
+ * @example
+ * // No matching captures
+ * extractNamedCaptures({}, ['num'])
+ * // → undefined
+ *
+ * @example
+ * // Undefined groups
+ * extractNamedCaptures(undefined, ['num'])
+ * // → undefined
+ */
+ const extractNamedCaptures = (groups, captureNames) => {
+ if (!groups || captureNames.length === 0) return;
+ const namedCaptures = {};
+ for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+ return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+ };
+ /**
+ * Gets the last defined positional capture group from a match array.
+ *
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
+ * is always at the end of the pattern. Named captures may shift the
+ * positional indices, so we iterate backward to find the actual content.
+ *
+ * @param match - RegExp exec result array
+ * @returns The last defined capture group value, or `undefined` if none
+ *
+ * @example
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+ * getLastPositionalCapture(match)
+ * // → 'content'
+ *
+ * @example
+ * // No captures
+ * getLastPositionalCapture(['full match'])
+ * // → undefined
+ */
+ const getLastPositionalCapture = (match) => {
+ if (match.length <= 1) return;
+ for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+ };
+ /**
+ * Filters matches to only include those within page ID constraints.
+ *
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+ * matches that occur on pages outside the allowed range or explicitly excluded.
+ *
+ * @param matches - Array of match results to filter
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+ * @param getId - Function that returns the page ID for a given offset
+ * @returns Filtered array containing only matches within constraints
+ *
+ * @example
+ * const matches = [
+ * { start: 0, end: 10 }, // Page 1
+ * { start: 100, end: 110 }, // Page 5
+ * { start: 200, end: 210 }, // Page 10
+ * ];
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
+ */
+ const filterByConstraints = (matches, rule, getId) => {
+ return matches.filter((m) => {
+ const id = getId(m.start);
+ if (rule.min !== void 0 && id < rule.min) return false;
+ if (rule.max !== void 0 && id > rule.max) return false;
+ if (isPageExcluded(id, rule.exclude)) return false;
+ return true;
+ });
+ };
+ /**
+ * Checks if any rule in the list allows the given page ID.
+ *
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+ * Rules without constraints allow all page IDs.
+ *
+ * This is used to determine whether to create a segment for content
+ * that appears before any split points (the "first segment").
+ *
+ * @param rules - Array of rules with optional `min` and `max` constraints
+ * @param pageId - Page ID to check
+ * @returns `true` if at least one rule allows the page ID
+ *
+ * @example
+ * const rules = [
+ * { min: 5, max: 10 }, // Allows pages 5-10
+ * { min: 20 }, // Allows pages 20+
+ * ];
+ *
+ * anyRuleAllowsId(rules, 7) // → true (first rule allows)
+ * anyRuleAllowsId(rules, 3) // → false (no rule allows)
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
+ *
+ * @example
+ * // Rules without constraints allow everything
+ * anyRuleAllowsId([{}], 999) // → true
+ */
+ const anyRuleAllowsId = (rules, pageId) => {
+ return rules.some((r) => {
+ const minOk = r.min === void 0 || pageId >= r.min;
+ const maxOk = r.max === void 0 || pageId <= r.max;
+ return minOk && maxOk;
+ });
+ };
+
  //#endregion
  //#region src/segmentation/tokens.ts
  /**
@@ -798,19 +969,13 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  * // → '{{harf}}' (unchanged - no brackets outside tokens)
  */
  const escapeTemplateBrackets = (pattern) => {
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (match, token, bracket) => {
+ return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
  if (token) return token;
  return `\\${bracket}`;
  });
  };
- /**
- * Base token definitions mapping human-readable token names to regex patterns.
- *
- * These tokens contain raw regex patterns and do not reference other tokens.
- * For composite tokens that build on these, see `COMPOSITE_TOKENS`.
- *
- * @internal
- */
+ const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصسدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
+ const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
  const BASE_TOKENS = {
  bab: "باب",
  basmalah: ["بسم الله", "﷽"].join("|"),
@@ -818,7 +983,7 @@ const BASE_TOKENS = {
  dash: "[-–—ـ]",
  fasl: ["مسألة", "فصل"].join("|"),
  harf: "[أ-ي]",
- harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
+ harfs: "[أ-ي](?:\\s+[أ-ي])*",
  kitab: "كتاب",
  naql: [
  "حدثني",
@@ -831,6 +996,7 @@ const BASE_TOKENS = {
  ].join("|"),
  raqm: "[\\u0660-\\u0669]",
  raqms: "[\\u0660-\\u0669]+",
+ rumuz: RUMUZ_BLOCK,
  tarqim: "[.!?؟؛]"
  };
  /**
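The new `rumuz` token is `RUMUZ_BLOCK` from the earlier hunk: one siglum (a two-letter code such as خت, a single siglum letter, or a ٤ that is not part of a longer Arabic-Indic numeral) optionally repeated with whitespace in between. A standalone sketch using the exact pattern strings defined above:

```js
const RUMUZ_ATOM = '(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصسدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))';
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
const rumuz = new RegExp(`^${RUMUZ_BLOCK}$`, 'u');

rumuz.test('خت');      // true — two-letter siglum
rumuz.test('خت قد م'); // true — whitespace-separated chain of sigla
rumuz.test('٤');       // true — a lone ٤ counts as a siglum
rumuz.test('٤٤');      // false — the lookarounds exclude ٤ inside a numeral
```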
@@ -1090,236 +1256,18 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  /**
  * Gets the regex pattern for a specific token name.
  *
- * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
- * without any expansion or capture group wrapping.
- *
- * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
- * @returns The regex pattern string, or `undefined` if token doesn't exist
- *
- * @example
- * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
- * getTokenPattern('dash') // → '[-–—ـ]'
- * getTokenPattern('unknown') // → undefined
- */
- const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
-
- //#endregion
- //#region src/segmentation/fast-fuzzy-prefix.ts
- /**
- * Fast-path fuzzy prefix matching for common Arabic line-start markers.
- *
- * This exists to avoid running expensive fuzzy-expanded regex alternations over
- * a giant concatenated string. Instead, we match only at known line-start
- * offsets and perform a small deterministic comparison:
- * - Skip Arabic diacritics in the CONTENT
- * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
- *
- * This module is intentionally conservative: it only supports "literal"
- * token patterns (plain text alternation via `|`), not general regex.
- */
- const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
- const equivKey = (ch) => {
- switch (ch) {
- case "آ":
- case "أ":
- case "إ": return "ا";
- case "ه": return "ة";
- case "ي": return "ى";
- default: return ch;
- }
- };
- /**
- * Match a fuzzy literal prefix at a given offset.
- *
- * - Skips diacritics in the content
- * - Applies equivalence groups on both content and literal
- *
- * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
- */
- const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
- let i = offset;
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
- for (let j = 0; j < literal.length; j++) {
- const litCh = literal[j];
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
- if (i >= content.length) return null;
- const cCh = content[i];
- if (equivKey(cCh) !== equivKey(litCh)) return null;
- i++;
- }
- while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
- return i;
- };
- const isLiteralOnly = (s) => {
- return !/[\\[\]{}()^$.*+?]/.test(s);
- };
- const compileLiteralAlternation = (pattern) => {
- if (!pattern) return null;
- if (!isLiteralOnly(pattern)) return null;
- const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
- if (!alternatives.length) return null;
- return { alternatives };
- };
- /**
- * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
- * Returns null if not eligible.
- */
- const compileFastFuzzyTokenRule = (tokenTemplate) => {
- const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
- if (!m) return null;
- const token = m[1];
- const tokenPattern = getTokenPattern(token);
- if (!tokenPattern) return null;
- const compiled = compileLiteralAlternation(tokenPattern);
- if (!compiled) return null;
- return {
- alternatives: compiled.alternatives,
- token
- };
- };
- /**
- * Try matching any alternative for a compiled token at a line-start offset.
- * Returns endOffset (exclusive) on match, else null.
- */
- const matchFastFuzzyTokenAt = (content, offset, compiled) => {
- for (const alt of compiled.alternatives) {
- const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
- if (end !== null) return end;
- }
- return null;
- };
-
- //#endregion
- //#region src/segmentation/match-utils.ts
- /**
- * Utility functions for regex matching and result processing.
- *
- * These functions were extracted from `segmenter.ts` to reduce complexity
- * and enable independent testing. They handle match filtering, capture
- * extraction, and occurrence-based selection.
- *
- * @module match-utils
- */
- /**
- * Extracts named capture groups from a regex match.
- *
- * Only includes groups that are in the `captureNames` list and have
- * defined values. This filters out positional captures and ensures
- * only explicitly requested named captures are returned.
- *
- * @param groups - The `match.groups` object from `RegExp.exec()`
- * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
- * @returns Object with capture name → value pairs, or `undefined` if none found
- *
- * @example
- * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
- * extractNamedCaptures(match.groups, ['num'])
- * // → { num: '٦٦٩٦' }
- *
- * @example
- * // No matching captures
- * extractNamedCaptures({}, ['num'])
- * // → undefined
- *
- * @example
- * // Undefined groups
- * extractNamedCaptures(undefined, ['num'])
- * // → undefined
- */
- const extractNamedCaptures = (groups, captureNames) => {
- if (!groups || captureNames.length === 0) return;
- const namedCaptures = {};
- for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
- return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
- };
- /**
- * Gets the last defined positional capture group from a match array.
- *
- * Used for `lineStartsAfter` patterns where the content capture (`.*`)
- * is always at the end of the pattern. Named captures may shift the
- * positional indices, so we iterate backward to find the actual content.
- *
- * @param match - RegExp exec result array
- * @returns The last defined capture group value, or `undefined` if none
- *
- * @example
- * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
- * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
- * getLastPositionalCapture(match)
- * // → 'content'
- *
- * @example
- * // No captures
- * getLastPositionalCapture(['full match'])
- * // → undefined
- */
- const getLastPositionalCapture = (match) => {
- if (match.length <= 1) return;
- for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
- };
- /**
- * Filters matches to only include those within page ID constraints.
- *
- * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
- * matches that occur on pages outside the allowed range or explicitly excluded.
- *
- * @param matches - Array of match results to filter
- * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
- * @param getId - Function that returns the page ID for a given offset
- * @returns Filtered array containing only matches within constraints
- *
- * @example
- * const matches = [
- * { start: 0, end: 10 }, // Page 1
- * { start: 100, end: 110 }, // Page 5
- * { start: 200, end: 210 }, // Page 10
- * ];
- * filterByConstraints(matches, { min: 3, max: 8 }, getId)
- * // → [{ start: 100, end: 110 }] (only page 5 match)
- */
- const filterByConstraints = (matches, rule, getId) => {
- return matches.filter((m) => {
- const id = getId(m.start);
- if (rule.min !== void 0 && id < rule.min) return false;
- if (rule.max !== void 0 && id > rule.max) return false;
- if (isPageExcluded(id, rule.exclude)) return false;
- return true;
- });
- };
- /**
- * Checks if any rule in the list allows the given page ID.
- *
- * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
- * Rules without constraints allow all page IDs.
- *
- * This is used to determine whether to create a segment for content
- * that appears before any split points (the "first segment").
- *
- * @param rules - Array of rules with optional `min` and `max` constraints
- * @param pageId - Page ID to check
- * @returns `true` if at least one rule allows the page ID
- *
- * @example
- * const rules = [
- * { min: 5, max: 10 }, // Allows pages 5-10
- * { min: 20 }, // Allows pages 20+
- * ];
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+ * without any expansion or capture group wrapping.
  *
- * anyRuleAllowsId(rules, 7) // true (first rule allows)
- * anyRuleAllowsId(rules, 3) // false (no rule allows)
- * anyRuleAllowsId(rules, 25) // → true (second rule allows)
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
  *
  * @example
- * // Rules without constraints allow everything
- * anyRuleAllowsId([{}], 999) // → true
+ * getTokenPattern('raqms') // '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash') // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
  */
- const anyRuleAllowsId = (rules, pageId) => {
- return rules.some((r) => {
- const minOk = r.min === void 0 || pageId >= r.min;
- const maxOk = r.max === void 0 || pageId <= r.max;
- return minOk && maxOk;
- });
- };
+ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 
  //#endregion
  //#region src/segmentation/rule-regex.ts
@@ -1462,16 +1410,231 @@ const buildRuleRegex = (rule, capturePrefix) => {
  };
 
  //#endregion
- //#region src/segmentation/textUtils.ts
+ //#region src/segmentation/fast-fuzzy-prefix.ts
+ /**
+ * Fast-path fuzzy prefix matching for common Arabic line-start markers.
+ *
+ * This exists to avoid running expensive fuzzy-expanded regex alternations over
+ * a giant concatenated string. Instead, we match only at known line-start
+ * offsets and perform a small deterministic comparison:
+ * - Skip Arabic diacritics in the CONTENT
+ * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+ *
+ * This module is intentionally conservative: it only supports "literal"
+ * token patterns (plain text alternation via `|`), not general regex.
+ */
+ const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+ const equivKey = (ch) => {
+ switch (ch) {
+ case "آ":
+ case "أ":
+ case "إ": return "ا";
+ case "ه": return "ة";
+ case "ي": return "ى";
+ default: return ch;
+ }
+ };
  /**
- * Strip all HTML tags from content, keeping only text.
+ * Match a fuzzy literal prefix at a given offset.
+ *
+ * - Skips diacritics in the content
+ * - Applies equivalence groups on both content and literal
  *
- * @param html - HTML content
- * @returns Plain text content
+ * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
+ */
+ const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+ let i = offset;
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ for (let j = 0; j < literal.length; j++) {
+ const litCh = literal[j];
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ if (i >= content.length) return null;
+ const cCh = content[i];
+ if (equivKey(cCh) !== equivKey(litCh)) return null;
+ i++;
+ }
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ return i;
+ };
+ const isLiteralOnly = (s) => {
+ return !/[\\[\]{}()^$.*+?]/.test(s);
+ };
+ const compileLiteralAlternation = (pattern) => {
+ if (!pattern) return null;
+ if (!isLiteralOnly(pattern)) return null;
+ const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+ if (!alternatives.length) return null;
+ return { alternatives };
+ };
+ /**
+ * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+ * Returns null if not eligible.
+ */
+ const compileFastFuzzyTokenRule = (tokenTemplate) => {
+ const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+ if (!m) return null;
+ const token = m[1];
+ const tokenPattern = getTokenPattern(token);
+ if (!tokenPattern) return null;
+ const compiled = compileLiteralAlternation(tokenPattern);
+ if (!compiled) return null;
+ return {
+ alternatives: compiled.alternatives,
+ token
+ };
+ };
+ /**
+ * Try matching any alternative for a compiled token at a line-start offset.
+ * Returns endOffset (exclusive) on match, else null.
  */
- const stripHtmlTags = (html) => {
- return html.replace(/<[^>]*>/g, "");
+ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+ for (const alt of compiled.alternatives) {
+ const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+ if (end !== null) return end;
+ }
+ return null;
+ };
+
+ //#endregion
+ //#region src/segmentation/segmenter-rule-utils.ts
+ const partitionRulesForMatching = (rules) => {
+ const combinableRules = [];
+ const standaloneRules = [];
+ const fastFuzzyRules = [];
+ rules.forEach((rule, index) => {
+ if (rule.fuzzy && "lineStartsWith" in rule) {
+ const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+ if (compiled) {
+ fastFuzzyRules.push({
+ compiled,
+ kind: "startsWith",
+ rule,
+ ruleIndex: index
+ });
+ return;
+ }
+ }
+ if (rule.fuzzy && "lineStartsAfter" in rule) {
+ const compiled = rule.lineStartsAfter.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsAfter[0]) : null;
+ if (compiled) {
+ fastFuzzyRules.push({
+ compiled,
+ kind: "startsAfter",
+ rule,
+ ruleIndex: index
+ });
+ return;
+ }
+ }
+ let isCombinable = true;
+ if ("regex" in rule && rule.regex) {
+ const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+ const hasBackreferences = /\\[1-9]/.test(rule.regex);
+ const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+ if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+ }
+ if (isCombinable) combinableRules.push({
+ index,
+ prefix: `r${index}_`,
+ rule
+ });
+ else standaloneRules.push(rule);
+ });
+ return {
+ combinableRules,
+ fastFuzzyRules,
+ standaloneRules
+ };
+ };
+ const createPageStartGuardChecker = (matchContent, pageMap) => {
+ const pageStartToBoundaryIndex = /* @__PURE__ */ new Map();
+ for (let i = 0; i < pageMap.boundaries.length; i++) pageStartToBoundaryIndex.set(pageMap.boundaries[i].start, i);
+ const compiledPageStartPrev = /* @__PURE__ */ new Map();
+ const getPageStartPrevRegex = (rule, ruleIndex) => {
+ if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
+ const pattern = rule.pageStartGuard;
+ if (!pattern) {
+ compiledPageStartPrev.set(ruleIndex, null);
+ return null;
+ }
+ const expanded = processPattern(pattern, false).pattern;
+ const re = new RegExp(`(?:${expanded})$`, "u");
+ compiledPageStartPrev.set(ruleIndex, re);
+ return re;
+ };
+ const getPrevPageLastNonWsChar = (boundaryIndex) => {
+ if (boundaryIndex <= 0) return "";
+ const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+ for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
+ const ch = matchContent[i];
+ if (!ch) continue;
+ if (/\s/u.test(ch)) continue;
+ return ch;
+ }
+ return "";
+ };
+ return (rule, ruleIndex, matchStart) => {
+ const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
+ if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
+ const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+ if (!prevReq) return true;
+ const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+ if (!lastChar) return false;
+ return prevReq.test(lastChar);
+ };
+ };
+ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
+ const splitPointsByRule = /* @__PURE__ */ new Map();
+ if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
+ let boundaryIdx = 0;
+ let currentBoundary = pageMap.boundaries[boundaryIdx];
+ const advanceBoundaryTo = (offset) => {
+ while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+ boundaryIdx++;
+ currentBoundary = pageMap.boundaries[boundaryIdx];
+ }
+ };
+ const recordSplitPoint = (ruleIndex, sp) => {
+ const arr = splitPointsByRule.get(ruleIndex);
+ if (!arr) {
+ splitPointsByRule.set(ruleIndex, [sp]);
+ return;
+ }
+ arr.push(sp);
+ };
+ const isPageStart = (offset) => offset === currentBoundary?.start;
+ for (let lineStart = 0; lineStart <= matchContent.length;) {
+ advanceBoundaryTo(lineStart);
+ const pageId = currentBoundary?.id ?? 0;
+ if (lineStart >= matchContent.length) break;
+ for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+ if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+ if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+ const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+ if (end === null) continue;
+ const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+ if (kind === "startsWith") recordSplitPoint(ruleIndex, {
+ index: splitIndex,
+ meta: rule.meta
+ });
+ else {
+ const markerLength = end - lineStart;
+ recordSplitPoint(ruleIndex, {
+ contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+ index: splitIndex,
+ meta: rule.meta
+ });
+ }
+ }
+ const nextNl = matchContent.indexOf("\n", lineStart);
+ if (nextNl === -1) break;
+ lineStart = nextNl + 1;
+ }
+ return splitPointsByRule;
  };
+
+ //#endregion
+ //#region src/segmentation/textUtils.ts
  /**
  * Normalizes line endings to Unix-style (`\n`).
  *
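Reading `createPageStartGuardChecker` above: a split candidate that begins exactly at a page start survives only if the rule has no `pageStartGuard`, or if the previous page's last non-whitespace character matches the guard pattern (expanded and compiled as `(?:…)$`). A hedged sketch of a rule using it — the rule shape is inferred from this bundle, not from documented API:

```js
import { segmentPages } from 'flappa-doormal';

// Page 1 ends mid-sentence (no {{tarqim}} punctuation), so the "باب" line
// that opens page 2 is treated as a continuation and NOT taken as a split
// point; had page 1 ended with '.', '؟', etc., the split would be kept.
const pages = [
  { content: 'نص ينتهي بلا ترقيم', id: 1 },
  { content: 'باب الصلاة\nنص آخر', id: 2 },
];

const segments = segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{bab}}'], pageStartGuard: '{{tarqim}}' }],
});
```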
@@ -1481,7 +1644,9 @@ const stripHtmlTags = (html) => {
  * @param content - Raw content with potentially mixed line endings
  * @returns Content with all line endings normalized to `\n`
  */
- const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+ const normalizeLineEndings = (content) => {
+ return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+ };
 
  //#endregion
  //#region src/segmentation/segmenter.ts
@@ -1602,67 +1767,9 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
  return [initialSeg];
  };
  const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
- const combinableRules = [];
- const standaloneRules = [];
- const fastFuzzyRules = [];
- rules.forEach((rule, index) => {
- let isCombinable = true;
- if (rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith)) {
- const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
- if (compiled) {
- fastFuzzyRules.push({
- compiled,
- rule,
- ruleIndex: index
- });
- return;
- }
- }
- if ("regex" in rule && rule.regex) {
- const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
- const hasBackreferences = /\\[1-9]/.test(rule.regex);
- const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
- if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
- }
- if (isCombinable) combinableRules.push({
- index,
- prefix: `r${index}_`,
- rule
- });
- else standaloneRules.push(rule);
- });
- const splitPointsByRule = /* @__PURE__ */ new Map();
- if (fastFuzzyRules.length > 0) {
- let boundaryIdx = 0;
- let currentBoundary = pageMap.boundaries[boundaryIdx];
- const advanceBoundaryTo = (offset) => {
- while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
- boundaryIdx++;
- currentBoundary = pageMap.boundaries[boundaryIdx];
- }
- };
- const recordSplitPoint = (ruleIndex, sp) => {
- if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
- splitPointsByRule.get(ruleIndex).push(sp);
- };
- for (let lineStart = 0; lineStart <= matchContent.length;) {
- advanceBoundaryTo(lineStart);
- const pageId = currentBoundary?.id ?? 0;
- if (lineStart >= matchContent.length) break;
- for (const { compiled, rule, ruleIndex } of fastFuzzyRules) {
- if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
- const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
- if (end === null) continue;
- recordSplitPoint(ruleIndex, {
- index: (rule.split ?? "at") === "at" ? lineStart : end,
- meta: rule.meta
- });
- }
- const nextNl = matchContent.indexOf("\n", lineStart);
- if (nextNl === -1) break;
- lineStart = nextNl + 1;
- }
- }
+ const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
+ const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+ const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
  if (combinableRules.length > 0) {
  const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
  const built = buildRuleRegex(rule, prefix);
@@ -1698,6 +1805,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
  const end = m.index + m[0].length;
  const pageId = pageMap.getId(start);
  if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+ if (!passesPageStartGuard(rule, originalIndex, start)) continue;
  const sp = {
  capturedContent: void 0,
  contentStartOffset,
@@ -1715,7 +1823,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
  }
  const collectSplitPointsFromRule = (rule, ruleIndex) => {
  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
+ const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
  const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
  const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
  return {
@@ -1869,12 +1977,11 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  * });
  */
  const segmentPages = (pages, options) => {
- const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
- if (!pages.length) return [];
+ const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
  let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
  segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
- if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) {
+ if (maxPages >= 0 && breakpoints.length) {
  const patternProcessor = (p) => processPattern(p, false).pattern;
  return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
  }
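With the new defaults above (`maxPages = 0`, `breakpoints = []`), the guard `maxPages >= 0 && breakpoints.length` passes whenever a non-empty `breakpoints` array is supplied, even if the caller never sets `maxPages`. A hedged sketch (breakpoint syntax assumed to accept the same `{{token}}` templates used elsewhere in this file):

```js
import { segmentPages } from 'flappa-doormal';

const pages = [
  { content: 'فصل أول. نص طويل يمتد\nعلى أسطر عدة.', id: 1 },
  { content: 'تتمة النص. فصل ثان.', id: 2 },
];

// `maxPages` is omitted: it now defaults to 0, so the breakpoint pass runs
// simply because a non-empty `breakpoints` array was provided.
const segments = segmentPages(pages, {
  breakpoints: ['{{tarqim}}'], // hypothetical: prefer sentence-final breaks
  rules: [{ lineStartsWith: ['{{fasl}}'] }],
});
```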
@@ -1949,7 +2056,232 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };
 
  //#endregion
- //#region src/pattern-detection.ts
+ //#region src/analysis.ts
+ const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+ const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
+ const computeSpecificity = (pattern) => {
+ const tokenCount = countTokenMarkers(pattern);
+ return {
+ literalLen: stripWhitespacePlaceholders(pattern).length,
+ tokenCount
+ };
+ };
+ const DEFAULT_OPTIONS = {
+ includeFirstWordFallback: true,
+ lineFilter: void 0,
+ maxExamples: 1,
+ minCount: 3,
+ minLineLength: 6,
+ normalizeArabicDiacritics: true,
+ prefixChars: 60,
+ prefixMatchers: [/^#+/u],
+ sortBy: "specificity",
+ topK: 40,
+ whitespace: "regex"
+ };
+ const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+ const TOKEN_PRIORITY_ORDER$1 = [
+ "basmalah",
+ "kitab",
+ "bab",
+ "fasl",
+ "naql",
+ "rumuz",
+ "numbered",
+ "raqms",
+ "raqm",
+ "dash",
+ "bullet",
+ "tarqim"
+ ];
+ const buildTokenPriority = () => {
+ const allTokens = new Set(getAvailableTokens());
+ return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
+ };
+ const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
+ const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+ const compileTokenRegexes = (tokenNames) => {
+ const compiled = [];
+ for (const token of tokenNames) {
+ const pat = TOKEN_PATTERNS[token];
+ if (!pat) continue;
+ try {
+ compiled.push({
+ re: new RegExp(pat, "uy"),
+ token
+ });
+ } catch {}
+ }
+ return compiled;
+ };
+ const appendWs = (out, mode) => {
+ if (!out) return out;
+ if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
+ return out.endsWith("\\s*") ? out : `${out}\\s*`;
+ };
+ const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
+ let matchedAny = false;
+ let currentPos = pos;
+ let currentOut = out;
+ for (const re of prefixMatchers) {
+ if (currentPos >= s.length) break;
+ const m = re.exec(s.slice(currentPos));
+ if (!m || m.index !== 0 || !m[0]) continue;
+ currentOut += escapeRegexLiteral(m[0]);
+ currentPos += m[0].length;
+ matchedAny = true;
+ const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
+ if (wsAfter) {
+ currentPos += wsAfter[0].length;
+ currentOut = appendWs(currentOut, whitespace);
+ }
+ }
+ return {
+ matchedAny,
+ out: currentOut,
+ pos: currentPos
+ };
+ };
+ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+ let best = null;
+ for (const { token, re } of compiled) {
+ re.lastIndex = pos;
+ const m = re.exec(s);
+ if (!m || m.index !== pos) continue;
+ if (!best || m[0].length > best.text.length) best = {
+ text: m[0],
+ token
+ };
+ }
+ if (best?.token === "rumuz") {
+ const end = pos + best.text.length;
+ const next = end < s.length ? s[end] : "";
+ if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+ }
+ return best;
+ };
+ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers, whitespace) => {
+ const trimmed = collapseWhitespace(line);
+ if (!trimmed) return null;
+ const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
+ let pos = 0;
+ let out = "";
+ let matchedAny = false;
+ let matchedToken = false;
+ const compiled = compileTokenRegexes(tokenNames);
+ const isArabicLetter = (ch) => /[\u0600-\u06FF]/u.test(ch);
+ const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+ {
+ const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers, whitespace);
+ pos = consumed.pos;
+ out = consumed.out;
+ matchedAny = consumed.matchedAny;
+ }
+ for (let steps = 0; steps < 6 && pos < s.length; steps++) {
+ const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
+ if (wsMatch) {
+ pos += wsMatch[0].length;
+ out = appendWs(out, whitespace);
+ continue;
+ }
+ const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+ if (best) {
+ if (out && !out.endsWith("\\s*")) {}
+ out += `{{${best.token}}}`;
+ matchedAny = true;
+ matchedToken = true;
+ pos += best.text.length;
+ continue;
+ }
+ if (matchedAny) {
+ const ch = s[pos];
+ if (ch && isCommonDelimiter(ch)) {
+ out += escapeRegexLiteral(ch);
+ pos += 1;
+ continue;
+ }
+ }
+ if (matchedAny) {
+ if (includeFirstWordFallback && !matchedToken) {
+ const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+ if (!firstWord$1) break;
+ out += escapeRegexLiteral(firstWord$1);
+ }
+ break;
+ }
+ if (!includeFirstWordFallback) return null;
+ const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+ if (!firstWord) return null;
+ out += escapeRegexLiteral(firstWord);
+ return out;
+ }
+ if (!matchedAny) return null;
+ if (whitespace === "regex") while (out.endsWith("\\s*")) out = out.slice(0, -3);
+ else while (out.endsWith(" ")) out = out.slice(0, -1);
+ return out;
+ };
+ /**
+ * Analyze pages and return the most common line-start patterns (top K).
+ *
+ * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
+ * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
+ */
+ const analyzeCommonLineStarts = (pages, options = {}) => {
+ const o = {
+ ...DEFAULT_OPTIONS,
+ ...options,
+ lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
+ prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
+ whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
+ };
+ const tokenPriority = buildTokenPriority();
+ const counts = /* @__PURE__ */ new Map();
+ for (const page of pages) {
+ const lines = normalizeLineEndings(page.content ?? "").split("\n");
+ for (const line of lines) {
+ const trimmed = collapseWhitespace(line);
+ if (trimmed.length < o.minLineLength) continue;
+ if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
+ const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers, o.whitespace);
+ if (!sig) continue;
+ const existing = counts.get(sig);
+ if (!existing) counts.set(sig, {
+ count: 1,
+ examples: [{
+ line: trimmed,
+ pageId: page.id
+ }]
+ });
+ else {
+ existing.count++;
+ if (existing.examples.length < o.maxExamples) existing.examples.push({
+ line: trimmed,
+ pageId: page.id
+ });
+ }
+ }
+ }
+ const compareSpecificityThenCount = (a, b) => {
+ const sa = computeSpecificity(a.pattern);
+ const sb = computeSpecificity(b.pattern);
+ if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
+ if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
+ if (b.count !== a.count) return b.count - a.count;
+ return a.pattern.localeCompare(b.pattern);
+ };
+ const compareCountThenSpecificity = (a, b) => {
+ if (b.count !== a.count) return b.count - a.count;
+ return compareSpecificityThenCount(a, b);
+ };
+ return [...counts.entries()].map(([pattern, v]) => ({
+ count: v.count,
+ examples: v.examples,
+ pattern
+ })).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+ };
+
+ //#endregion
+ //#region src/detection.ts
  /**
  * Pattern detection utilities for recognizing template tokens in Arabic text.
  * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
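`analyzeCommonLineStarts`, newly exported from the region above, scans every line of every page, tokenizes the line prefix into a template signature, and returns the top-K signatures with counts and sample lines. A usage sketch with the option names from `DEFAULT_OPTIONS`; the exact signature strings depend on which tokens win during tokenization:

```js
import { analyzeCommonLineStarts } from 'flappa-doormal';

const pages = [
  { content: 'باب ١\nنص', id: 1 },
  { content: 'باب ٢\nنص آخر', id: 2 },
  { content: 'باب ٣\nخاتمة', id: 3 },
];

const patterns = analyzeCommonLineStarts(pages, {
  maxExamples: 2,
  minCount: 3,      // default; a signature must recur to be reported
  minLineLength: 3, // lowered so the short demo lines qualify
  sortBy: 'count',
  topK: 10,
});
// Expected shape of an entry (assuming {{bab}} and {{raqms}} win the prefix):
// { count: 3, examples: [{ line: 'باب ١', pageId: 1 }, …],
//   pattern: '{{bab}}\\s*{{raqms}}' }
```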
@@ -1968,6 +2300,7 @@ const TOKEN_PRIORITY_ORDER = [
  "bab",
  "fasl",
  "naql",
+ "rumuz",
  "numbered",
  "raqms",
  "raqm",
@@ -1986,6 +2319,17 @@ const getTokenPriority = () => {
  const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
  return [...prioritized, ...remaining];
  };
+ const isRumuzStandalone = (text, startIndex, endIndex) => {
+ const before = startIndex > 0 ? text[startIndex - 1] : "";
+ const after = endIndex < text.length ? text[endIndex] : "";
+ const isWhitespace = (ch) => !!ch && /\s/u.test(ch);
+ const isOpenBracket = (ch) => !!ch && /[([{]/u.test(ch);
+ const isRightDelimiter = (ch) => !!ch && /[::\-–—ـ،؛.?!؟)\]}]/u.test(ch);
+ const isArabicWordy = (ch) => !!ch && /[\u0600-\u06FF]/u.test(ch);
+ const leftOk = !before || isWhitespace(before) || isOpenBracket(before) || !isArabicWordy(before);
+ const rightOk = !after || isWhitespace(after) || isRightDelimiter(after) || !isArabicWordy(after);
+ return leftOk && rightOk;
+ };
  /**
  * Analyzes text and returns all detected token patterns with their positions.
  * Patterns are detected in priority order to avoid partial matches.
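`isRumuzStandalone` above keeps a `rumuz` candidate only when it is not glued to Arabic word characters: whitespace, an opening bracket, or any non-Arabic character may sit on its left, and whitespace, a delimiter, or any non-Arabic character on its right. A small sketch of the effect via `detectTokenPatterns` (other tokens may also be reported; only the rumuz behavior is noted):

```js
import { detectTokenPatterns } from 'flappa-doormal';

// 'خت' wrapped in brackets: '(' is an opening bracket and ')' a right
// delimiter, so the rumuz detection is kept.
const bracketed = detectTokenPatterns('(خت)');

// The same two letters inside the word 'مختصر' have Arabic word characters
// on both sides, so the rumuz candidate is skipped by isRumuzStandalone.
const insideWord = detectTokenPatterns('مختصر');
```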
@@ -2017,6 +2361,7 @@ const detectTokenPatterns = (text) => {
  while ((match = regex.exec(text)) !== null) {
  const startIndex = match.index;
  const endIndex = startIndex + match[0].length;
+ if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
  if (isPositionCovered(startIndex, endIndex)) continue;
  results.push({
  endIndex,
@@ -2104,5 +2449,5 @@ const analyzeTextForRule = (text) => {
  };
 
  //#endregion
- export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
  //# sourceMappingURL=index.mjs.map