flappa-doormal 2.2.3 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -348,25 +348,25 @@ const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalize
348
348
  for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
349
349
  const pageData = normalizedPages.get(pageIds[pi]);
350
350
  if (!pageData) continue;
351
- const trimmed = pageData.content.trimStart();
352
- let found = -1;
353
- for (const len of JOINER_PREFIX_LENGTHS) {
354
- const prefix = trimmed.slice(0, Math.min(len, trimmed.length)).trim();
355
- if (!prefix) continue;
356
- const pos = updated.indexOf(prefix, searchFrom);
357
- if (pos > 0) {
358
- found = pos;
359
- break;
360
- }
361
- }
362
- if (found > 0) {
363
- if (updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
364
- searchFrom = found;
365
- }
351
+ const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
352
+ if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
353
+ if (found > 0) searchFrom = found;
366
354
  }
367
355
  return updated;
368
356
  };
369
357
  /**
358
+ * Finds the position of a page prefix in content, trying multiple prefix lengths.
359
+ */
360
+ const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
361
+ for (const len of JOINER_PREFIX_LENGTHS) {
362
+ const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
363
+ if (!prefix) continue;
364
+ const pos = content.indexOf(prefix, searchFrom);
365
+ if (pos > 0) return pos;
366
+ }
367
+ return -1;
368
+ };
369
+ /**
370
370
  * Estimates how far into the current page `remainingContent` begins.
371
371
  *
372
372
  * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
@@ -390,7 +390,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
390
390
  * This is used to define breakpoint windows in terms of actual content being split, rather than
391
391
  * raw per-page offsets which can desync when structural rules strip markers.
392
392
  */
393
- const findPageStartNearExpectedBoundary = (remainingContent, currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
393
+ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
394
394
  const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
395
395
  if (!targetPageData) return -1;
396
396
  const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -548,6 +548,21 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
548
548
  return selected.index + selected.length;
549
549
  };
550
550
  /**
551
+ * Handles page boundary breakpoint (empty pattern).
552
+ * Returns break position or -1 if no valid position found.
553
+ */
554
+ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
555
+ const nextPageIdx = windowEndIdx + 1;
556
+ if (nextPageIdx <= toIdx) {
557
+ const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
558
+ if (nextPageData) {
559
+ const pos = findNextPagePosition(remainingContent, nextPageData);
560
+ if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
561
+ }
562
+ }
563
+ return Math.min(windowEndPosition, remainingContent.length);
564
+ };
565
+ /**
551
566
  * Tries to find a break position within the current window using breakpoint patterns.
552
567
  * Returns the break position or -1 if no suitable break was found.
553
568
  *
@@ -564,17 +579,7 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
564
579
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
565
580
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
566
581
  if (skipWhenRegex?.test(remainingContent)) continue;
567
- if (regex === null) {
568
- const nextPageIdx = windowEndIdx + 1;
569
- if (nextPageIdx <= toIdx) {
570
- const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
571
- if (nextPageData) {
572
- const pos = findNextPagePosition(remainingContent, nextPageData);
573
- if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
574
- }
575
- }
576
- return Math.min(windowEndPosition, remainingContent.length);
577
- }
582
+ if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
578
583
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
579
584
  if (breakPos > 0) return breakPos;
580
585
  }
@@ -636,7 +641,8 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
636
641
  const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
637
642
  if (nextPageData) {
638
643
  const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
639
- if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
644
+ const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
645
+ if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
640
646
  }
641
647
  }
642
648
  return nextFromIdx;
@@ -726,171 +732,6 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
726
732
  return result;
727
733
  };
728
734
 
729
- //#endregion
730
- //#region src/segmentation/match-utils.ts
731
- /**
732
- * Utility functions for regex matching and result processing.
733
- *
734
- * These functions were extracted from `segmenter.ts` to reduce complexity
735
- * and enable independent testing. They handle match filtering, capture
736
- * extraction, and occurrence-based selection.
737
- *
738
- * @module match-utils
739
- */
740
- /**
741
- * Extracts named capture groups from a regex match.
742
- *
743
- * Only includes groups that are in the `captureNames` list and have
744
- * defined values. This filters out positional captures and ensures
745
- * only explicitly requested named captures are returned.
746
- *
747
- * @param groups - The `match.groups` object from `RegExp.exec()`
748
- * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
749
- * @returns Object with capture name → value pairs, or `undefined` if none found
750
- *
751
- * @example
752
- * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
753
- * extractNamedCaptures(match.groups, ['num'])
754
- * // → { num: '٦٦٩٦' }
755
- *
756
- * @example
757
- * // No matching captures
758
- * extractNamedCaptures({}, ['num'])
759
- * // → undefined
760
- *
761
- * @example
762
- * // Undefined groups
763
- * extractNamedCaptures(undefined, ['num'])
764
- * // → undefined
765
- */
766
- const extractNamedCaptures = (groups, captureNames) => {
767
- if (!groups || captureNames.length === 0) return;
768
- const namedCaptures = {};
769
- for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
770
- return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
771
- };
772
- /**
773
- * Gets the last defined positional capture group from a match array.
774
- *
775
- * Used for `lineStartsAfter` patterns where the content capture (`.*`)
776
- * is always at the end of the pattern. Named captures may shift the
777
- * positional indices, so we iterate backward to find the actual content.
778
- *
779
- * @param match - RegExp exec result array
780
- * @returns The last defined capture group value, or `undefined` if none
781
- *
782
- * @example
783
- * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
784
- * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
785
- * getLastPositionalCapture(match)
786
- * // → 'content'
787
- *
788
- * @example
789
- * // No captures
790
- * getLastPositionalCapture(['full match'])
791
- * // → undefined
792
- */
793
- const getLastPositionalCapture = (match) => {
794
- if (match.length <= 1) return;
795
- for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
796
- };
797
- /**
798
- * Filters matches to only include those within page ID constraints.
799
- *
800
- * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
801
- * matches that occur on pages outside the allowed range or explicitly excluded.
802
- *
803
- * @param matches - Array of match results to filter
804
- * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
805
- * @param getId - Function that returns the page ID for a given offset
806
- * @returns Filtered array containing only matches within constraints
807
- *
808
- * @example
809
- * const matches = [
810
- * { start: 0, end: 10 }, // Page 1
811
- * { start: 100, end: 110 }, // Page 5
812
- * { start: 200, end: 210 }, // Page 10
813
- * ];
814
- * filterByConstraints(matches, { min: 3, max: 8 }, getId)
815
- * // → [{ start: 100, end: 110 }] (only page 5 match)
816
- */
817
- const filterByConstraints = (matches, rule, getId) => {
818
- return matches.filter((m) => {
819
- const id = getId(m.start);
820
- if (rule.min !== void 0 && id < rule.min) return false;
821
- if (rule.max !== void 0 && id > rule.max) return false;
822
- if (isPageExcluded(id, rule.exclude)) return false;
823
- return true;
824
- });
825
- };
826
- /**
827
- * Filters matches based on occurrence setting (first, last, or all).
828
- *
829
- * Applies occurrence-based selection to a list of matches:
830
- * - `'all'` or `undefined`: Return all matches (default)
831
- * - `'first'`: Return only the first match
832
- * - `'last'`: Return only the last match
833
- *
834
- * @param matches - Array of match results to filter
835
- * @param occurrence - Which occurrence(s) to keep
836
- * @returns Filtered array based on occurrence setting
837
- *
838
- * @example
839
- * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
840
- *
841
- * filterByOccurrence(matches, 'first')
842
- * // → [{ start: 0 }]
843
- *
844
- * filterByOccurrence(matches, 'last')
845
- * // → [{ start: 20 }]
846
- *
847
- * filterByOccurrence(matches, 'all')
848
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
849
- *
850
- * filterByOccurrence(matches, undefined)
851
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
852
- */
853
- const filterByOccurrence = (matches, occurrence) => {
854
- if (!matches.length) return [];
855
- if (occurrence === "first") return [matches[0]];
856
- if (occurrence === "last") return [matches[matches.length - 1]];
857
- return matches;
858
- };
859
- /**
860
- * Checks if any rule in the list allows the given page ID.
861
- *
862
- * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
863
- * Rules without constraints allow all page IDs.
864
- *
865
- * This is used to determine whether to create a segment for content
866
- * that appears before any split points (the "first segment").
867
- *
868
- * @param rules - Array of rules with optional `min` and `max` constraints
869
- * @param pageId - Page ID to check
870
- * @returns `true` if at least one rule allows the page ID
871
- *
872
- * @example
873
- * const rules = [
874
- * { min: 5, max: 10 }, // Allows pages 5-10
875
- * { min: 20 }, // Allows pages 20+
876
- * ];
877
- *
878
- * anyRuleAllowsId(rules, 7) // → true (first rule allows)
879
- * anyRuleAllowsId(rules, 3) // → false (no rule allows)
880
- * anyRuleAllowsId(rules, 25) // → true (second rule allows)
881
- *
882
- * @example
883
- * // Rules without constraints allow everything
884
- * anyRuleAllowsId([{}], 999) // → true
885
- */
886
- const anyRuleAllowsId = (rules, pageId) => {
887
- return rules.some((r) => {
888
- const minOk = r.min === void 0 || pageId >= r.min;
889
- const maxOk = r.max === void 0 || pageId <= r.max;
890
- return minOk && maxOk;
891
- });
892
- };
893
-
894
735
  //#endregion
895
736
  //#region src/segmentation/tokens.ts
896
737
  /**
@@ -977,6 +818,7 @@ const BASE_TOKENS = {
977
818
  dash: "[-–—ـ]",
978
819
  fasl: ["مسألة", "فصل"].join("|"),
979
820
  harf: "[أ-ي]",
821
+ harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
980
822
  kitab: "كتاب",
981
823
  naql: [
982
824
  "حدثني",
@@ -1120,7 +962,7 @@ const containsTokens = (query) => {
1120
962
  * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
1121
963
  * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
1122
964
  */
1123
- const expandTokensWithCaptures = (query, fuzzyTransform) => {
965
+ const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
1124
966
  const captureNames = [];
1125
967
  const captureNameCounts = /* @__PURE__ */ new Map();
1126
968
  /**
@@ -1162,16 +1004,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
1162
1004
  const [, tokenName, captureName] = tokenMatch;
1163
1005
  if (!tokenName && captureName) {
1164
1006
  const uniqueName = getUniqueCaptureName(captureName);
1165
- captureNames.push(uniqueName);
1166
- return `(?<${uniqueName}>.+)`;
1007
+ const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
1008
+ captureNames.push(prefixedName);
1009
+ return `(?<${prefixedName}>.+)`;
1167
1010
  }
1168
1011
  let tokenPattern = TOKEN_PATTERNS[tokenName];
1169
1012
  if (!tokenPattern) return segment.value;
1170
1013
  if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
1171
1014
  if (captureName) {
1172
1015
  const uniqueName = getUniqueCaptureName(captureName);
1173
- captureNames.push(uniqueName);
1174
- return `(?<${uniqueName}>${tokenPattern})`;
1016
+ const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
1017
+ captureNames.push(prefixedName);
1018
+ return `(?<${prefixedName}>${tokenPattern})`;
1175
1019
  }
1176
1020
  return tokenPattern;
1177
1021
  });
@@ -1259,6 +1103,224 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
1259
1103
  */
1260
1104
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
1261
1105
 
1106
+ //#endregion
1107
+ //#region src/segmentation/fast-fuzzy-prefix.ts
1108
+ /**
1109
+ * Fast-path fuzzy prefix matching for common Arabic line-start markers.
1110
+ *
1111
+ * This exists to avoid running expensive fuzzy-expanded regex alternations over
1112
+ * a giant concatenated string. Instead, we match only at known line-start
1113
+ * offsets and perform a small deterministic comparison:
1114
+ * - Skip Arabic diacritics in the CONTENT
1115
+ * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
1116
+ *
1117
+ * This module is intentionally conservative: it only supports "literal"
1118
+ * token patterns (plain text alternation via `|`), not general regex.
1119
+ */
1120
+ const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
1121
+ const equivKey = (ch) => {
1122
+ switch (ch) {
1123
+ case "آ":
1124
+ case "أ":
1125
+ case "إ": return "ا";
1126
+ case "ه": return "ة";
1127
+ case "ي": return "ى";
1128
+ default: return ch;
1129
+ }
1130
+ };
1131
+ /**
1132
+ * Match a fuzzy literal prefix at a given offset.
1133
+ *
1134
+ * - Skips diacritics in the content
1135
+ * - Applies equivalence groups on both content and literal
1136
+ *
1137
+ * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
1138
+ */
1139
+ const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
1140
+ let i = offset;
1141
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
1142
+ for (let j = 0; j < literal.length; j++) {
1143
+ const litCh = literal[j];
1144
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
1145
+ if (i >= content.length) return null;
1146
+ const cCh = content[i];
1147
+ if (equivKey(cCh) !== equivKey(litCh)) return null;
1148
+ i++;
1149
+ }
1150
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
1151
+ return i;
1152
+ };
1153
+ const isLiteralOnly = (s) => {
1154
+ return !/[\\[\]{}()^$.*+?]/.test(s);
1155
+ };
1156
+ const compileLiteralAlternation = (pattern) => {
1157
+ if (!pattern) return null;
1158
+ if (!isLiteralOnly(pattern)) return null;
1159
+ const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
1160
+ if (!alternatives.length) return null;
1161
+ return { alternatives };
1162
+ };
1163
+ /**
1164
+ * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
1165
+ * Returns null if not eligible.
1166
+ */
1167
+ const compileFastFuzzyTokenRule = (tokenTemplate) => {
1168
+ const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
1169
+ if (!m) return null;
1170
+ const token = m[1];
1171
+ const tokenPattern = getTokenPattern(token);
1172
+ if (!tokenPattern) return null;
1173
+ const compiled = compileLiteralAlternation(tokenPattern);
1174
+ if (!compiled) return null;
1175
+ return {
1176
+ alternatives: compiled.alternatives,
1177
+ token
1178
+ };
1179
+ };
1180
+ /**
1181
+ * Try matching any alternative for a compiled token at a line-start offset.
1182
+ * Returns endOffset (exclusive) on match, else null.
1183
+ */
1184
+ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
1185
+ for (const alt of compiled.alternatives) {
1186
+ const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
1187
+ if (end !== null) return end;
1188
+ }
1189
+ return null;
1190
+ };
1191
+
1192
+ //#endregion
1193
+ //#region src/segmentation/match-utils.ts
1194
+ /**
1195
+ * Utility functions for regex matching and result processing.
1196
+ *
1197
+ * These functions were extracted from `segmenter.ts` to reduce complexity
1198
+ * and enable independent testing. They handle match filtering, capture
1199
+ * extraction, and occurrence-based selection.
1200
+ *
1201
+ * @module match-utils
1202
+ */
1203
+ /**
1204
+ * Extracts named capture groups from a regex match.
1205
+ *
1206
+ * Only includes groups that are in the `captureNames` list and have
1207
+ * defined values. This filters out positional captures and ensures
1208
+ * only explicitly requested named captures are returned.
1209
+ *
1210
+ * @param groups - The `match.groups` object from `RegExp.exec()`
1211
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
1212
+ * @returns Object with capture name → value pairs, or `undefined` if none found
1213
+ *
1214
+ * @example
1215
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
1216
+ * extractNamedCaptures(match.groups, ['num'])
1217
+ * // → { num: '٦٦٩٦' }
1218
+ *
1219
+ * @example
1220
+ * // No matching captures
1221
+ * extractNamedCaptures({}, ['num'])
1222
+ * // → undefined
1223
+ *
1224
+ * @example
1225
+ * // Undefined groups
1226
+ * extractNamedCaptures(undefined, ['num'])
1227
+ * // → undefined
1228
+ */
1229
+ const extractNamedCaptures = (groups, captureNames) => {
1230
+ if (!groups || captureNames.length === 0) return;
1231
+ const namedCaptures = {};
1232
+ for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
1233
+ return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
1234
+ };
1235
+ /**
1236
+ * Gets the last defined positional capture group from a match array.
1237
+ *
1238
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
1239
+ * is always at the end of the pattern. Named captures may shift the
1240
+ * positional indices, so we iterate backward to find the actual content.
1241
+ *
1242
+ * @param match - RegExp exec result array
1243
+ * @returns The last defined capture group value, or `undefined` if none
1244
+ *
1245
+ * @example
1246
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
1247
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
1248
+ * getLastPositionalCapture(match)
1249
+ * // → 'content'
1250
+ *
1251
+ * @example
1252
+ * // No captures
1253
+ * getLastPositionalCapture(['full match'])
1254
+ * // → undefined
1255
+ */
1256
+ const getLastPositionalCapture = (match) => {
1257
+ if (match.length <= 1) return;
1258
+ for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
1259
+ };
1260
+ /**
1261
+ * Filters matches to only include those within page ID constraints.
1262
+ *
1263
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
1264
+ * matches that occur on pages outside the allowed range or explicitly excluded.
1265
+ *
1266
+ * @param matches - Array of match results to filter
1267
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
1268
+ * @param getId - Function that returns the page ID for a given offset
1269
+ * @returns Filtered array containing only matches within constraints
1270
+ *
1271
+ * @example
1272
+ * const matches = [
1273
+ * { start: 0, end: 10 }, // Page 1
1274
+ * { start: 100, end: 110 }, // Page 5
1275
+ * { start: 200, end: 210 }, // Page 10
1276
+ * ];
1277
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
1278
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
1279
+ */
1280
+ const filterByConstraints = (matches, rule, getId) => {
1281
+ return matches.filter((m) => {
1282
+ const id = getId(m.start);
1283
+ if (rule.min !== void 0 && id < rule.min) return false;
1284
+ if (rule.max !== void 0 && id > rule.max) return false;
1285
+ if (isPageExcluded(id, rule.exclude)) return false;
1286
+ return true;
1287
+ });
1288
+ };
1289
+ /**
1290
+ * Checks if any rule in the list allows the given page ID.
1291
+ *
1292
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
1293
+ * Rules without constraints allow all page IDs.
1294
+ *
1295
+ * This is used to determine whether to create a segment for content
1296
+ * that appears before any split points (the "first segment").
1297
+ *
1298
+ * @param rules - Array of rules with optional `min` and `max` constraints
1299
+ * @param pageId - Page ID to check
1300
+ * @returns `true` if at least one rule allows the page ID
1301
+ *
1302
+ * @example
1303
+ * const rules = [
1304
+ * { min: 5, max: 10 }, // Allows pages 5-10
1305
+ * { min: 20 }, // Allows pages 20+
1306
+ * ];
1307
+ *
1308
+ * anyRuleAllowsId(rules, 7) // → true (first rule allows)
1309
+ * anyRuleAllowsId(rules, 3) // → false (no rule allows)
1310
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
1311
+ *
1312
+ * @example
1313
+ * // Rules without constraints allow everything
1314
+ * anyRuleAllowsId([{}], 999) // → true
1315
+ */
1316
+ const anyRuleAllowsId = (rules, pageId) => {
1317
+ return rules.some((r) => {
1318
+ const minOk = r.min === void 0 || pageId >= r.min;
1319
+ const maxOk = r.max === void 0 || pageId <= r.max;
1320
+ return minOk && maxOk;
1321
+ });
1322
+ };
1323
+
1262
1324
  //#endregion
1263
1325
  //#region src/segmentation/rule-regex.ts
1264
1326
  /**
@@ -1282,6 +1344,21 @@ const hasCapturingGroup = (pattern) => {
1282
1344
  return /\((?!\?)/.test(pattern);
1283
1345
  };
1284
1346
  /**
1347
+ * Extracts named capture group names from a regex pattern.
1348
+ *
1349
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
1350
+ *
1351
+ * @example
1352
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
1353
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
1354
+ * extractNamedCaptureNames('^\\d+') // []
1355
+ */
1356
+ const extractNamedCaptureNames = (pattern) => {
1357
+ const names = [];
1358
+ for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
1359
+ return names;
1360
+ };
1361
+ /**
1285
1362
  * Safely compiles a regex pattern, throwing a helpful error if invalid.
1286
1363
  */
1287
1364
  const compileRuleRegex = (pattern) => {
@@ -1297,56 +1374,59 @@ const compileRuleRegex = (pattern) => {
1297
1374
  *
1298
1375
  * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
1299
1376
  */
1300
- const processPattern = (pattern, fuzzy) => {
1301
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
1377
+ const processPattern = (pattern, fuzzy, capturePrefix) => {
1378
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
1302
1379
  return {
1303
1380
  captureNames,
1304
1381
  pattern: expanded
1305
1382
  };
1306
1383
  };
1307
- const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
1308
- const processed = patterns.map((p) => processPattern(p, fuzzy));
1384
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
1385
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1309
1386
  const union = processed.map((p) => p.pattern).join("|");
1387
+ const captureNames = processed.flatMap((p) => p.captureNames);
1388
+ const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
1389
+ if (capturePrefix) captureNames.push(`${capturePrefix}content`);
1310
1390
  return {
1311
- captureNames: processed.flatMap((p) => p.captureNames),
1312
- regex: `^(?:${union})(.*)`
1391
+ captureNames,
1392
+ regex: `^(?:${union})${contentCapture}`
1313
1393
  };
1314
1394
  };
1315
- const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
1316
- const processed = patterns.map((p) => processPattern(p, fuzzy));
1395
+ const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1396
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1317
1397
  const union = processed.map((p) => p.pattern).join("|");
1318
1398
  return {
1319
1399
  captureNames: processed.flatMap((p) => p.captureNames),
1320
1400
  regex: `^(?:${union})`
1321
1401
  };
1322
1402
  };
1323
- const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
1324
- const processed = patterns.map((p) => processPattern(p, fuzzy));
1403
+ const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1404
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1325
1405
  const union = processed.map((p) => p.pattern).join("|");
1326
1406
  return {
1327
1407
  captureNames: processed.flatMap((p) => p.captureNames),
1328
1408
  regex: `(?:${union})$`
1329
1409
  };
1330
1410
  };
1331
- const buildTemplateRegexSource = (template) => {
1332
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
1411
+ const buildTemplateRegexSource = (template, capturePrefix) => {
1412
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
1333
1413
  return {
1334
1414
  captureNames,
1335
1415
  regex: pattern
1336
1416
  };
1337
1417
  };
1338
- const determineUsesCapture = (regexSource, captureNames) => hasCapturingGroup(regexSource) || captureNames.length > 0;
1418
+ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(regexSource);
1339
1419
  /**
1340
1420
  * Builds a compiled regex and metadata from a split rule.
1341
1421
  *
1342
1422
  * Behavior mirrors the previous implementation in `segmenter.ts`.
1343
1423
  */
1344
- const buildRuleRegex = (rule) => {
1424
+ const buildRuleRegex = (rule, capturePrefix) => {
1345
1425
  const s = { ...rule };
1346
1426
  const fuzzy = rule.fuzzy ?? false;
1347
1427
  let allCaptureNames = [];
1348
1428
  if (s.lineStartsAfter?.length) {
1349
- const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
1429
+ const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
1350
1430
  allCaptureNames = captureNames;
1351
1431
  return {
1352
1432
  captureNames: allCaptureNames,
@@ -1356,21 +1436,22 @@ const buildRuleRegex = (rule) => {
1356
1436
  };
1357
1437
  }
1358
1438
  if (s.lineStartsWith?.length) {
1359
- const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
1439
+ const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
1360
1440
  s.regex = regex;
1361
1441
  allCaptureNames = captureNames;
1362
1442
  }
1363
1443
  if (s.lineEndsWith?.length) {
1364
- const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
1444
+ const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
1365
1445
  s.regex = regex;
1366
1446
  allCaptureNames = captureNames;
1367
1447
  }
1368
1448
  if (s.template) {
1369
- const { regex, captureNames } = buildTemplateRegexSource(s.template);
1449
+ const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
1370
1450
  s.regex = regex;
1371
1451
  allCaptureNames = [...allCaptureNames, ...captureNames];
1372
1452
  }
1373
1453
  if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
1454
+ if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(s.regex);
1374
1455
  const usesCapture = determineUsesCapture(s.regex, allCaptureNames);
1375
1456
  return {
1376
1457
  captureNames: allCaptureNames,
@@ -1521,9 +1602,120 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
1521
1602
  return [initialSeg];
1522
1603
  };
1523
1604
  const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
1524
- const collectSplitPointsFromRule = (rule) => {
1605
// Sort every rule into one of three evaluation strategies. Each entry keeps the
// rule's position in `rules` (where the bucket stores one) so results can be
// re-associated with the originating rule later.
//   - fastFuzzyRules:  fuzzy `lineStartsWith` rules with exactly one token for
//                      which compileFastFuzzyTokenRule returns a truthy value.
//   - standaloneRules: raw `regex` rules whose source has named captures,
//                      numeric backreferences (\1-\9) or anonymous capturing
//                      groups (presumably because those would clash or be
//                      renumbered inside a combined alternation - confirm).
//   - combinableRules: everything else, tagged with the prefix `r<index>_`.
const combinableRules = [];
const standaloneRules = [];
const fastFuzzyRules = [];
rules.forEach((rule, ruleIndex) => {
  const isFuzzyLineStart = rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith);
  if (isFuzzyLineStart && rule.lineStartsWith.length === 1) {
    const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
    if (compiled) {
      fastFuzzyRules.push({
        compiled,
        rule,
        ruleIndex
      });
      return;
    }
  }

  // Only raw `regex` rules are inspected; all other rule kinds stay combinable.
  let needsOwnRegex = false;
  if ("regex" in rule && rule.regex) {
    const namedCaptureCount = extractNamedCaptureNames(rule.regex).length;
    const referencesGroup = /\\[1-9]/.test(rule.regex);
    const capturesAnonymously = hasCapturingGroup(rule.regex);
    needsOwnRegex = namedCaptureCount > 0 || referencesGroup || capturesAnonymously;
  }

  if (needsOwnRegex) {
    standaloneRules.push(rule);
    return;
  }
  combinableRules.push({
    index: ruleIndex,
    prefix: `r${ruleIndex}_`,
    rule
  });
});
1634
// Split points collected so far, keyed by the rule's index in `rules`.
const splitPointsByRule = /* @__PURE__ */ new Map();
if (fastFuzzyRules.length > 0) {
  // Cursor into pageMap.boundaries; it only ever moves forward because line
  // starts are visited in increasing order.
  let pageCursor = 0;
  let activePage = pageMap.boundaries[pageCursor];
  let lineStart = 0;
  while (lineStart < matchContent.length) {
    // Advance to the boundary containing this line start (never past the last one).
    while (activePage && lineStart > activePage.end && pageCursor < pageMap.boundaries.length - 1) {
      pageCursor += 1;
      activePage = pageMap.boundaries[pageCursor];
    }
    const pageId = activePage?.id ?? 0;

    for (const entry of fastFuzzyRules) {
      const { compiled, rule, ruleIndex } = entry;
      // Page constraints: inclusive min/max range, then the exclusion list.
      if (rule.min !== void 0 && !(pageId >= rule.min)) continue;
      if (rule.max !== void 0 && !(pageId <= rule.max)) continue;
      if (isPageExcluded(pageId, rule.exclude)) continue;

      const matchEnd = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
      if (matchEnd === null) continue;

      let bucket = splitPointsByRule.get(ruleIndex);
      if (!bucket) {
        bucket = [];
        splitPointsByRule.set(ruleIndex, bucket);
      }
      // split "at" (the default) cuts at the line start, otherwise at the match end.
      bucket.push({
        index: (rule.split ?? "at") === "at" ? lineStart : matchEnd,
        meta: rule.meta
      });
    }

    // Jump to the character after the next newline; stop when there is none.
    const newlineAt = matchContent.indexOf("\n", lineStart);
    if (newlineAt === -1) break;
    lineStart = newlineAt + 1;
  }
}
1666
// Evaluate all combinable rules with ONE regex: each rule's source is wrapped in
// a named group equal to its prefix (`r<index>_`) and the groups are joined with
// `|`. Whichever prefix group is defined on a match identifies the rule.
if (combinableRules.length > 0) {
  const compiledRules = combinableRules.map(({ rule, prefix }) => {
    const built = buildRuleRegex(rule, prefix);
    return {
      prefix,
      source: `(?<${prefix}>${built.regex.source})`,
      ...built
    };
  });
  const alternation = compiledRules.map(({ source }) => source).join("|");
  // NOTE(review): flags are fixed to "gm" regardless of the flags on each
  // built.regex - confirm no rule source relies on another flag (e.g. "u").
  const combinedRegex = new RegExp(alternation, "gm");

  // matchAll steps past zero-length matches on its own.
  for (const match of matchContent.matchAll(combinedRegex)) {
    const slot = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
    if (slot === -1) continue;

    const { rule, prefix, index: originalIndex } = combinableRules[slot];
    const ruleInfo = compiledRules[slot];

    // Report captures under their original names by stripping the rule prefix.
    const namedCaptures = {};
    if (match.groups) {
      for (const prefixedName of ruleInfo.captureNames) {
        const value = match.groups[prefixedName];
        if (value === void 0) continue;
        namedCaptures[prefixedName.slice(prefix.length)] = value;
      }
    }

    // For lineStartsAfter rules the content begins after the marker: offset =
    // matched length minus the length of the `<prefix>content` capture.
    let contentStartOffset;
    if (ruleInfo.usesLineStartsAfter) {
      const capturedContent = match.groups?.[`${prefix}content`];
      if (capturedContent !== void 0) {
        contentStartOffset = (match.groups?.[prefix] || match[0]).length - capturedContent.length;
      }
    }

    const start = match.index;
    const end = start + match[0].length;
    const pageId = pageMap.getId(start);

    // Page constraints: inclusive min/max range, then the exclusion list.
    if (rule.min !== void 0 && !(pageId >= rule.min)) continue;
    if (rule.max !== void 0 && !(pageId <= rule.max)) continue;
    if (isPageExcluded(pageId, rule.exclude)) continue;

    const hasNamedCaptures = Object.keys(namedCaptures).length > 0;
    const splitPoint = {
      capturedContent: void 0,
      contentStartOffset,
      index: (rule.split ?? "at") === "at" ? start : end,
      meta: rule.meta,
      namedCaptures: hasNamedCaptures ? namedCaptures : void 0
    };

    let bucket = splitPointsByRule.get(originalIndex);
    if (!bucket) {
      bucket = [];
      splitPointsByRule.set(originalIndex, bucket);
    }
    bucket.push(splitPoint);
  }
}
1716
+ const collectSplitPointsFromRule = (rule, ruleIndex) => {
1525
1717
  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
1526
- return filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence).map((m) => {
1718
+ const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
1527
1719
  const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
1528
1720
  const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
1529
1721
  return {
@@ -1534,8 +1726,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
1534
1726
  namedCaptures: m.namedCaptures
1535
1727
  };
1536
1728
  });
1729
+ if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
1730
+ splitPointsByRule.get(ruleIndex).push(...points);
1537
1731
  };
1538
- return rules.flatMap(collectSplitPointsFromRule);
1732
// Run every standalone rule under the index of the slot it occupies in `rules`.
// Walking `rules` once (rather than calling rules.indexOf for each standalone
// rule) avoids a linear scan per rule, and keeps a separate bucket - and thus a
// separate `occurrence` filter - for each slot when the same rule object is
// listed more than once.
const standaloneRuleSet = new Set(standaloneRules);
rules.forEach((rule, index) => {
  if (standaloneRuleSet.has(rule)) collectSplitPointsFromRule(rule, index);
});

// Emit the collected split points grouped by rule, in the order the rules were
// given, applying each rule's `occurrence` filter ("first" / "last" / all).
const finalSplitPoints = [];
rules.forEach((rule, index) => {
  const points = splitPointsByRule.get(index);
  if (!points || points.length === 0) return;
  if (rule.occurrence === "first") {
    finalSplitPoints.push(points[0]);
    return;
  }
  if (rule.occurrence === "last") {
    finalSplitPoints.push(points[points.length - 1]);
    return;
  }
  // Append one at a time: spreading a very large array into push() passes every
  // element as a call argument and can exceed the engine's argument limit.
  for (const point of points) finalSplitPoints.push(point);
});
1744
+ return finalSplitPoints;
1539
1745
  };
1540
1746
  /**
1541
1747
  * Executes a regex against content and extracts match results with capture information.