flappa-doormal 2.17.1 → 2.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,141 +1,25 @@
1
- //#region src/utils/textUtils.ts
2
- /**
3
- * Normalizes line endings to Unix-style (`\n`).
4
- *
5
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
6
- * for consistent pattern matching across platforms.
7
- *
8
- * @param content - Raw content with potentially mixed line endings
9
- * @returns Content with all line endings normalized to `\n`
10
- */
11
- const normalizeLineEndings = (content) => {
12
- return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
13
- };
14
- /**
15
- * Escapes regex metacharacters (parentheses and brackets) in template patterns,
16
- * but preserves content inside `{{...}}` token delimiters.
17
- *
18
- * This allows users to write intuitive patterns like `({{harf}}):` instead of
19
- * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
20
- * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
21
- *
22
- * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
23
- * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
24
- *
25
- * @example
26
- * escapeTemplateBrackets('({{harf}}): ')
27
- * // → '\\({{harf}}\\): '
28
- *
29
- * @example
30
- * escapeTemplateBrackets('[{{raqm}}] ')
31
- * // → '\\[{{raqm}}\\] '
32
- *
33
- * @example
34
- * escapeTemplateBrackets('{{harf}}')
35
- * // → '{{harf}}' (unchanged - no brackets outside tokens)
36
- */
37
- const escapeTemplateBrackets = (pattern) => {
38
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
39
- };
1
+ //#region src/segmentation/tokens.ts
40
2
  /**
41
- * Character class matching all Arabic diacritics (Tashkeel/Harakat).
3
+ * Arabic base letters used by low-level dictionary-style regex helpers.
42
4
  *
43
- * Includes the following diacritical marks:
44
- * - U+064B: ً (fathatan - double fatha)
45
- * - U+064C: ٌ (dammatan - double damma)
46
- * - U+064D: ٍ (kasratan - double kasra)
47
- * - U+064E: َ (fatha - short a)
48
- * - U+064F: ُ (damma - short u)
49
- * - U+0650: ِ (kasra - short i)
50
- * - U+0651: ّ (shadda - gemination)
51
- * - U+0652: ْ (sukun - no vowel)
52
- *
53
- * @internal
5
+ * This is intentionally broader than `{{harf}}`:
6
+ * - includes standalone hamza `ء`
7
+ * - stays as a raw regex fragment rather than a template token
54
8
  */
55
- const DIACRITICS_CLASS = "[ًٌٍَُِّْ]";
9
+ const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
56
10
  /**
57
- * Groups of equivalent Arabic characters.
58
- *
59
- * Characters within the same group are considered equivalent for matching purposes.
60
- * This handles common variations in Arabic text where different characters are
61
- * used interchangeably or have the same underlying meaning.
62
- *
63
- * Equivalence groups:
64
- * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
65
- * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
66
- * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
67
- *
68
- * @internal
11
+ * Arabic combining marks / annotation signs used by low-level regex helpers.
69
12
  */
70
- const EQUIV_GROUPS = [
71
- [
72
- "ا",
73
- "آ",
74
- "أ",
75
- "إ"
76
- ],
77
- ["ة", "ه"],
78
- ["ى", "ي"]
79
- ];
13
+ const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
80
14
  /**
81
- * Escapes a string for safe inclusion in a regular expression.
82
- *
83
- * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
84
- *
85
- * @param s - Any string to escape
86
- * @returns String with regex metacharacters escaped
87
- *
88
- * @example
89
- * escapeRegex('hello.world') // → 'hello\\.world'
90
- * escapeRegex('[test]') // → '\\[test\\]'
91
- * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
15
+ * A single Arabic base letter followed by zero or more combining marks.
92
16
  */
93
- const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
94
- const getEquivClass = (ch) => {
95
- const group = EQUIV_GROUPS.find((g) => g.includes(ch));
96
- return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
97
- };
98
- const normalizeArabicLight = (str) => {
99
- return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
100
- };
101
- const makeDiacriticInsensitive = (text) => {
102
- const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
103
- return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
104
- };
105
- const isCombiningMarkOrSelector = (char) => {
106
- if (!char) return false;
107
- return /\p{M}/u.test(char) || char === "︎" || char === "️";
108
- };
109
- const isJoiner = (char) => char === "‌" || char === "‍";
17
+ const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
110
18
  /**
111
- * Ensures the position does not split a grapheme cluster (surrogate pairs,
112
- * combining marks, or zero-width joiners / variation selectors).
113
- *
114
- * This is only used as a last-resort fallback when we are forced to split
115
- * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
19
+ * One or more Arabic letters, where each letter may carry combining marks.
116
20
  */
117
- const adjustForUnicodeBoundary = (content, position) => {
118
- let adjusted = position;
119
- while (adjusted > 0) {
120
- const high = content.charCodeAt(adjusted - 1);
121
- const low = content.charCodeAt(adjusted);
122
- if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
123
- adjusted -= 1;
124
- continue;
125
- }
126
- const nextChar = content[adjusted];
127
- const prevChar = content[adjusted - 1];
128
- if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
129
- adjusted -= 1;
130
- continue;
131
- }
132
- break;
133
- }
134
- return adjusted;
135
- };
136
-
137
- //#endregion
138
- //#region src/segmentation/tokens.ts
21
+ const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
22
+ const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
139
23
  const RUMUZ_ATOM = `(?:${[
140
24
  "تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
141
25
  "خت",
@@ -166,15 +50,25 @@ const RUMUZ_ATOM = `(?:${[
166
50
  ].join("|")})`;
167
51
  const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
168
52
  const BASE_TOKENS = {
53
+ /** Chapter marker (باب). */
169
54
  bab: "باب",
55
+ /** Basmala (بسم الله). Also matches ﷽. */
170
56
  basmalah: ["بسم الله", "﷽"].join("|"),
57
+ /** Bullet point variants: `•`, `*`, `°`. */
171
58
  bullet: "[•*°]",
59
+ /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
172
60
  dash: "[-–—ـ]",
61
+ /** Section marker (فصل / مسألة). */
173
62
  fasl: ["مسألة", "فصل"].join("|"),
63
+ /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
174
64
  harf: "[أ-ي]",
175
- harfs: "[أ-ي](?:\\s+[أ-ي])*",
65
+ /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
66
+ harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
67
+ /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
176
68
  hr: "[-–—ـ_=]{5,}",
69
+ /** Book marker (كتاب). */
177
70
  kitab: "كتاب",
71
+ /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
178
72
  naql: [
179
73
  "حدثني",
180
74
  "وأخبرنا",
@@ -186,33 +80,58 @@ const BASE_TOKENS = {
186
80
  "وحدثني",
187
81
  "وحدثنيه"
188
82
  ].join("|"),
83
+ /** Newline character. Useful for breakpoints that split on line boundaries. */
189
84
  newline: "\\n",
85
+ /** Single ASCII digit (0-9). */
190
86
  num: "\\d",
87
+ /** One or more ASCII digits (0-9)+. */
191
88
  nums: "\\d+",
89
+ /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
192
90
  raqm: "[\\u0660-\\u0669]",
91
+ /** One or more Arabic-Indic digits (٠-٩)+. */
193
92
  raqms: "[\\u0660-\\u0669]+",
93
+ /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
194
94
  rumuz: RUMUZ_BLOCK,
95
+ /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
195
96
  tarqim: "[.!?؟؛]"
196
97
  };
197
98
  /** Pre-defined token constants for use in patterns. */
198
99
  const Token = {
100
+ /** Chapter marker - باب */
199
101
  BAB: "{{bab}}",
102
+ /** Basmala - بسم الله */
200
103
  BASMALAH: "{{basmalah}}",
104
+ /** Bullet point variants */
201
105
  BULLET: "{{bullet}}",
106
+ /** Dash variants (hyphen, en-dash, em-dash, tatweel) */
202
107
  DASH: "{{dash}}",
108
+ /** Section marker - فصل / مسألة */
203
109
  FASL: "{{fasl}}",
110
+ /** Single Arabic letter */
204
111
  HARF: "{{harf}}",
112
+ /** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
205
113
  HARFS: "{{harfs}}",
114
+ /** Horizontal rule / separator (repeated dashes) */
206
115
  HR: "{{hr}}",
116
+ /** Book marker - كتاب */
207
117
  KITAB: "{{kitab}}",
118
+ /** Hadith transmission phrases */
208
119
  NAQL: "{{naql}}",
120
+ /** Newline character (for breakpoints) */
209
121
  NEWLINE: "{{newline}}",
122
+ /** Single ASCII digit */
210
123
  NUM: "{{num}}",
124
+ /** Composite: {{raqms}} {{dash}} (space) */
211
125
  NUMBERED: "{{numbered}}",
126
+ /** One or more ASCII digits */
212
127
  NUMS: "{{nums}}",
128
+ /** Single Arabic-Indic digit */
213
129
  RAQM: "{{raqm}}",
130
+ /** One or more Arabic-Indic digits */
214
131
  RAQMS: "{{raqms}}",
132
+ /** Source abbreviations (rijāl/takhrīj) */
215
133
  RUMUZ: "{{rumuz}}",
134
+ /** Punctuation marks */
216
135
  TARQIM: "{{tarqim}}"
217
136
  };
218
137
  /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
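The `Token` constants are just the `{{...}}` placeholder strings for the base tokens above, and `withCapture` wraps one in a named capture. A small sketch of how a split rule might reference them (the import names are assumed from this module's exports; the rule shape follows the `lineStartsWith` handling later in this file):

import { Token, withCapture } from 'flappa-doormal'; // assumed export names

Token.NUMBERED;                  // '{{numbered}}', pre-expanded to '{{raqms}} {{dash}} '
withCapture(Token.RAQMS, 'num'); // '{{raqms:num}}' — captures the matched digits under "num"

// Hypothetical rule using the tokens as line-start patterns:
const rule = { lineStartsWith: [`${withCapture(Token.RAQMS, 'num')} {{dash}} `, `${Token.BAB} `] };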
@@ -222,7 +141,9 @@ const withCapture = (token, name) => {
222
141
  return `{{${match[1]}:${name}}}`;
223
142
  };
224
143
  /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
225
- const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
144
+ const COMPOSITE_TOKENS = {
145
+ /** Common hadith numbering format: Arabic-Indic digits + dash + space. */
146
+ numbered: "{{raqms}} {{dash}} " };
226
147
  /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
227
148
  const expandCompositeTokensInTemplate = (template) => {
228
149
  let out = template;
@@ -473,11 +394,11 @@ const templateToRegex = (template) => {
473
394
  * Useful for documentation, validation, or building user interfaces
474
395
  * that show available tokens.
475
396
  *
476
- * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
397
+ * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
477
398
  *
478
399
  * @example
479
400
  * getAvailableTokens()
480
- * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
401
+ * // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
481
402
  */
482
403
  const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
483
404
  /**
@@ -486,13 +407,13 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
486
407
  * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
487
408
  * without any expansion or capture group wrapping.
488
409
  *
489
- * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
490
- * @returns The regex pattern string, or `undefined` if token doesn't exist
410
+ * @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
411
+ * @returns The regex pattern string for that known token
491
412
  *
492
413
  * @example
493
414
  * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
494
415
  * getTokenPattern('dash') // → '[-–—ـ]'
495
- * getTokenPattern('unknown') // → undefined
416
+ * getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
496
417
  */
497
418
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
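A quick usage sketch of the two lookup helpers, mirroring the JSDoc examples above (import path assumed):

import { getAvailableTokens, getTokenPattern } from 'flappa-doormal';

getAvailableTokens();     // ['bab', 'basmalah', 'bullet', 'dash', ...]
getTokenPattern('raqms'); // '[\\u0660-\\u0669]+'
getTokenPattern('dash');  // '[-–—ـ]'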
498
419
  /**
@@ -571,7 +492,161 @@ const applyTokenMappings = (template, mappings) => {
571
492
  const stripTokenMappings = (template) => {
572
493
  return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
573
494
  };
574
-
495
+ //#endregion
496
+ //#region src/utils/textUtils.ts
497
+ /**
498
+ * Normalizes line endings to Unix-style (`\n`).
499
+ *
500
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
501
+ * for consistent pattern matching across platforms.
502
+ *
503
+ * @param content - Raw content with potentially mixed line endings
504
+ * @returns Content with all line endings normalized to `\n`
505
+ */
506
+ const normalizeLineEndings = (content) => {
507
+ return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
508
+ };
509
+ /**
510
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
511
+ * but preserves content inside `{{...}}` token delimiters.
512
+ *
513
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
514
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
515
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
516
+ *
517
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
518
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
519
+ *
520
+ * @example
521
+ * escapeTemplateBrackets('({{harf}}): ')
522
+ * // → '\\({{harf}}\\): '
523
+ *
524
+ * @example
525
+ * escapeTemplateBrackets('[{{raqm}}] ')
526
+ * // → '\\[{{raqm}}\\] '
527
+ *
528
+ * @example
529
+ * escapeTemplateBrackets('{{harf}}')
530
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
531
+ */
532
+ const escapeTemplateBrackets = (pattern) => {
533
+ return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
534
+ };
535
+ /**
536
+ * Character class matching all Arabic diacritics (Tashkeel/Harakat).
537
+ *
538
+ * Includes the following diacritical marks:
539
+ * - U+0640: ـ (tatweel / kashida)
540
+ * - U+064B: ً (fathatan - double fatha)
541
+ * - U+064C: ٌ (dammatan - double damma)
542
+ * - U+064D: ٍ (kasratan - double kasra)
543
+ * - U+064E: َ (fatha - short a)
544
+ * - U+064F: ُ (damma - short u)
545
+ * - U+0650: ِ (kasra - short i)
546
+ * - U+0651: ّ (shadda - gemination)
547
+ * - U+0652: ْ (sukun - no vowel)
548
+ *
549
+ * @internal
550
+ */
551
+ const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
552
+ /**
553
+ * Groups of equivalent Arabic characters.
554
+ *
555
+ * Characters within the same group are considered equivalent for matching purposes.
556
+ * This handles common variations in Arabic text where different characters are
557
+ * used interchangeably or have the same underlying meaning.
558
+ *
559
+ * Equivalence groups:
560
+ * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
561
+ * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
562
+ * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
563
+ *
564
+ * @internal
565
+ */
566
+ const EQUIV_GROUPS = [
567
+ [
568
+ "ا",
569
+ "آ",
570
+ "أ",
571
+ "إ"
572
+ ],
573
+ ["ة", "ه"],
574
+ ["ى", "ي"]
575
+ ];
576
+ const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
577
+ /**
578
+ * Escapes a string for safe inclusion in a regular expression.
579
+ *
580
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
581
+ *
582
+ * @param s - Any string to escape
583
+ * @returns String with regex metacharacters escaped
584
+ *
585
+ * @example
586
+ * escapeRegex('hello.world') // → 'hello\\.world'
587
+ * escapeRegex('[test]') // → '\\[test\\]'
588
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
589
+ */
590
+ const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
591
+ const getEquivClass = (ch) => {
592
+ const group = EQUIV_GROUPS.find((g) => g.includes(ch));
593
+ return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
594
+ };
595
+ const normalizeArabicLight = (str) => {
596
+ return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
597
+ };
598
+ /**
599
+ * Normalizes Arabic text for exact comparisons while tolerating common variants.
600
+ *
601
+ * This removes Arabic diacritics, collapses whitespace, removes joiners, and
602
+ * maps common equivalent letters to a shared canonical form:
603
+ * - ا/آ/أ/إ -> ا
604
+ * - ة/ه -> ه
605
+ * - ى/ي -> ي
606
+ */
607
+ const normalizeArabicForComparison = (text) => {
608
+ return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
609
+ if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
610
+ if (ch === "ة") return "ه";
611
+ if (ch === "ى") return "ي";
612
+ return ch;
613
+ }).join("");
614
+ };
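An illustrative sketch of the folding this internal helper performs (inputs are made-up examples, not from the library's tests):

// Marks are stripped and equivalent letters collapse to one canonical form:
normalizeArabicForComparison('أَحْمَدُ'); // 'احمد'  (hamza-alef → bare alef, diacritics removed)
normalizeArabicForComparison('مَكَّةَ');   // 'مكه'   (ta marbuta → ha, shadda/fatha removed)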
615
+ const makeDiacriticInsensitive = (text) => {
616
+ const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
617
+ return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
618
+ };
619
+ const isCombiningMarkOrSelector = (char) => {
620
+ if (!char) return false;
621
+ return /\p{M}/u.test(char) || char === "︎" || char === "️";
622
+ };
623
+ const isJoiner = (char) => char === "‌" || char === "‍";
624
+ /**
625
+ * Ensures the position does not split a grapheme cluster (surrogate pairs,
626
+ * combining marks, or zero-width joiners / variation selectors).
627
+ *
628
+ * This is only used as a last-resort fallback when we are forced to split
629
+ * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
630
+ */
631
+ const adjustForUnicodeBoundary = (content, position) => {
632
+ let adjusted = position;
633
+ while (adjusted > 0) {
634
+ const high = content.charCodeAt(adjusted - 1);
635
+ const low = content.charCodeAt(adjusted);
636
+ if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
637
+ adjusted -= 1;
638
+ continue;
639
+ }
640
+ const nextChar = content[adjusted];
641
+ const prevChar = content[adjusted - 1];
642
+ if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
643
+ adjusted -= 1;
644
+ continue;
645
+ }
646
+ break;
647
+ }
648
+ return adjusted;
649
+ };
575
650
  //#endregion
576
651
  //#region src/analysis/shared.ts
577
652
  const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
@@ -632,7 +707,6 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
632
707
  };
633
708
  const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
634
709
  const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
635
-
636
710
  //#endregion
637
711
  //#region src/analysis/line-starts.ts
638
712
  const resolveOptions$1 = (options = {}) => ({
@@ -658,65 +732,141 @@ const compareBySpecificity = (a, b) => {
658
732
  return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
659
733
  };
660
734
  const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
661
- /** Remove trailing whitespace placeholders */
662
- const trimTrailingWs = (out, mode) => {
663
- const suffix = mode === "regex" ? "\\s*" : " ";
664
- while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
665
- return out;
666
- };
667
- /** Try to extract first word for fallback */
668
- const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
669
- /** Consume prefix matchers at current position */
670
- const consumePrefixes = (s, pos, out, matchers, ws) => {
671
- let matched = false;
735
+ const appendPrefix = (s, pos, out, matchers, ws) => {
672
736
  for (const re of matchers) {
673
737
  if (pos >= s.length) break;
674
738
  const m = re.exec(s.slice(pos));
675
739
  if (!m?.index && m?.[0]) {
676
740
  out += escapeSignatureLiteral(m[0]);
677
741
  pos += m[0].length;
678
- matched = true;
679
742
  const wsm = /^[ \t]+/u.exec(s.slice(pos));
680
743
  if (wsm) {
681
744
  pos += wsm[0].length;
682
745
  out = appendWs(out, ws);
683
746
  }
747
+ return {
748
+ matched: true,
749
+ out,
750
+ pos
751
+ };
684
752
  }
685
753
  }
686
754
  return {
687
- matched,
755
+ matched: false,
688
756
  out,
689
757
  pos
690
758
  };
691
759
  };
692
- /** Try to match a token at current position and append to signature */
693
- const tryMatchToken = (s, pos, out, compiled) => {
760
+ const appendToken = (s, pos, out, compiled) => {
694
761
  const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
695
- if (!best) return {
696
- matched: false,
697
- out,
698
- pos
699
- };
700
- return {
762
+ return best ? {
701
763
  matched: true,
702
764
  out: `${out}{{${best.token}}}`,
703
765
  pos: pos + best.text.length
766
+ } : {
767
+ matched: false,
768
+ out,
769
+ pos
704
770
  };
705
771
  };
706
- /** Try to match a delimiter at current position */
707
- const tryMatchDelimiter = (s, pos, out) => {
772
+ const appendDelimiter = (s, pos, out) => {
708
773
  const ch = s[pos];
709
- if (!ch || !isCommonDelimiter(ch)) return {
774
+ return ch && isCommonDelimiter(ch) ? {
775
+ matched: true,
776
+ out: `${out}${escapeSignatureLiteral(ch)}`,
777
+ pos: pos + 1
778
+ } : {
710
779
  matched: false,
711
780
  out,
712
781
  pos
713
782
  };
714
- return {
715
- matched: true,
716
- out: out + escapeSignatureLiteral(ch),
717
- pos: pos + 1
783
+ };
784
+ const appendFallbackWord = (s, pos, out) => {
785
+ const word = extractFirstWord(s.slice(pos));
786
+ return word ? `${out}${escapeSignatureLiteral(word)}` : null;
787
+ };
788
+ const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
789
+ const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
790
+ if (ws.skipped) return {
791
+ done: false,
792
+ matchedAny,
793
+ matchedToken,
794
+ out: ws.out,
795
+ pos: ws.pos,
796
+ steps: 0
797
+ };
798
+ const tok = appendToken(s, pos, out, compiled);
799
+ if (tok.matched) return {
800
+ done: false,
801
+ matchedAny: true,
802
+ matchedToken: true,
803
+ out: tok.out,
804
+ pos: tok.pos,
805
+ steps: 1
806
+ };
807
+ if (matchedAny) {
808
+ const delim = appendDelimiter(s, pos, out);
809
+ if (delim.matched) return {
810
+ done: false,
811
+ matchedAny,
812
+ matchedToken,
813
+ out: delim.out,
814
+ pos: delim.pos,
815
+ steps: 0
816
+ };
817
+ if (opts.includeFirstWordFallback && !matchedToken) {
818
+ const fallback = appendFallbackWord(s, pos, out);
819
+ if (fallback) return {
820
+ done: true,
821
+ matchedAny,
822
+ matchedToken,
823
+ out: fallback,
824
+ pos,
825
+ steps: 1
826
+ };
827
+ }
828
+ return {
829
+ done: true,
830
+ matchedAny,
831
+ matchedToken,
832
+ out,
833
+ pos,
834
+ steps: 0
835
+ };
836
+ }
837
+ if (!opts.includeFirstWordFallback) return {
838
+ done: true,
839
+ matchedAny,
840
+ matchedToken,
841
+ out,
842
+ pos,
843
+ steps: 0
844
+ };
845
+ const fallback = appendFallbackWord(s, pos, out);
846
+ return fallback ? {
847
+ done: true,
848
+ matchedAny: true,
849
+ matchedToken,
850
+ out: fallback,
851
+ pos,
852
+ steps: 0
853
+ } : {
854
+ done: true,
855
+ matchedAny,
856
+ matchedToken,
857
+ out,
858
+ pos,
859
+ steps: 0
718
860
  };
719
861
  };
862
+ /** Remove trailing whitespace placeholders */
863
+ const trimTrailingWs = (out, mode) => {
864
+ const suffix = mode === "regex" ? "\\s*" : " ";
865
+ while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
866
+ return out;
867
+ };
868
+ /** Try to extract first word for fallback */
869
+ const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
720
870
  /** Skip whitespace at position */
721
871
  const skipWhitespace$1 = (s, pos, out, ws) => {
722
872
  const m = /^[ \t]+/u.exec(s.slice(pos));
@@ -737,47 +887,25 @@ const tokenizeLineStart = (line, tokenNames, opts) => {
737
887
  const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
738
888
  const compiled = compileTokenRegexes(tokenNames);
739
889
  let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
740
- const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
890
+ const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
741
891
  pos = prefix.pos;
742
892
  out = prefix.out;
743
893
  matchedAny = prefix.matched;
744
894
  while (steps < 6 && pos < s.length) {
745
- const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
746
- if (ws.skipped) {
747
- pos = ws.pos;
748
- out = ws.out;
749
- continue;
750
- }
751
- const tok = tryMatchToken(s, pos, out, compiled);
752
- if (tok.matched) {
753
- pos = tok.pos;
754
- out = tok.out;
755
- matchedAny = matchedToken = true;
756
- steps++;
757
- continue;
758
- }
759
- if (matchedAny) {
760
- const delim = tryMatchDelimiter(s, pos, out);
761
- if (delim.matched) {
762
- pos = delim.pos;
763
- out = delim.out;
764
- continue;
765
- }
766
- }
767
- if (matchedAny) {
768
- if (opts.includeFirstWordFallback && !matchedToken) {
769
- const word = extractFirstWord(s.slice(pos));
770
- if (word) {
771
- out += escapeSignatureLiteral(word);
772
- steps++;
773
- }
774
- }
895
+ const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
896
+ if (next.done) {
897
+ if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
898
+ if (next.steps > 0) steps += next.steps;
899
+ matchedAny = next.matchedAny;
900
+ matchedToken = next.matchedToken;
901
+ out = next.out;
775
902
  break;
776
903
  }
777
- if (!opts.includeFirstWordFallback) return null;
778
- const word = extractFirstWord(s.slice(pos));
779
- if (!word) return null;
780
- return escapeSignatureLiteral(word);
904
+ pos = next.pos;
905
+ out = next.out;
906
+ matchedAny = next.matchedAny;
907
+ matchedToken = next.matchedToken;
908
+ steps += next.steps;
781
909
  }
782
910
  return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
783
911
  };
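For orientation, the value built here is a token-template "signature" for a line's prefix; the exact whitespace placeholder depends on the `whitespace` option. A rough, assumed illustration of inputs and outputs:

// '١٢ - حدثنا يحيى بن بكير...'  →  '{{raqms}} {{dash}} {{naql}}'
// 'باب ما جاء في الوضوء'        →  '{{bab}}'
// analyzeCommonLineStarts (below) then counts how often each signature recurs across pages.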
@@ -821,7 +949,6 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
821
949
  pattern
822
950
  })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
823
951
  };
824
-
825
952
  //#endregion
826
953
  //#region src/analysis/repeating-sequences.ts
827
954
  const resolveOptions = (options) => {
@@ -843,6 +970,7 @@ const resolveOptions = (options) => {
843
970
  const createRawCursor = (text, normalize) => {
844
971
  let rawPos = 0;
845
972
  return {
973
+ /** Advance cursor, returning the raw text chunk consumed */
846
974
  advance(normalizedLen) {
847
975
  if (!normalize) {
848
976
  const chunk = text.slice(rawPos, rawPos + normalizedLen);
@@ -947,23 +1075,27 @@ const buildExample = (page, window, contextChars) => {
947
1075
  text: page.content.slice(start, end)
948
1076
  };
949
1077
  };
1078
+ const recordPattern = (page, window, opts, stats) => {
1079
+ if (opts.requireToken && !hasTokenInWindow(window)) return;
1080
+ const pattern = buildPattern(window, opts.whitespace);
1081
+ let entry = stats.get(pattern);
1082
+ if (!entry) {
1083
+ if (stats.size >= opts.maxUniquePatterns) return;
1084
+ entry = {
1085
+ count: 0,
1086
+ examples: [],
1087
+ ...computeWindowStats(window)
1088
+ };
1089
+ stats.set(pattern, entry);
1090
+ }
1091
+ entry.count++;
1092
+ if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
1093
+ };
950
1094
  /** Extract N-grams from a single page */
951
1095
  const extractPageNgrams = (page, items, opts, stats) => {
952
- for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
953
- const window = items.slice(i, i + n);
954
- if (opts.requireToken && !hasTokenInWindow(window)) continue;
955
- const pattern = buildPattern(window, opts.whitespace);
956
- if (!stats.has(pattern)) {
957
- if (stats.size >= opts.maxUniquePatterns) continue;
958
- stats.set(pattern, {
959
- count: 0,
960
- examples: [],
961
- ...computeWindowStats(window)
962
- });
963
- }
964
- const entry = stats.get(pattern);
965
- entry.count++;
966
- if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
1096
+ for (let i = 0; i <= items.length - opts.minElements; i++) {
1097
+ const maxWindowSize = Math.min(opts.maxElements, items.length - i);
1098
+ for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
967
1099
  }
968
1100
  };
969
1101
  /**
@@ -985,7 +1117,6 @@ const analyzeRepeatingSequences = (pages, options) => {
985
1117
  pattern
986
1118
  }));
987
1119
  };
988
-
989
1120
  //#endregion
990
1121
  //#region src/detection.ts
991
1122
  /**
@@ -1147,7 +1278,6 @@ const analyzeTextForRule = (text) => {
1147
1278
  ...suggestPatternConfig(detected)
1148
1279
  };
1149
1280
  };
1150
-
1151
1281
  //#endregion
1152
1282
  //#region src/types/rules.ts
1153
1283
  /**
@@ -1170,9 +1300,9 @@ const PATTERN_TYPE_KEYS = [
1170
1300
  "lineStartsAfter",
1171
1301
  "lineEndsWith",
1172
1302
  "template",
1173
- "regex"
1303
+ "regex",
1304
+ "dictionaryEntry"
1174
1305
  ];
1175
-
1176
1306
  //#endregion
1177
1307
  //#region src/optimization/optimize-rules.ts
1178
1308
  const MERGEABLE_KEYS = new Set([
@@ -1190,11 +1320,17 @@ const getPatternArray = (rule, key) => {
1190
1320
  };
1191
1321
  const getPatternString = (rule, key) => {
1192
1322
  const value = rule[key];
1193
- return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : "";
1323
+ return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
1194
1324
  };
1195
1325
  const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
1326
+ const getDictionaryEntrySpecificityScore = (rule) => {
1327
+ if (!("dictionaryEntry" in rule)) return 0;
1328
+ const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
1329
+ return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
1330
+ };
1196
1331
  const getSpecificityScore = (rule) => {
1197
1332
  const key = getPatternKey(rule);
1333
+ if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
1198
1334
  return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
1199
1335
  };
1200
1336
  const createMergeKey = (rule) => {
@@ -1231,7 +1367,6 @@ const optimizeRules = (rules) => {
1231
1367
  rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
1232
1368
  };
1233
1369
  };
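Worked example of the dictionaryEntry specificity score used in the sort above, with default flags and two stop words (purely illustrative arithmetic):

// minLetters(2)*20 + maxLetters(10) + 120 (no comma lists) + 60 (no parentheses)
//   + 20 (no whitespace before colon) + 0 (mid-line subentries enabled) + 2 stop words
// = 40 + 10 + 120 + 60 + 20 + 0 + 2 = 252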
1234
-
1235
1370
  //#endregion
1236
1371
  //#region src/preprocessing/transforms.ts
1237
1372
  /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
@@ -1340,170 +1475,115 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
1340
1475
  }
1341
1476
  return result;
1342
1477
  };
1343
-
1344
1478
  //#endregion
1345
- //#region src/segmentation/rule-regex.ts
1346
- /**
1347
- * Checks if a regex pattern contains standard (anonymous) capturing groups.
1348
- *
1349
- * Detects standard capturing groups `(...)` while excluding:
1350
- * - Non-capturing groups `(?:...)`
1351
- * - Lookahead assertions `(?=...)` and `(?!...)`
1352
- * - Lookbehind assertions `(?<=...)` and `(?<!...)`
1353
- * - Named groups `(?<name>...)` (start with `(?` so excluded here)
1354
- *
1355
- * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
1356
- */
1357
- const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
1358
- /**
1359
- * Extracts named capture group names from a regex pattern.
1360
- *
1361
- * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
1362
- *
1363
- * @example
1364
- * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
1365
- * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
1366
- * extractNamedCaptureNames('^\\d+') // []
1367
- */
1368
- const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([^>]+)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
1369
- /**
1370
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
1371
- */
1372
- const compileRuleRegex = (pattern) => {
1373
- try {
1374
- return new RegExp(pattern, "gmu");
1375
- } catch (error) {
1376
- throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
1479
+ //#region src/segmentation/arabic-dictionary-rule.ts
1480
+ const uniqueCanonicalWords = (words) => {
1481
+ const seen = /* @__PURE__ */ new Set();
1482
+ const result = [];
1483
+ for (const word of words) {
1484
+ const normalized = normalizeArabicForComparison(word);
1485
+ if (!normalized || seen.has(normalized)) continue;
1486
+ seen.add(normalized);
1487
+ result.push(word);
1377
1488
  }
1489
+ return result;
1378
1490
  };
1379
- /**
1380
- * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
1381
- *
1382
- * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
1383
- */
1384
- const processPattern = (pattern, fuzzy, capturePrefix) => {
1385
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
1491
+ const buildStopAlternation = (stopWords) => {
1492
+ const unique = uniqueCanonicalWords(stopWords);
1493
+ if (unique.length === 0) return "";
1494
+ return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
1495
+ };
1496
+ const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
1497
+ if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
1498
+ const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
1499
+ return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
1500
+ };
1501
+ const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
1502
+ const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
1503
+ const withCapture = `(?<${captureName}>${headwordBody})`;
1504
+ if (!allowParenthesized) return `${withCapture}${colon}`;
1505
+ return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
1506
+ };
1507
+ const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
1508
+ if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
1509
+ if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
1510
+ if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
1511
+ };
1512
+ const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
1513
+ validateDictionaryEntryOptions({
1514
+ captureName,
1515
+ maxLetters,
1516
+ minLetters
1517
+ });
1518
+ const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
1519
+ const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
1520
+ const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
1521
+ const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
1522
+ const stopAlternation = buildStopAlternation(stopWords);
1523
+ const lemmaBody = buildHeadwordBody({
1524
+ allowCommaSeparated,
1525
+ colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
1526
+ stopAlternation,
1527
+ stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
1528
+ unit: lemmaUnit
1529
+ });
1530
+ const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
1531
+ const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
1532
+ const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
1533
+ const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
1534
+ allowParenthesized,
1535
+ allowWhitespaceBeforeColon,
1536
+ captureName: prefixedCaptureName,
1537
+ headwordBody: lemmaBody
1538
+ });
1386
1539
  return {
1387
- captureNames,
1388
- pattern: expanded
1540
+ captureNames: [prefixedCaptureName],
1541
+ regex
1389
1542
  };
1390
1543
  };
1391
1544
  /**
1392
- * Processes a breakpoint pattern by expanding tokens only.
1545
+ * Creates a reusable split rule for Arabic dictionary entries.
1393
1546
  *
1394
- * Unlike `processPattern`, this does NOT escape brackets because breakpoints
1395
- * are treated as raw regex patterns (like the `regex` rule type).
1396
- * Users have full control over regex syntax including `(?:...)` groups.
1397
- */
1398
- const processBreakpointPattern = (pattern) => {
1399
- const { pattern: expanded } = expandTokensWithCaptures(pattern);
1400
- return expanded;
1401
- };
1402
- const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
1403
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1404
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1405
- return {
1406
- captureNames: processed.flatMap((p) => p.captureNames),
1407
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
1408
- };
1409
- };
1410
- const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1411
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1412
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1413
- return {
1414
- captureNames: processed.flatMap((p) => p.captureNames),
1415
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
1416
- };
1417
- };
1418
- const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1419
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1420
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1421
- return {
1422
- captureNames: processed.flatMap((p) => p.captureNames),
1423
- regex: `(?:${alternatives})$`
1424
- };
1425
- };
1426
- const buildTemplateRegexSource = (template, capturePrefix) => {
1427
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
1428
- return {
1429
- captureNames,
1430
- regex: pattern
1431
- };
1432
- };
1433
- /**
1434
- * Builds a compiled regex and metadata from a split rule.
1547
+ * The returned rule preserves authoring intent as a serializable
1548
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
1549
+ * regex string.
1435
1550
  *
1436
- * Behavior mirrors the previous implementation in `segmenter.ts`.
1551
+ * @example
1552
+ * createArabicDictionaryEntryRule({
1553
+ * stopWords: ['وقيل', 'ويقال', 'قال'],
1554
+ * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1555
+ * })
1556
+ *
1557
+ * @example
1558
+ * createArabicDictionaryEntryRule({
1559
+ * allowParenthesized: true,
1560
+ * allowWhitespaceBeforeColon: true,
1561
+ * allowCommaSeparated: true,
1562
+ * stopWords: ['الليث', 'العجاج'],
1563
+ * })
1437
1564
  */
1438
- const buildRuleRegex = (rule, capturePrefix) => {
1439
- const { lineStartsWith, lineStartsAfter, lineEndsWith, template, regex } = rule;
1440
- const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy([
1441
- ...lineStartsWith ?? [],
1442
- ...lineStartsAfter ?? [],
1443
- ...lineEndsWith ?? []
1444
- ]);
1445
- if (lineStartsAfter?.length) {
1446
- const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(lineStartsAfter, fuzzy, capturePrefix);
1447
- return {
1448
- captureNames,
1449
- regex: compileRuleRegex(lsaRegex),
1450
- usesCapture: true,
1451
- usesLineStartsAfter: true
1452
- };
1453
- }
1454
- let finalRegex = regex;
1455
- let allCaptureNames = [];
1456
- if (lineStartsWith?.length) {
1457
- const res = buildLineStartsWithRegexSource(lineStartsWith, fuzzy, capturePrefix);
1458
- finalRegex = res.regex;
1459
- allCaptureNames = res.captureNames;
1460
- }
1461
- if (lineEndsWith?.length) {
1462
- const res = buildLineEndsWithRegexSource(lineEndsWith, fuzzy, capturePrefix);
1463
- finalRegex = res.regex;
1464
- allCaptureNames = res.captureNames;
1465
- }
1466
- if (template) {
1467
- const res = buildTemplateRegexSource(template, capturePrefix);
1468
- finalRegex = res.regex;
1469
- allCaptureNames = [...allCaptureNames, ...res.captureNames];
1470
- }
1471
- if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
1472
- if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
1565
+ const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
1566
+ validateDictionaryEntryOptions({
1567
+ captureName,
1568
+ maxLetters,
1569
+ minLetters
1570
+ });
1473
1571
  return {
1474
- captureNames: allCaptureNames,
1475
- regex: compileRuleRegex(finalRegex),
1476
- usesCapture: hasCapturingGroup(finalRegex),
1477
- usesLineStartsAfter: false
1572
+ dictionaryEntry: {
1573
+ allowCommaSeparated,
1574
+ allowParenthesized,
1575
+ allowWhitespaceBeforeColon,
1576
+ captureName,
1577
+ maxLetters,
1578
+ midLineSubentries,
1579
+ minLetters,
1580
+ stopWords: uniqueCanonicalWords(stopWords)
1581
+ },
1582
+ meta,
1583
+ pageStartPrevWordStoplist,
1584
+ samePagePrevWordStoplist
1478
1585
  };
1479
1586
  };
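A sketch of the serializable rule this factory returns (assuming `createArabicDictionaryEntryRule` is part of the public exports):

const rule = createArabicDictionaryEntryRule({ stopWords: ['وقيل', 'ويقال', 'قال'] });
// rule.dictionaryEntry ≈ {
//   allowCommaSeparated: false, allowParenthesized: false, allowWhitespaceBeforeColon: false,
//   captureName: 'lemma', maxLetters: 10, midLineSubentries: true, minLetters: 2,
//   stopWords: ['وقيل', 'ويقال', 'قال']
// }
// meta / pageStartPrevWordStoplist / samePagePrevWordStoplist pass through unchanged.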
1480
-
1481
- //#endregion
1482
- //#region src/segmentation/breakpoint-constants.ts
1483
- /**
1484
- * Shared constants for segmentation breakpoint processing.
1485
- */
1486
- /**
1487
- * Threshold for using offset-based fast path in boundary processing.
1488
- *
1489
- * Below this: accurate string-search (handles offset drift from structural rules).
1490
- * At or above this: O(n) arithmetic (performance critical for large books).
1491
- *
1492
- * The value of 1000 is chosen based on typical Arabic book sizes:
1493
- * - Sahih al-Bukhari: ~1000-3000 pages
1494
- * - Standard hadith collections: 1000-7000 pages
1495
- * - Large aggregated corpora: 10k-50k pages
1496
- *
1497
- * For segments ≥1000 pages, the performance gain from offset-based slicing
1498
- * outweighs the minor accuracy loss from potential offset drift.
1499
- *
1500
- * @remarks
1501
- * Fast path is skipped when:
1502
- * - `maxContentLength` is set (requires character-accurate splitting)
1503
- * - `debugMetaKey` is set (requires proper provenance tracking)
1504
- * - Content was structurally modified by marker stripping (offsets may drift)
1505
- */
1506
- const FAST_PATH_THRESHOLD = 1e3;
1507
1587
  const WINDOW_PREFIX_LENGTHS = [
1508
1588
  80,
1509
1589
  60,
@@ -1530,23 +1610,6 @@ const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۝۞]/;
1530
1610
  * Matches outside this range are rejected unless `ignoreDeviation` is active.
1531
1611
  */
1532
1612
  const MAX_DEVIATION = 2e3;
1533
- /**
1534
- * Penalty score applied to non-newline anchor candidates.
1535
- *
1536
- * Designed to prioritize newline-aligned boundaries unless a whitespace match is
1537
- * significantly closer (within 20 chars). Handles cases where marker stripping
1538
- * shifts the boundary slightly.
1539
- */
1540
- const NON_NEWLINE_PENALTY = 20;
1541
- /**
1542
- * Limit for inferring start offset from a relaxed search (characters).
1543
- *
1544
- * If the relaxed search finds a match more than this distance away from the
1545
- * expected position, we assume it's a false positive (e.g. repeated content)
1546
- * and do not use it to infer the start offset.
1547
- */
1548
- const INFERENCE_PROXIMITY_LIMIT = 500;
1549
-
1550
1613
  //#endregion
1551
1614
  //#region src/segmentation/match-utils.ts
1552
1615
  /**
@@ -1665,7 +1728,6 @@ const extractDebugIndex = (groups, prefix) => {
1665
1728
  if (!Number.isNaN(idx)) return idx;
1666
1729
  }
1667
1730
  };
1668
-
1669
1731
  //#endregion
1670
1732
  //#region src/segmentation/breakpoint-utils.ts
1671
1733
  /**
@@ -2067,8 +2129,8 @@ const findAnchorCandidates = (content, prefix, start, end) => {
2067
2129
  /** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
2068
2130
  const selectBestAnchor = (candidates, expectedBoundary) => {
2069
2131
  return candidates.reduce((best, curr) => {
2070
- const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : NON_NEWLINE_PENALTY);
2071
- return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : NON_NEWLINE_PENALTY) < bestScore ? curr : best;
2132
+ const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
2133
+ return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
2072
2134
  });
2073
2135
  };
2074
2136
  /**
@@ -2122,7 +2184,7 @@ const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetI
2122
2184
  if (relaxedPos > 0) {
2123
2185
  const inferredStartOffset = rawBoundary - relaxedPos;
2124
2186
  const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
2125
- if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < INFERENCE_PROXIMITY_LIMIT) {
2187
+ if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
2126
2188
  startOffsetInFromPage = inferredStartOffset;
2127
2189
  expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
2128
2190
  pos = relaxedPos;
@@ -2196,7 +2258,7 @@ const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCoun
2196
2258
  const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
2197
2259
  const pageCount = toIdx - fromIdx + 1;
2198
2260
  const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
2199
- if (pageCount >= FAST_PATH_THRESHOLD && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
2261
+ if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
2200
2262
  return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
2201
2263
  };
2202
2264
  /**
@@ -2428,7 +2490,6 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
2428
2490
  }
2429
2491
  return -1;
2430
2492
  };
2431
-
2432
2493
  //#endregion
2433
2494
  //#region src/segmentation/debug-meta.ts
2434
2495
  const resolveDebugConfig = (debug) => {
@@ -2470,59 +2531,222 @@ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
2470
2531
  ...word !== void 0 ? { word } : {}
2471
2532
  } };
2472
2533
  };
2473
- const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
2474
- index: breakpointIndex,
2475
- kind: rule.pattern === "" ? "pageBoundary" : "pattern",
2476
- pattern: rule.pattern ?? rule.regex,
2477
- ...wordIndex !== void 0 ? { wordIndex } : {},
2478
- ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
2479
- } });
2534
+ const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
2535
+ index: breakpointIndex,
2536
+ kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
2537
+ pattern: rule.pattern ?? rule.regex,
2538
+ ...wordIndex !== void 0 ? { wordIndex } : {},
2539
+ ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
2540
+ } });
2541
+ /**
2542
+ * Helper to format the debug info into a human-readable string.
2543
+ * @param meta - The segment metadata object
2544
+ * @param options - Formatting options
2545
+ */
2546
+ const formatRuleReason = (rule, concise) => {
2547
+ const { index, patternType, wordIndex, word } = rule;
2548
+ if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
2549
+ const wordInfo = word ? ` (Matched: "${word}")` : "";
2550
+ return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
2551
+ };
2552
+ const formatBreakpointReason = (breakpoint, concise) => {
2553
+ const { index, kind, pattern, wordIndex, word } = breakpoint;
2554
+ if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
2555
+ if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
2556
+ if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
2557
+ return `Breakpoint #${index} (${kind}) - "${pattern}"`;
2558
+ };
2559
+ const formatContentLengthReason = (split, concise) => {
2560
+ const { maxContentLength, splitReason } = split;
2561
+ if (concise) return `> ${maxContentLength} (${splitReason})`;
2562
+ return `Safety Split (${splitReason}) > ${maxContentLength}`;
2563
+ };
2564
+ /**
2565
+ * Helper to format the debug info into a human-readable string.
2566
+ * @param meta - The segment metadata object
2567
+ * @param options - Formatting options
2568
+ */
2569
+ const getDebugReason = (meta, options) => {
2570
+ const debug = meta?._flappa;
2571
+ if (!debug) return "-";
2572
+ const concise = options?.concise;
2573
+ if (debug.rule) return formatRuleReason(debug.rule, concise);
2574
+ if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
2575
+ if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
2576
+ return "Unknown";
2577
+ };
2578
+ /**
2579
+ * Convenience helper to get the formatted debug reason directly from a segment.
2580
+ * @param segment - The segment object
2581
+ * @param options - Formatting options
2582
+ */
2583
+ const getSegmentDebugReason = (segment, options) => {
2584
+ return getDebugReason(segment.meta, options);
2585
+ };
2586
+ //#endregion
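Usage sketch for the new debug-reason helpers (assumes `getSegmentDebugReason` is exported and the segment carries `_flappa` debug metadata):

const segment = { content: '...', meta: { _flappa: { rule: { index: 0, patternType: 'lineStartsAfter', word: 'باب' } } } };
getSegmentDebugReason(segment);                    // 'Rule #0 (lineStartsAfter) (Matched: "باب")'
getSegmentDebugReason(segment, { concise: true }); // 'Rule: "باب"'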
2587
+ //#region src/segmentation/pattern-validator.ts
2588
+ const KNOWN_TOKENS = new Set(getAvailableTokens());
2589
+ const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
2590
+ const buildBareTokenRegex = () => {
2591
+ const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
2592
+ return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
2593
+ };
2594
+ /**
2595
+ * Validates a single pattern for common issues.
2596
+ */
2597
+ const validatePattern = (pattern, seenPatterns) => {
2598
+ if (!pattern.trim()) return {
2599
+ message: "Empty pattern is not allowed",
2600
+ type: "empty_pattern"
2601
+ };
2602
+ if (seenPatterns.has(pattern)) return {
2603
+ message: `Duplicate pattern: "${pattern}"`,
2604
+ pattern,
2605
+ type: "duplicate"
2606
+ };
2607
+ seenPatterns.add(pattern);
2608
+ TOKEN_INSIDE_BRACES.lastIndex = 0;
2609
+ for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
2610
+ const name = match[1];
2611
+ if (!KNOWN_TOKENS.has(name)) return {
2612
+ message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
2613
+ suggestion: "Check spelling or use a known token",
2614
+ token: name,
2615
+ type: "unknown_token"
2616
+ };
2617
+ }
2618
+ for (const match of pattern.matchAll(buildBareTokenRegex())) {
2619
+ const [full, name] = match;
2620
+ const idx = match.index;
2621
+ if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
2622
+ message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
2623
+ suggestion: `{{${full}}}`,
2624
+ token: name,
2625
+ type: "missing_braces"
2626
+ };
2627
+ }
2628
+ };
2480
2629
  /**
2481
- * Helper to format the debug info into a human-readable string.
2482
- * @param meta - The segment metadata object
2483
- * @param options - Formatting options
2630
+ * Validates an array of patterns, returning parallel array of issues.
2484
2631
  */
2485
- const formatRuleReason = (rule, concise) => {
2486
- const { index, patternType, wordIndex, word } = rule;
2487
- if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
2488
- const wordInfo = word ? ` (Matched: "${word}")` : "";
2489
- return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
2632
+ const validatePatternArray = (patterns) => {
2633
+ const seen = /* @__PURE__ */ new Set();
2634
+ const issues = patterns.map((p) => validatePattern(p, seen));
2635
+ return issues.some(Boolean) ? issues : void 0;
2490
2636
  };
2491
- const formatBreakpointReason = (breakpoint, concise) => {
2492
- const { index, kind, pattern, wordIndex, word } = breakpoint;
2493
- if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
2494
- if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
2495
- if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
2496
- return `Breakpoint #${index} (${kind}) - "${pattern}"`;
2637
+ const applyRulePatternValidation = (result, key, patterns) => {
2638
+ if (!patterns) return false;
2639
+ const issues = validatePatternArray(patterns);
2640
+ if (!issues) return false;
2641
+ result[key] = issues;
2642
+ return true;
2497
2643
  };
2498
- const formatContentLengthReason = (split, concise) => {
2499
- const { maxContentLength, splitReason } = split;
2500
- if (concise) return `> ${maxContentLength} (${splitReason})`;
2501
- return `Safety Split (${splitReason}) > ${maxContentLength}`;
2644
+ const validateTemplateRule = (rule, result) => {
2645
+ if (rule.template === void 0) return false;
2646
+ const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
2647
+ if (!issue) return false;
2648
+ result.template = issue;
2649
+ return true;
2650
+ };
2651
+ const validateRegexRule = (rule, result) => {
2652
+ if (rule.regex === void 0) return false;
2653
+ if (!rule.regex.trim()) {
2654
+ result.regex = {
2655
+ message: "Empty pattern is not allowed",
2656
+ type: "empty_pattern"
2657
+ };
2658
+ return true;
2659
+ }
2660
+ try {
2661
+ new RegExp(rule.regex, "u");
2662
+ return false;
2663
+ } catch (error) {
2664
+ result.regex = {
2665
+ message: error instanceof Error ? error.message : String(error),
2666
+ pattern: rule.regex,
2667
+ type: "invalid_regex"
2668
+ };
2669
+ return true;
2670
+ }
2671
+ };
2672
+ const invalidDictionaryEntryIssue = (message) => ({
2673
+ message,
2674
+ type: "invalid_option"
2675
+ });
2676
+ const validateDictionaryEntryRule = (rule, result) => {
2677
+ if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
2678
+ const issues = {};
2679
+ const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
2680
+ if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
2681
+ if (allowCommaSeparated !== void 0 && typeof allowCommaSeparated !== "boolean") issues.allowCommaSeparated = invalidDictionaryEntryIssue("allowCommaSeparated must be a boolean");
2682
+ if (allowParenthesized !== void 0 && typeof allowParenthesized !== "boolean") issues.allowParenthesized = invalidDictionaryEntryIssue("allowParenthesized must be a boolean");
2683
+ if (allowWhitespaceBeforeColon !== void 0 && typeof allowWhitespaceBeforeColon !== "boolean") issues.allowWhitespaceBeforeColon = invalidDictionaryEntryIssue("allowWhitespaceBeforeColon must be a boolean");
2684
+ if (midLineSubentries !== void 0 && typeof midLineSubentries !== "boolean") issues.midLineSubentries = invalidDictionaryEntryIssue("midLineSubentries must be a boolean");
2685
+ if (captureName !== void 0 && !captureName.match(/^[A-Za-z_]\w*$/)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
2686
+ if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
2687
+ if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < (minLetters ?? 2))) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${minLetters ?? 2}`);
2688
+ if (Object.keys(issues).length === 0) return false;
2689
+ result.dictionaryEntry = issues;
2690
+ return true;
2691
+ };
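For illustration, a rough sketch of the shape this validator produces for a malformed `dictionaryEntry` rule (field names are taken from the code above; the concrete input values are invented):

const result = {};
validateDictionaryEntryRule({
  dictionaryEntry: {
    stopWords: ["قال"],   // valid: non-empty string entries
    captureName: "1bad",  // invalid: must match /^[A-Za-z_]\w*$/
    minLetters: 0         // invalid: must be an integer >= 1
  }
}, result);
// Returns true and fills result.dictionaryEntry with invalid_option issues, roughly:
// { captureName: { type: "invalid_option", ... }, minLetters: { type: "invalid_option", ... } }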
2692
+ const formatValidationIssue = (_type, issue, loc) => {
2693
+ if (!issue) return null;
2694
+ if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
2695
+ if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
2696
+ if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
2697
+ if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
2698
+ return `${loc}: ${issue.message || issue.type}`;
2502
2699
  };
2503
2700
  /**
2504
- * Helper to format the debug info into a human-readable string.
2505
- * @param meta - The segment metadata object
2506
- * @param options - Formatting options
2701
+ * Validates split rules for common pattern issues.
2702
+ *
2703
+ * Checks for:
2704
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
2705
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
2706
+ * - Duplicate patterns within the same rule
2707
+ *
2708
+ * @param rules - Array of split rules to validate
2709
+ * @returns Array parallel to input with validation results (undefined if no issues)
2710
+ *
2711
+ * @example
2712
+ * const issues = validateRules([
2713
+ * { lineStartsAfter: ['raqms:num'] }, // Missing braces
2714
+ * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
2715
+ * ]);
2716
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
2717
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
2507
2718
  */
2508
- const getDebugReason = (meta, options) => {
2509
- const debug = meta?._flappa;
2510
- if (!debug) return "-";
2511
- const concise = options?.concise;
2512
- if (debug.rule) return formatRuleReason(debug.rule, concise);
2513
- if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
2514
- if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
2515
- return "Unknown";
2516
- };
2719
+ const validateRules = (rules) => rules.map((rule) => {
2720
+ const result = {};
2721
+ const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
2722
+ const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
2723
+ const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
2724
+ const templateIssues = validateTemplateRule(rule, result);
2725
+ const regexIssues = validateRegexRule(rule, result);
2726
+ const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
2727
+ return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
2728
+ });
2517
2729
  /**
2518
- * Convenience helper to get the formatted debug reason directly from a segment.
2519
- * @param segment - The segment object
2520
- * @param options - Formatting options
2730
+ * Formats a validation result array into a list of human-readable error messages.
2731
+ *
2732
+ * Useful for displaying validation errors in UIs.
2733
+ *
2734
+ * @param results - The result array from `validateRules()`
2735
+ * @returns Array of formatted error strings
2736
+ *
2737
+ * @example
2738
+ * const issues = validateRules(rules);
2739
+ * const errors = formatValidationReport(issues);
2740
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
2521
2741
  */
2522
- const getSegmentDebugReason = (segment, options) => {
2523
- return getDebugReason(segment.meta, options);
2742
+ const formatValidationReport = (results) => results.flatMap((result, i) => {
2743
+ if (!result) return [];
2744
+ return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
2745
+ });
2746
+ const formatValidationIssues = (type, issues, ruleNumber) => {
2747
+ if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
2748
+ return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
2524
2749
  };
2525
-
2526
2750
  //#endregion
2527
2751
  //#region src/segmentation/breakpoint-processor.ts
2528
2752
  const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
@@ -2650,7 +2874,7 @@ const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx,
2650
2874
  const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
2651
2875
  const driftTolerance = Math.max(100, fullContent.length * .01);
2652
2876
  const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
2653
- if (!isAligned && pageCount >= FAST_PATH_THRESHOLD) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
2877
+ if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
2654
2878
  actualLength: fullContent.length,
2655
2879
  drift: Math.abs(expectedLength - fullContent.length),
2656
2880
  expectedLength,
@@ -2791,8 +3015,7 @@ const computeWindowEndPositionForIteration = (remainingContent, cursorPos, curre
2791
3015
  if (maxPages === 0) {
2792
3016
  const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
2793
3017
  const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
2794
- const capped = maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage;
2795
- return Math.min(capped, remainingContent.length);
3018
+ return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
2796
3019
  }
2797
3020
  const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
2798
3021
  return Math.min(pos, remainingContent.length);
@@ -2847,7 +3070,7 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
2847
3070
  const pageCount = toIdx - fromIdx + 1;
2848
3071
  const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
2849
3072
  const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
2850
- if (pageCount < FAST_PATH_THRESHOLD || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
3073
+ if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
2851
3074
  if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
2852
3075
  return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
2853
3076
  };
@@ -3030,7 +3253,179 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
3030
3253
  logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
3031
3254
  return result;
3032
3255
  };
3033
-
3256
+ //#endregion
3257
+ //#region src/segmentation/rule-regex.ts
3258
+ /**
3259
+ * Checks if a regex pattern contains standard (anonymous) capturing groups.
3260
+ *
3261
+ * Detects standard capturing groups `(...)` while excluding:
3262
+ * - Non-capturing groups `(?:...)`
3263
+ * - Lookahead assertions `(?=...)` and `(?!...)`
3264
+ * - Lookbehind assertions `(?<=...)` and `(?<!...)`
3265
+ * - Named groups `(?<name>...)` (these start with `(?`, so they are excluded here)
3266
+ *
3267
+ * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
3268
+ */
3269
+ const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
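A quick illustration of what this lookahead heuristic accepts and rejects:

hasCapturingGroup("(abc)");        // true  - anonymous capturing group
hasCapturingGroup("(?:abc)");      // false - non-capturing group
hasCapturingGroup("(?<name>abc)"); // false - named; surfaced via captureNames instead
hasCapturingGroup("a\\(b");        // true  - an escaped \( in the source still matches this heuristic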
3270
+ /**
3271
+ * Extracts named capture group names from a regex pattern.
3272
+ *
3273
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
3274
+ *
3275
+ * @example
3276
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
3277
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
3278
+ * extractNamedCaptureNames('^\\d+') // []
3279
+ */
3280
+ const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
3281
+ /**
3282
+ * Safely compiles a regex pattern, throwing a helpful error if invalid.
3283
+ */
3284
+ const compileRuleRegex = (pattern) => {
3285
+ try {
3286
+ return new RegExp(pattern, "gmu");
3287
+ } catch (error) {
3288
+ throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
3289
+ }
3290
+ };
3291
+ /**
3292
+ * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
3293
+ *
3294
+ * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
3295
+ */
3296
+ const processPattern = (pattern, fuzzy, capturePrefix) => {
3297
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
3298
+ return {
3299
+ captureNames,
3300
+ pattern: expanded
3301
+ };
3302
+ };
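A minimal sketch of the escape-then-expand order, assuming `{{harf}}` still expands to the Arabic letter class `[أ-ي]`:

// Hypothetical call with fuzzy matching off and no capture prefix:
const { pattern, captureNames } = processPattern("({{harf}}): ", false, undefined);
// escapeTemplateBrackets("({{harf}}): ") -> "\\({{harf}}\\): "
// token expansion then yields roughly "\\([أ-ي]\\): ", with captureNames presumably empty.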
3303
+ /**
3304
+ * Processes a breakpoint pattern by expanding tokens only.
3305
+ *
3306
+ * Unlike `processPattern`, this does NOT escape brackets because breakpoints
3307
+ * are treated as raw regex patterns (like the `regex` rule type).
3308
+ * Users have full control over regex syntax including `(?:...)` groups.
3309
+ */
3310
+ const processBreakpointPattern = (pattern) => {
3311
+ const { pattern: expanded } = expandTokensWithCaptures(pattern);
3312
+ return expanded;
3313
+ };
3314
+ /**
3315
+ * Builds the raw regex source for a `lineStartsAfter` rule.
3316
+ *
3317
+ * Expands each pattern through `processPattern()`, combines them into an
3318
+ * alternation at the start of a line, and appends a trailing content capture.
3319
+ *
3320
+ * @param patterns - Template-like line-start markers to match
3321
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3322
+ * @param capturePrefix - Optional prefix used for internal named captures
3323
+ * @returns Regex source plus the named captures extracted from the patterns
3324
+ */
3325
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
3326
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3327
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3328
+ return {
3329
+ captureNames: processed.flatMap((p) => p.captureNames),
3330
+ regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
3331
+ };
3332
+ };
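To make the generated source concrete, a sketch for two plain marker patterns (no tokens, so expansion leaves them unchanged):

const { regex } = buildLineStartsAfterRegexSource(["فصل ", "باب "], false, undefined);
// regex is roughly:
//   ^[\u200E\u200F\u061C\u200B\u200C\u200D\uFEFF]*(?:(?<_r0>فصل )|(?<_r1>باب ))(.*)
// optional bidi/zero-width marks, then the marker alternation, then a capture of the
// rest of the line (named <capturePrefix>__content when a prefix is supplied).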
3333
+ /**
3334
+ * Builds the raw regex source for a `lineStartsWith` rule.
3335
+ *
3336
+ * Expands each pattern through `processPattern()` and combines them into an
3337
+ * alternation anchored at the start of a line.
3338
+ *
3339
+ * @param patterns - Template-like line-start markers to match
3340
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3341
+ * @param capturePrefix - Optional prefix used for internal named captures
3342
+ * @returns Regex source plus the named captures extracted from the patterns
3343
+ */
3344
+ const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3345
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3346
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3347
+ return {
3348
+ captureNames: processed.flatMap((p) => p.captureNames),
3349
+ regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
3350
+ };
3351
+ };
3352
+ /**
3353
+ * Builds the raw regex source for a `lineEndsWith` rule.
3354
+ *
3355
+ * Expands each pattern through `processPattern()` and combines them into an
3356
+ * end-anchored alternation.
3357
+ *
3358
+ * @param patterns - Template-like line-end markers to match
3359
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3360
+ * @param capturePrefix - Optional prefix used for internal named captures
3361
+ * @returns Regex source plus the named captures extracted from the patterns
3362
+ */
3363
+ const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3364
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3365
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3366
+ return {
3367
+ captureNames: processed.flatMap((p) => p.captureNames),
3368
+ regex: `(?:${alternatives})$`
3369
+ };
3370
+ };
3371
+ /**
3372
+ * Builds the raw regex source for a `template` rule.
3373
+ *
3374
+ * Expands tokens and named captures via `expandTokensWithCaptures()` after
3375
+ * applying `escapeTemplateBrackets()` to non-token brackets.
3376
+ *
3377
+ * @param template - Template string containing optional `{{token}}` markers
3378
+ * @param capturePrefix - Optional prefix used for internal named captures
3379
+ * @returns Regex source plus the named captures extracted from the template
3380
+ */
3381
+ const buildTemplateRegexSource = (template, capturePrefix) => {
3382
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
3383
+ return {
3384
+ captureNames,
3385
+ regex: pattern
3386
+ };
3387
+ };
3388
+ const getFuzzyCandidatePatterns = (rule) => [
3389
+ ..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
3390
+ ..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
3391
+ ..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
3392
+ ];
3393
+ const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
3394
+ if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
3395
+ if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
3396
+ if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
3397
+ if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
3398
+ return null;
3399
+ };
3400
+ /**
3401
+ * Builds a compiled regex and metadata from a split rule.
3402
+ *
3403
+ * Behavior mirrors the previous implementation in `segmenter.ts`.
3404
+ */
3405
+ const buildRuleRegex = (rule, capturePrefix) => {
3406
+ const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
3407
+ if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
3408
+ const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
3409
+ return {
3410
+ captureNames,
3411
+ regex: compileRuleRegex(lsaRegex),
3412
+ usesCapture: true,
3413
+ usesLineStartsAfter: true
3414
+ };
3415
+ }
3416
+ const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
3417
+ let finalRegex = ruleRegexSource?.regex;
3418
+ let allCaptureNames = ruleRegexSource?.captureNames ?? [];
3419
+ if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
3420
+ if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
3421
+ if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
3422
+ return {
3423
+ captureNames: allCaptureNames,
3424
+ regex: compileRuleRegex(finalRegex),
3425
+ usesCapture: hasCapturingGroup(finalRegex),
3426
+ usesLineStartsAfter: false
3427
+ };
3428
+ };
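A minimal usage sketch of the returned metadata for a simple rule (the compiled source itself is abbreviated; `{{raqm}}` is assumed to be a known numeral token):

const built = buildRuleRegex({ lineStartsAfter: ["{{raqm}}- "] });
// built.usesLineStartsAfter === true and built.usesCapture === true;
// built.regex is the compiled "gmu" regex for the line-start alternation, and
// built.captureNames lists any {{token:name}} captures found in the patterns (none here).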
3034
3429
  //#endregion
3035
3430
  //#region src/segmentation/fast-fuzzy-prefix.ts
3036
3431
  /**
@@ -3078,9 +3473,8 @@ const compileFastFuzzyTokenRule = (tokenTemplate) => {
3078
3473
  const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
3079
3474
  if (!m) return null;
3080
3475
  const token = m[1];
3081
- const tokenPattern = getTokenPattern(token);
3082
- if (!tokenPattern) return null;
3083
- const compiled = compileLiteralAlternation(tokenPattern);
3476
+ if (!(token in TOKEN_PATTERNS)) return null;
3477
+ const compiled = compileLiteralAlternation(getTokenPattern(token));
3084
3478
  return compiled ? {
3085
3479
  alternatives: compiled.alternatives,
3086
3480
  token
@@ -3093,11 +3487,11 @@ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
3093
3487
  }
3094
3488
  return null;
3095
3489
  };
3096
-
3097
3490
  //#endregion
3098
3491
  //#region src/segmentation/segmenter-rule-utils.ts
3099
3492
  const tryCompileFastFuzzyRule = (rule) => {
3100
- if (!rule.fuzzy) return null;
3493
+ const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
3494
+ if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
3101
3495
  if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
3102
3496
  const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
3103
3497
  if (compiled) return {
@@ -3139,7 +3533,10 @@ const partitionRulesForMatching = (rules) => {
3139
3533
  prefix: `r${index}_`,
3140
3534
  rule
3141
3535
  });
3142
- else standaloneRules.push(rule);
3536
+ else standaloneRules.push({
3537
+ index,
3538
+ rule
3539
+ });
3143
3540
  }
3144
3541
  return {
3145
3542
  combinableRules,
@@ -3147,9 +3544,37 @@ const partitionRulesForMatching = (rules) => {
3147
3544
  standaloneRules
3148
3545
  };
3149
3546
  };
3547
+ const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
3548
+ const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
3549
+ const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
3550
+ const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
3551
+ const trimTrailingPageWrapNoise = (text) => {
3552
+ let trimmed = text.trimEnd();
3553
+ while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
3554
+ return trimmed;
3555
+ };
3556
+ const endsWithStrongSentenceTerminator = (pageContent) => {
3557
+ return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
3558
+ };
3559
+ const extractLastArabicWord = (pageContent) => {
3560
+ return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
3561
+ };
3562
+ const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
3563
+ if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
3564
+ const lastWord = extractLastArabicWord(previousPageContent);
3565
+ return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
3566
+ };
3567
+ const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
3568
+ if (!stoplist) return true;
3569
+ const lastWord = extractLastArabicWord(contentBeforeMatch);
3570
+ return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
3571
+ };
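As a rough illustration of the two stoplist guards (assuming the stoplist already holds the normalized form produced by normalizeArabicForComparison):

const stoplist = new Set([normalizeArabicForComparison("قال")]);
// Previous page ends mid-sentence on a stopword -> the page-start match is suppressed.
shouldAllowPageStartMatch("… ثم قال", stoplist);   // false
// A strong sentence terminator on the previous page overrides the stopword.
shouldAllowPageStartMatch("… ثم قال.", stoplist);  // true
// No stoplist configured -> always allowed.
shouldAllowPageStartMatch("… ثم قال", undefined);  // true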
3150
3572
  const createPageStartGuardChecker = (matchContent, pageMap) => {
3151
3573
  const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
3152
3574
  const compiledPageStartPrev = /* @__PURE__ */ new Map();
3575
+ const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
3576
+ const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
3577
+ const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
3153
3578
  const getPageStartPrevRegex = (rule, ruleIndex) => {
3154
3579
  if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
3155
3580
  const pattern = rule.pageStartGuard;
@@ -3161,6 +3586,33 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
3161
3586
  compiledPageStartPrev.set(ruleIndex, re);
3162
3587
  return re;
3163
3588
  };
3589
+ const getPrevWordStoplist = (rule, ruleIndex) => {
3590
+ if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
3591
+ const stoplist = rule.pageStartPrevWordStoplist;
3592
+ if (!stoplist?.length) {
3593
+ compiledPrevWordStoplists.set(ruleIndex, null);
3594
+ return null;
3595
+ }
3596
+ const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3597
+ compiledPrevWordStoplists.set(ruleIndex, normalized);
3598
+ return normalized;
3599
+ };
3600
+ const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
3601
+ if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
3602
+ const stoplist = rule.samePagePrevWordStoplist;
3603
+ if (!stoplist?.length) {
3604
+ compiledSamePagePrevWordStoplists.set(ruleIndex, null);
3605
+ return null;
3606
+ }
3607
+ const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3608
+ compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
3609
+ return normalized;
3610
+ };
3611
+ const getPreviousPageContent = (boundaryIndex) => {
3612
+ if (boundaryIndex <= 0) return "";
3613
+ const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
3614
+ return matchContent.slice(prevBoundary.start, prevBoundary.end);
3615
+ };
3164
3616
  const getPrevPageLastNonWsChar = (boundaryIndex) => {
3165
3617
  if (boundaryIndex <= 0) return "";
3166
3618
  const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
@@ -3170,13 +3622,24 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
3170
3622
  }
3171
3623
  return "";
3172
3624
  };
3625
+ const getCurrentPageContentBeforeMatch = (matchStart) => {
3626
+ const pageId = pageMap.getId(matchStart);
3627
+ const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
3628
+ if (boundaryIndex === void 0) return "";
3629
+ const boundary = pageMap.boundaries[boundaryIndex];
3630
+ return matchContent.slice(boundary.start, matchStart);
3631
+ };
3173
3632
  return (rule, ruleIndex, matchStart) => {
3174
3633
  const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
3175
- if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
3176
- const prevReq = getPageStartPrevRegex(rule, ruleIndex);
3177
- if (!prevReq) return true;
3178
- const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
3179
- return lastChar ? prevReq.test(lastChar) : false;
3634
+ if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
3635
+ const prevReq = getPageStartPrevRegex(rule, ruleIndex);
3636
+ if (prevReq) {
3637
+ const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
3638
+ if (!lastChar || !prevReq.test(lastChar)) return false;
3639
+ }
3640
+ return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
3641
+ }
3642
+ return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
3180
3643
  };
3181
3644
  };
3182
3645
  /**
@@ -3212,10 +3675,10 @@ const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule,
3212
3675
  /**
3213
3676
  * Processes matches for all fast-fuzzy rules at a specific line start.
3214
3677
  */
3215
- const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart, splitPointsByRule) => {
3678
+ const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
3216
3679
  for (const ffRule of fastFuzzyRules) {
3217
3680
  if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
3218
- if (isPageStart && !passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
3681
+ if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
3219
3682
  attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
3220
3683
  }
3221
3684
  };
@@ -3230,19 +3693,17 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
3230
3693
  currentBoundary = pageMap.boundaries[boundaryIdx];
3231
3694
  }
3232
3695
  };
3233
- const isPageStart = (offset) => offset === currentBoundary?.start;
3234
3696
  for (let lineStart = 0; lineStart <= matchContent.length;) {
3235
3697
  advanceBoundaryTo(lineStart);
3236
3698
  const pageId = currentBoundary?.id ?? 0;
3237
3699
  if (lineStart >= matchContent.length) break;
3238
- processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart(lineStart), splitPointsByRule);
3700
+ processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
3239
3701
  const nextNl = matchContent.indexOf("\n", lineStart);
3240
3702
  if (nextNl === -1) break;
3241
3703
  lineStart = nextNl + 1;
3242
3704
  }
3243
3705
  return splitPointsByRule;
3244
3706
  };
3245
-
3246
3707
  //#endregion
3247
3708
  //#region src/segmentation/split-point-helpers.ts
3248
3709
  const MAX_REGEX_ITERATIONS = 1e5;
@@ -3256,7 +3717,7 @@ const buildContentOffsets = (match, ruleInfo) => {
3256
3717
  if (!ruleInfo.usesLineStartsAfter) return {};
3257
3718
  const captured = match.groups?.[`${ruleInfo.prefix}__content`];
3258
3719
  if (captured === void 0) return {};
3259
- return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
3720
+ return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
3260
3721
  };
3261
3722
  const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
3262
3723
  const createSplitPointFromMatch = (match, rule, ruleInfo) => {
@@ -3271,7 +3732,32 @@ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
3271
3732
  wordIndex
3272
3733
  };
3273
3734
  };
3735
+ const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
3736
+ const arr = splitPointsByRule.get(originalIndex);
3737
+ if (!arr) {
3738
+ splitPointsByRule.set(originalIndex, [point]);
3739
+ return;
3740
+ }
3741
+ arr.push(point);
3742
+ };
3743
+ /**
3744
+ * Executes a combined regex over the content for combinable rules and records
3745
+ * any resulting split points into `splitPointsByRule`.
3746
+ *
3747
+ * This function mutates `splitPointsByRule` in place and throws if the regex
3748
+ * iteration guard is exceeded.
3749
+ *
3750
+ * @param matchContent - Concatenated content being segmented
3751
+ * @param combinableRules - Rules that can be combined into a single alternation
3752
+ * @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
3753
+ * @param pageMap - Page boundary mapping utilities for the content
3754
+ * @param passesPageStartGuard - Callback that decides whether a match is allowed
3755
+ * @param splitPointsByRule - Mutable map collecting split points by rule index
3756
+ * @param logger - Optional logger for iteration diagnostics
3757
+ * @returns Nothing; results are written into `splitPointsByRule`
3758
+ */
3274
3759
  const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
3760
+ assertCombinedRuleAlignment(combinableRules, ruleRegexes);
3275
3761
  const combinedSource = ruleRegexes.map((r) => r.source).join("|");
3276
3762
  const combinedRegex = new RegExp(combinedSource, "gm");
3277
3763
  logger?.debug?.("[segmenter] combined regex built", {
@@ -3286,19 +3772,29 @@ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, page
3286
3772
  iterations,
3287
3773
  position: m.index
3288
3774
  });
3289
- const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
3290
- if (matchedIndex !== -1) {
3291
- const { rule, index: originalIndex } = combinableRules[matchedIndex];
3292
- if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
3293
- const arr = splitPointsByRule.get(originalIndex);
3294
- if (!arr) splitPointsByRule.set(originalIndex, [createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex])]);
3295
- else arr.push(createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex]));
3296
- }
3297
- }
3775
+ processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
3298
3776
  if (m[0].length === 0) combinedRegex.lastIndex++;
3299
3777
  m = combinedRegex.exec(matchContent);
3300
3778
  }
3301
3779
  };
3780
+ const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
3781
+ if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
3782
+ for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
3783
+ };
3784
+ const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
3785
+ const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
3786
+ if (matchedIndex === -1) return;
3787
+ const { rule, index: originalIndex } = combinableRules[matchedIndex];
3788
+ if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
3789
+ addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
3790
+ };
3791
+ /**
3792
+ * Builds compiled regex metadata for each combinable rule while preserving the
3793
+ * prefix used to identify the matching branch inside a combined alternation.
3794
+ *
3795
+ * @param combinableRules - Rules eligible for combined-regex processing
3796
+ * @returns Rule regex metadata aligned with the input order
3797
+ */
3302
3798
  const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
3303
3799
  const built = buildRuleRegex(rule, prefix);
3304
3800
  return {
@@ -3307,6 +3803,18 @@ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefi
3307
3803
  source: `(?<${prefix}>${built.regex.source})`
3308
3804
  };
3309
3805
  });
3806
+ /**
3807
+ * Processes a standalone rule by matching it independently and appending its
3808
+ * resulting split points into `splitPointsByRule`.
3809
+ *
3810
+ * @param rule - The standalone split rule to evaluate
3811
+ * @param ruleIndex - Original rule index in the caller's rules array
3812
+ * @param matchContent - Concatenated content being segmented
3813
+ * @param pageMap - Page boundary mapping utilities for the content
3814
+ * @param passesPageStartGuard - Callback that decides whether a match is allowed
3815
+ * @param splitPointsByRule - Mutable map collecting split points by rule index
3816
+ * @returns Nothing; results are written into `splitPointsByRule`
3817
+ */
3310
3818
  const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
3311
3819
  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
3312
3820
  const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
@@ -3341,6 +3849,15 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
3341
3849
  }
3342
3850
  return matches;
3343
3851
  };
3852
+ /**
3853
+ * Applies per-rule occurrence filtering and optional debug metadata patches to
3854
+ * the collected split points.
3855
+ *
3856
+ * @param rules - Full rule list in original order
3857
+ * @param splitPointsByRule - Split points grouped by originating rule index
3858
+ * @param debugMetaKey - Optional metadata key used for debug provenance patches
3859
+ * @returns Flattened split points after occurrence filtering and debug merging
3860
+ */
3344
3861
  const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
3345
3862
  const result = [];
3346
3863
  rules.forEach((rule, index) => {
@@ -3358,7 +3875,6 @@ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
3358
3875
  });
3359
3876
  return result;
3360
3877
  };
3361
-
3362
3878
  //#endregion
3363
3879
  //#region src/segmentation/segmenter.ts
3364
3880
  /**
@@ -3432,10 +3948,30 @@ const dedupeSplitPoints = (splitPoints) => {
3432
3948
  const byIndex = /* @__PURE__ */ new Map();
3433
3949
  for (const p of splitPoints) {
3434
3950
  const existing = byIndex.get(p.index);
3435
- if (!existing || p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
3951
+ if (!existing) {
3952
+ byIndex.set(p.index, p);
3953
+ continue;
3954
+ }
3955
+ byIndex.set(p.index, mergeSplitPoints(existing, p));
3436
3956
  }
3437
3957
  return [...byIndex.values()].sort((a, b) => a.index - b.index);
3438
3958
  };
3959
+ const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
3960
+ const mergeRecord = (existing, incoming) => existing || incoming ? {
3961
+ ...existing ?? {},
3962
+ ...incoming ?? {}
3963
+ } : void 0;
3964
+ const mergeSplitPoints = (existing, incoming) => {
3965
+ const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
3966
+ const fallback = preferred === incoming ? existing : incoming;
3967
+ return {
3968
+ ...fallback,
3969
+ ...preferred,
3970
+ contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
3971
+ meta: mergeRecord(existing.meta, incoming.meta),
3972
+ namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
3973
+ };
3974
+ };
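To illustrate the merge behaviour at a duplicated split index (field values invented for the sketch):

const a = { index: 120, meta: { chapter: "1" } };
const b = { index: 120, contentStartOffset: 4, namedCaptures: { num: "١" } };
mergeSplitPoints(a, b);
// b is preferred because it carries contentStartOffset, but nothing from a is lost:
// -> { index: 120, contentStartOffset: 4, meta: { chapter: "1" }, namedCaptures: { num: "١" } }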
3439
3975
  /**
3440
3976
  * If no structural rules produced segments, create a single segment spanning all pages.
3441
3977
  * This allows breakpoint processing to still run.
@@ -3468,7 +4004,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey,
3468
4004
  });
3469
4005
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
3470
4006
  if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
3471
- for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
4007
+ for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
3472
4008
  return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
3473
4009
  };
3474
4010
  /**
@@ -3508,7 +4044,7 @@ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
3508
4044
  * @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
3509
4045
  */
3510
4046
  const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
3511
- if (!content || !content.includes("\n")) return content;
4047
+ if (!content?.includes("\n")) return content;
3512
4048
  if (pageJoiner === "newline") return content;
3513
4049
  const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
3514
4050
  if (breaksInRange.length === 0) return content;
@@ -3616,16 +4152,23 @@ const segmentPages = (pages, options) => {
3616
4152
  * @returns Array of segment objects
3617
4153
  */
3618
4154
  const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
4155
+ const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
4156
+ const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
4157
+ const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
4158
+ const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
4159
+ ...meta,
4160
+ ...namedCaptures
4161
+ } : void 0;
3619
4162
  /**
3620
4163
  * Creates a single segment from a content range.
3621
4164
  */
3622
4165
  const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
3623
- const actualStart = start + (contentStartOffset ?? 0);
4166
+ const actualStart = getActualStart(start, contentStartOffset);
3624
4167
  const sliced = content.slice(actualStart, end);
3625
- let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
4168
+ let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
3626
4169
  if (!text) return null;
3627
4170
  if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
3628
- const adjustedStart = actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
4171
+ const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
3629
4172
  const from = pageMap.getId(adjustedStart);
3630
4173
  const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
3631
4174
  const seg = {
@@ -3633,10 +4176,8 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
3633
4176
  from
3634
4177
  };
3635
4178
  if (to !== from) seg.to = to;
3636
- if (meta || namedCaptures) seg.meta = {
3637
- ...meta,
3638
- ...namedCaptures
3639
- };
4179
+ const mergedMeta = applyMeta(meta, namedCaptures);
4180
+ if (mergedMeta) seg.meta = mergedMeta;
3640
4181
  return seg;
3641
4182
  };
3642
4183
  /**
@@ -3668,659 +4209,6 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
3668
4209
  }
3669
4210
  return [...segments, ...createSegmentsFromSplitPoints()];
3670
4211
  };
3671
-
3672
- //#endregion
3673
- //#region src/recovery.ts
3674
- const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
3675
- const normalizeForCompare = (s, mode) => {
3676
- if (mode === "none") return s;
3677
- let out = s;
3678
- if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
3679
- out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
3680
- return out;
3681
- };
3682
- const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
3683
- const buildFixedOptions = (options, selectedRuleIndices) => {
3684
- const fixedRules = (options.rules ?? []).map((r, idx) => {
3685
- if (!selectedRuleIndices.has(idx)) return r;
3686
- if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
3687
- const { lineStartsAfter, ...rest } = r;
3688
- return {
3689
- ...rest,
3690
- lineStartsWith: lineStartsAfter
3691
- };
3692
- });
3693
- return {
3694
- ...options,
3695
- rules: fixedRules
3696
- };
3697
- };
3698
- const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
3699
- const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
3700
- const parts = [];
3701
- for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
3702
- const matchContent = parts.join("\n");
3703
- if (pageJoiner === "newline") return {
3704
- matchContent,
3705
- outputContent: matchContent
3706
- };
3707
- return {
3708
- matchContent,
3709
- outputContent: parts.join(" ")
3710
- };
3711
- };
3712
- const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
3713
- const rules = options.rules ?? [];
3714
- const compiled = [];
3715
- for (const idx of selectedRuleIndices) {
3716
- const r = rules[idx];
3717
- if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
3718
- const { lineStartsAfter, ...rest } = r;
3719
- const built = buildRuleRegex({
3720
- ...rest,
3721
- lineStartsWith: lineStartsAfter
3722
- });
3723
- compiled.push({
3724
- ruleIndex: idx,
3725
- startsWithRegex: new RegExp(built.regex.source, "mu")
3726
- });
3727
- }
3728
- return compiled;
3729
- };
3730
- const findUniqueAnchorPos = (outputContent, segmentContent) => {
3731
- for (const len of [
3732
- 80,
3733
- 60,
3734
- 40,
3735
- 30,
3736
- 20,
3737
- 15
3738
- ]) {
3739
- const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
3740
- if (!needle.trim()) continue;
3741
- const first = outputContent.indexOf(needle);
3742
- if (first === -1) continue;
3743
- if (outputContent.indexOf(needle, first + 1) === -1) return first;
3744
- }
3745
- return null;
3746
- };
3747
- const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
3748
- const line = matchContent.slice(lineStart);
3749
- for (const mr of compiledMistaken) {
3750
- mr.startsWithRegex.lastIndex = 0;
3751
- const m = mr.startsWithRegex.exec(line);
3752
- if (!m || m.index !== 0) continue;
3753
- const markerMatch = m[0];
3754
- const markerEnd = lineStart + markerMatch.length;
3755
- if (anchorPos < markerEnd) continue;
3756
- const gap = matchContent.slice(markerEnd, anchorPos);
3757
- const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
3758
- if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
3759
- return { prefix: recoveredPrefix };
3760
- }
3761
- return { reason: "no selected marker pattern matched at anchored line start" };
3762
- };
3763
- const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
3764
- const fromIdx = pageIdToIndex.get(segment.from);
3765
- const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
3766
- if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
3767
- kind: "unresolved",
3768
- reason: "segment page range not found in pages"
3769
- };
3770
- const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
3771
- if (!segment.content) return {
3772
- kind: "unresolved",
3773
- reason: "empty segment content"
3774
- };
3775
- const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
3776
- if (anchorPos === null) return {
3777
- kind: "unresolved",
3778
- reason: "could not uniquely anchor segment content in page range"
3779
- };
3780
- const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
3781
- const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
3782
- if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
3783
- kind: "unresolved",
3784
- reason: found.reason
3785
- };
3786
- return {
3787
- kind: "recovered",
3788
- recoveredContent: `${found.prefix}${segment.content}`,
3789
- recoveredPrefix: found.prefix
3790
- };
3791
- };
3792
- const resolveRuleIndicesSelector = (rules, indicesIn) => {
3793
- const errors = [];
3794
- const indices = /* @__PURE__ */ new Set();
3795
- for (const idx of indicesIn) {
3796
- if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
3797
- errors.push(`Selector index out of range: ${idx}`);
3798
- continue;
3799
- }
3800
- const rule = rules[idx];
3801
- if (!rule || !("lineStartsAfter" in rule)) {
3802
- errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
3803
- continue;
3804
- }
3805
- indices.add(idx);
3806
- }
3807
- return {
3808
- errors,
3809
- indices,
3810
- warnings: []
3811
- };
3812
- };
3813
- const resolvePredicateSelector = (rules, predicate) => {
3814
- const errors = [];
3815
- const warnings = [];
3816
- const indices = /* @__PURE__ */ new Set();
3817
- rules.forEach((r, i) => {
3818
- try {
3819
- if (!predicate(r, i)) return;
3820
- if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
3821
- indices.add(i);
3822
- return;
3823
- }
3824
- warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
3825
- } catch (e) {
3826
- const msg = e instanceof Error ? e.message : String(e);
3827
- errors.push(`Predicate threw at rule ${i}: ${msg}`);
3828
- }
3829
- });
3830
- if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
3831
- return {
3832
- errors,
3833
- indices,
3834
- warnings
3835
- };
3836
- };
3837
- const resolvePatternsSelector = (rules, patterns, matchMode) => {
3838
- const errors = [];
3839
- const warnings = [];
3840
- const indices = /* @__PURE__ */ new Set();
3841
- const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
3842
- const targets = patterns.map(normalizePattern);
3843
- for (let pi = 0; pi < patterns.length; pi++) {
3844
- const rawPattern = patterns[pi];
3845
- const pat = targets[pi];
3846
- const matched = [];
3847
- for (let i = 0; i < rules.length; i++) {
3848
- const r = rules[i];
3849
- if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
3850
- if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
3851
- }
3852
- if (matched.length === 0) {
3853
- errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
3854
- continue;
3855
- }
3856
- if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
3857
- matched.forEach((i) => {
3858
- indices.add(i);
3859
- });
3860
- }
3861
- return {
3862
- errors,
3863
- indices,
3864
- warnings
3865
- };
3866
- };
3867
- const resolveSelectorToRuleIndices = (options, selector) => {
3868
- const rules = options.rules ?? [];
3869
- if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
3870
- if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
3871
- return resolvePatternsSelector(rules, selector.patterns, selector.match);
3872
- };
3873
- const longestCommonSuffixLength = (a, b) => {
3874
- const max = Math.min(a.length, b.length);
3875
- let i = 0;
3876
- while (i < max) {
3877
- if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
3878
- i++;
3879
- }
3880
- return i;
3881
- };
3882
- const AMBIGUITY_SCORE_GAP = 5;
3883
- const scoreCandidate = (orig, fixed, normalizeMode) => {
3884
- if (fixed.content === orig.content) return {
3885
- fixedIndex: -1,
3886
- kind: "exact",
3887
- score: 100
3888
- };
3889
- if (fixed.content.endsWith(orig.content)) {
3890
- const markerLen = fixed.content.length - orig.content.length;
3891
- return {
3892
- fixedIndex: -1,
3893
- kind: "exact_suffix",
3894
- score: 90 + Math.min(30, markerLen)
3895
- };
3896
- }
3897
- if (normalizeMode !== "none") {
3898
- const normFixed = normalizeForCompare(fixed.content, normalizeMode);
3899
- const normOrig = normalizeForCompare(orig.content, normalizeMode);
3900
- if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
3901
- const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
3902
- return {
3903
- fixedIndex: -1,
3904
- kind: "normalized_suffix",
3905
- score: 70 + Math.floor(overlap * 20)
3906
- };
3907
- }
3908
- }
3909
- return null;
3910
- };
3911
- const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
3912
- const warnings = [...reportBase.warnings];
3913
- warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
3914
- const details = segments.map((s, i) => {
3915
- const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
3916
- return {
3917
- from: s.from,
3918
- notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
3919
- originalStartPreview: preview(s.content),
3920
- segmentIndex: i,
3921
- status,
3922
- strategy: "none",
3923
- to: s.to
3924
- };
3925
- });
3926
- return {
3927
- report: {
3928
- ...reportBase,
3929
- details,
3930
- summary: {
3931
- mode,
3932
- recovered: 0,
3933
- totalSegments: segments.length,
3934
- unchanged: segments.length,
3935
- unresolved: selectorErrors.length ? segments.length : 0
3936
- },
3937
- warnings
3938
- },
3939
- segments
3940
- };
3941
- };
3942
- const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
3943
- const recoveredAtIndex = /* @__PURE__ */ new Map();
3944
- const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
3945
- if (mode !== "best_effort_then_rerun") return {
3946
- recoveredAtIndex,
3947
- recoveredDetailAtIndex
3948
- };
3949
- const pageIdToIndex = buildPageIdToIndex(pages);
3950
- const pageJoiner = options.pageJoiner ?? "space";
3951
- const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
3952
- for (let i = 0; i < segments.length; i++) {
3953
- const orig = segments[i];
3954
- const r = tryBestEffortRecoverOneSegment(orig, pages, pageIdToIndex, compiledMistaken, pageJoiner);
3955
- if (r.kind !== "recovered") continue;
3956
- const seg = {
3957
- ...orig,
3958
- content: r.recoveredContent
3959
- };
3960
- recoveredAtIndex.set(i, seg);
3961
- recoveredDetailAtIndex.set(i, {
3962
- from: orig.from,
3963
- originalStartPreview: preview(orig.content),
3964
- recoveredPrefixPreview: preview(r.recoveredPrefix),
3965
- recoveredStartPreview: preview(seg.content),
3966
- segmentIndex: i,
3967
- status: "recovered",
3968
- strategy: "stage1",
3969
- to: orig.to
3970
- });
3971
- }
3972
- return {
3973
- recoveredAtIndex,
3974
- recoveredDetailAtIndex
3975
- };
3976
- };
3977
- const buildFixedBuckets = (fixedSegments) => {
3978
- const buckets = /* @__PURE__ */ new Map();
3979
- for (let i = 0; i < fixedSegments.length; i++) {
3980
- const k = segmentRangeKey(fixedSegments[i]);
3981
- const arr = buckets.get(k);
3982
- if (!arr) buckets.set(k, [i]);
3983
- else arr.push(i);
3984
- }
3985
- return buckets;
3986
- };
3987
- const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
3988
- let best = null;
3989
- let secondBestScore = -Infinity;
3990
- for (const fixedIdx of candidates) {
3991
- if (usedFixed.has(fixedIdx)) continue;
3992
- const fixed = fixedSegments[fixedIdx];
3993
- const scored = scoreCandidate(orig, fixed, normalizeCompare);
3994
- if (!scored) continue;
3995
- const candidateScore = scored.score;
3996
- if (!best || candidateScore > best.score) {
3997
- secondBestScore = best?.score ?? -Infinity;
3998
- best = {
3999
- fixedIdx,
4000
- score: candidateScore
4001
- };
4002
- } else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
4003
- }
4004
- if (!best) return { kind: "none" };
4005
- if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
4006
- return {
4007
- fixedIdx: best.fixedIdx,
4008
- kind: "match"
4009
- };
4010
- };
- const detailUnresolved = (orig, segmentIndex, notes) => ({
- from: orig.from,
- notes,
- originalStartPreview: preview(orig.content),
- segmentIndex,
- status: "unresolved_alignment",
- strategy: "rerun",
- to: orig.to
- });
- const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
- from: orig.from,
- notes,
- originalStartPreview: preview(orig.content),
- segmentIndex,
- status: "skipped_idempotent",
- strategy: "rerun",
- to: orig.to
- });
- const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
- let recoveredPrefixPreview;
- if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
- return {
- from: orig.from,
- originalStartPreview: preview(orig.content),
- recoveredPrefixPreview,
- recoveredStartPreview: preview(fixed.content),
- segmentIndex,
- status: "recovered",
- strategy: "rerun",
- to: orig.to
- };
- };
- const mergeWithRerun = (params) => {
- const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
- const usedFixed = /* @__PURE__ */ new Set();
- const out = [];
- const details = [];
- let recovered = 0;
- let unresolved = 0;
- let unchanged = 0;
- for (let i = 0; i < originalSegments.length; i++) {
- const stage1Recovered = stage1RecoveredAtIndex.get(i);
- if (stage1Recovered) {
- out.push(stage1Recovered);
- recovered++;
- details.push(recoveredDetailAtIndex.get(i) ?? {
- from: stage1Recovered.from,
- originalStartPreview: preview(originalSegments[i].content),
- recoveredStartPreview: preview(stage1Recovered.content),
- segmentIndex: i,
- status: "recovered",
- strategy: "stage1",
- to: stage1Recovered.to
- });
- continue;
- }
- const orig = originalSegments[i];
- const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
- if (best.kind === "none") {
- out.push(orig);
- unresolved++;
- details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
- continue;
- }
- if (best.kind === "ambiguous") {
- out.push(orig);
- unresolved++;
- details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
- continue;
- }
- usedFixed.add(best.fixedIdx);
- const fixed = fixedSegments[best.fixedIdx];
- if (fixed.content === orig.content) {
- out.push(orig);
- unchanged++;
- details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
- continue;
- }
- out.push({
- ...orig,
- content: fixed.content
- });
- recovered++;
- details.push(detailRecoveredRerun(orig, fixed, i));
- }
- return {
- details,
- segments: out,
- summary: {
- recovered,
- unchanged,
- unresolved
- }
- };
- };
- function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
- const mode = opts?.mode ?? "rerun_only";
- const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
- const resolved = resolveSelectorToRuleIndices(options, selector);
- const reportBase = {
- byRun: void 0,
- errors: resolved.errors,
- warnings: resolved.warnings
- };
- if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
- const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
- const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
- const merged = mergeWithRerun({
- fixedBuckets: buildFixedBuckets(fixedSegments),
- fixedSegments,
- normalizeCompare,
- originalSegments: segments,
- recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
- stage1RecoveredAtIndex: stage1.recoveredAtIndex
- });
- return {
- report: {
- ...reportBase,
- details: merged.details,
- summary: {
- mode,
- recovered: merged.summary.recovered,
- totalSegments: segments.length,
- unchanged: merged.summary.unchanged,
- unresolved: merged.summary.unresolved
- }
- },
- segments: merged.segments
- };
- }
- function recoverMistakenMarkersForRuns(runs, opts) {
- const allSegments = [];
- const byRun = [];
- const details = [];
- const warnings = [];
- const errors = [];
- let recovered = 0;
- let unchanged = 0;
- let unresolved = 0;
- let offset = 0;
- for (let i = 0; i < runs.length; i++) {
- const run = runs[i];
- const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
- allSegments.push(...res.segments);
- for (const d of res.report.details) details.push({
- ...d,
- segmentIndex: d.segmentIndex + offset
- });
- offset += run.segments.length;
- recovered += res.report.summary.recovered;
- unchanged += res.report.summary.unchanged;
- unresolved += res.report.summary.unresolved;
- warnings.push(...res.report.warnings);
- errors.push(...res.report.errors);
- byRun.push({
- recovered: res.report.summary.recovered,
- runIndex: i,
- totalSegments: run.segments.length,
- unresolved: res.report.summary.unresolved
- });
- }
- return {
- report: {
- byRun,
- details,
- errors,
- summary: {
- mode: opts?.mode ?? "rerun_only",
- recovered,
- totalSegments: offset,
- unchanged,
- unresolved
- },
- warnings
- },
- segments: allSegments
- };
- }
-
- //#endregion
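Both recovery entry points removed above were public exports in 2.17.x (they appear in the old export list at the end of this diff) and are absent from the 2.19.0 export list. A call-shape sketch reconstructed only from the code above; the page/segment/option values are placeholders, and the selector shape is not visible in this diff.

```ts
// Call-shape sketch for the removed 2.17.x recovery API, reconstructed from the
// code above. All values are placeholders; this is not a working fixture.
import { recoverMistakenMarkersForRuns } from 'flappa-doormal'; // exported in 2.17.x only

declare const pages: any[];    // book pages previously passed to segmentPages
declare const options: any;    // segmentation options containing split rules
declare const segments: any[]; // segments produced by an earlier segmentPages run
declare const selector: any;   // rule selector; its shape is not shown in this diff

const { segments: repaired, report } = recoverMistakenMarkersForRuns(
    [{ pages, segments, options, selector }],
    { mode: 'rerun_only', normalizeCompare: 'whitespace' }, // the defaults shown above
);

// report.summary carries the counters built above:
// { mode, recovered, totalSegments, unchanged, unresolved }, plus per-run stats in report.byRun.
console.log(report.summary, report.byRun, repaired.length);
```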
- //#region src/segmentation/pattern-validator.ts
- const KNOWN_TOKENS = new Set(getAvailableTokens());
- const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
- const buildBareTokenRegex = () => {
- const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
- return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
- };
- /**
- * Validates a single pattern for common issues.
- */
- const validatePattern = (pattern, seenPatterns) => {
- if (!pattern.trim()) return {
- message: "Empty pattern is not allowed",
- type: "empty_pattern"
- };
- if (seenPatterns.has(pattern)) return {
- message: `Duplicate pattern: "${pattern}"`,
- pattern,
- type: "duplicate"
- };
- seenPatterns.add(pattern);
- TOKEN_INSIDE_BRACES.lastIndex = 0;
- for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
- const name = match[1];
- if (!KNOWN_TOKENS.has(name)) return {
- message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
- suggestion: "Check spelling or use a known token",
- token: name,
- type: "unknown_token"
- };
- }
- for (const match of pattern.matchAll(buildBareTokenRegex())) {
- const [full, name] = match;
- const idx = match.index;
- if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
- message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
- suggestion: `{{${full}}}`,
- token: name,
- type: "missing_braces"
- };
- }
- };
- /**
- * Validates an array of patterns, returning parallel array of issues.
- */
- const validatePatternArray = (patterns) => {
- const seen = /* @__PURE__ */ new Set();
- const issues = patterns.map((p) => validatePattern(p, seen));
- return issues.some(Boolean) ? issues : void 0;
- };
- /**
- * Validates split rules for common pattern issues.
- *
- * Checks for:
- * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
- * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
- * - Duplicate patterns within the same rule
- *
- * @param rules - Array of split rules to validate
- * @returns Array parallel to input with validation results (undefined if no issues)
- *
- * @example
- * const issues = validateRules([
- * { lineStartsAfter: ['raqms:num'] }, // Missing braces
- * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
- * ]);
- * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
- * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
- */
- const validateRules = (rules) => rules.map((rule) => {
- const result = {};
- let hasIssues = false;
- for (const key of [
- "lineStartsWith",
- "lineStartsAfter",
- "lineEndsWith"
- ]) if (key in rule && rule[key]) {
- const issues = validatePatternArray(rule[key]);
- if (issues) {
- result[key] = issues;
- hasIssues = true;
- }
- }
- if ("template" in rule && rule.template !== void 0) {
- const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
- if (issue) {
- result.template = issue;
- hasIssues = true;
- }
- }
- return hasIssues ? result : void 0;
- });
- /**
- * Formats a validation result array into a list of human-readable error messages.
- *
- * Useful for displaying validation errors in UIs.
- *
- * @param results - The result array from `validateRules()`
- * @returns Array of formatted error strings
- *
- * @example
- * const issues = validateRules(rules);
- * const errors = formatValidationReport(issues);
- * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
- */
- const formatValidationReport = (results) => results.flatMap((result, i) => {
- if (!result) return [];
- return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => {
- if (!issue) return null;
- const loc = `Rule ${i + 1}, ${type}`;
- if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
- if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
- if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
- return `${loc}: ${issue.message || issue.type}`;
- })).filter((msg) => msg !== null);
- });
-
- //#endregion
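`validateRules` and `formatValidationReport` remain exported in 2.19.0 (see the new export list at the end of this diff); only their position in the bundle moves. A short usage sketch combining the two, based on the JSDoc examples above; the rule contents are illustrative.

```ts
// Usage sketch based on the JSDoc examples above; rule contents are illustrative.
import { validateRules, formatValidationReport } from 'flappa-doormal';

const issues = validateRules([
    { lineStartsAfter: ['raqms:num'] },  // missing {{}} around a known token
    { lineStartsWith: ['{{unknown}}'] }, // unknown token name
]);

if (issues.some(Boolean)) {
    // Produces messages along the lines of:
    //   Rule 1, lineStartsAfter: Missing {{}} around token "raqms"
    //   Rule 2, lineStartsWith: Unknown token "{{unknown}}"
    for (const message of formatValidationReport(issues)) console.warn(message);
}
```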
- //#region src/validation/validation-constants.ts
- /**
- * Validation-specific constants
- */
- /**
- * Limit for validation issue preview length (characters).
- */
- const PREVIEW_LIMIT = 140;
- /**
- * Threshold for short segment content (characters).
- * Segments shorter than this will trigger a full-document search fallback
- * if not found in the expected window.
- */
- const FULL_SEARCH_THRESHOLD = 500;
-
  //#endregion
  //#region src/validation/validate-segments.ts
  /**
@@ -4329,8 +4217,8 @@ const FULL_SEARCH_THRESHOLD = 500;
  */
  const buildPreview = (text) => {
  const normalized = text.replace(/\s+/g, " ").trim();
- if (normalized.length <= PREVIEW_LIMIT) return normalized;
- return `${normalized.slice(0, PREVIEW_LIMIT)}...`;
+ if (normalized.length <= 140) return normalized;
+ return `${normalized.slice(0, 140)}...`;
  };
  /**
  * Creates a lightweight snapshot of a segment for inclusion in validation checks.
@@ -4358,19 +4246,18 @@ const normalizePages = (pages, options) => {
  */
  const buildJoinedContent = (pages, joiner) => {
  const boundaries = [];
- const nonEmptyPages = pages.filter((p) => p.content);
- const joined = nonEmptyPages.map((p) => p.content).join(joiner);
+ const joined = pages.map((p) => p.content).join(joiner);
  let offset = 0;
- for (let i = 0; i < nonEmptyPages.length; i++) {
- const content = nonEmptyPages[i].content;
+ for (let i = 0; i < pages.length; i++) {
+ const content = pages[i].content;
  const start = offset;
- const end = start + content.length - 1;
+ const end = start + content.length;
  boundaries.push({
  end,
- id: nonEmptyPages[i].id,
+ id: pages[i].id,
  start
  });
- offset = end + 1 + (i < nonEmptyPages.length - 1 ? joiner.length : 0);
+ offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
  }
  return {
  boundaries,
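In the new version of `buildJoinedContent` above, every page (including empty ones) gets a boundary, and `end` becomes an exclusive offset (`start + content.length`) rather than the old inclusive index. A standalone sketch of that offset arithmetic with made-up page contents:

```ts
// Sketch of the 2.19.0 boundary arithmetic shown above (exclusive `end`).
// Page contents and the joiner value are made-up examples.
type PageBoundary = { id: number; start: number; end: number };

const sketchBoundaries = (contents: string[], joiner: string): PageBoundary[] => {
    const boundaries: PageBoundary[] = [];
    let offset = 0;
    for (let i = 0; i < contents.length; i++) {
        const start = offset;
        const end = start + contents[i].length; // exclusive: joined.slice(start, end) === contents[i]
        boundaries.push({ id: i, start, end });
        offset += contents[i].length + (i < contents.length - 1 ? joiner.length : 0);
    }
    return boundaries;
};

// "ab" + "\n" + "" + "\n" + "cde" -> boundaries [0,2), [3,3), [4,7)
console.log(sketchBoundaries(['ab', '', 'cde'], '\n'));
```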
@@ -4561,7 +4448,7 @@ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, search
  const bufferSize = 1e3;
  const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
  if (rawMatches.length === 0) {
- const threshold = validationOptions?.fullSearchThreshold ?? FULL_SEARCH_THRESHOLD;
+ const threshold = validationOptions?.fullSearchThreshold ?? 500;
  if (content.length < threshold) {
  const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
  const validMatch = fullMatches.find((m) => {
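The inlined 500-character default above replaces the removed `FULL_SEARCH_THRESHOLD` constant, but it can still be overridden per call through `validationOptions.fullSearchThreshold`. A sketch with placeholder arguments; only the option name is taken from the hunk above:

```ts
// Sketch: overriding the inlined 500-character fallback threshold shown above.
// The page/option/segment values are placeholders, not a working fixture.
import { validateSegments } from 'flappa-doormal';

declare const pages: any[];
declare const options: any;
declare const segments: any[];

// Segments shorter than 800 characters fall back to a full-document search
// when they are not found in the expected window.
const result = validateSegments(pages, options, segments, { fullSearchThreshold: 800 });
console.log(result);
```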
@@ -4715,7 +4602,7 @@ const validateSegments = (pages, options, segments, validationOptions) => {
  }
  };
  };
-
  //#endregion
- export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+
  //# sourceMappingURL=index.mjs.map