flappa-doormal 2.17.1 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,141 +1,25 @@
1
- //#region src/utils/textUtils.ts
1
+ //#region src/segmentation/tokens.ts
2
2
  /**
3
- * Normalizes line endings to Unix-style (`\n`).
4
- *
5
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
6
- * for consistent pattern matching across platforms.
3
+ * Arabic base letters used by low-level dictionary-style regex helpers.
7
4
  *
8
- * @param content - Raw content with potentially mixed line endings
9
- * @returns Content with all line endings normalized to `\n`
5
+ * This is intentionally broader than `{{harf}}`:
6
+ * - includes standalone hamza `ء`
7
+ * - stays as a raw regex fragment rather than a template token
10
8
  */
11
- const normalizeLineEndings = (content) => {
12
- return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
13
- };
9
+ const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
14
10
  /**
15
- * Escapes regex metacharacters (parentheses and brackets) in template patterns,
16
- * but preserves content inside `{{...}}` token delimiters.
17
- *
18
- * This allows users to write intuitive patterns like `({{harf}}):` instead of
19
- * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
20
- * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
21
- *
22
- * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
23
- * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
24
- *
25
- * @example
26
- * escapeTemplateBrackets('({{harf}}): ')
27
- * // → '\\({{harf}}\\): '
28
- *
29
- * @example
30
- * escapeTemplateBrackets('[{{raqm}}] ')
31
- * // → '\\[{{raqm}}\\] '
32
- *
33
- * @example
34
- * escapeTemplateBrackets('{{harf}}')
35
- * // → '{{harf}}' (unchanged - no brackets outside tokens)
11
+ * Arabic combining marks / annotation signs used by low-level regex helpers.
36
12
  */
37
- const escapeTemplateBrackets = (pattern) => {
38
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
39
- };
13
+ const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
40
14
  /**
41
- * Character class matching all Arabic diacritics (Tashkeel/Harakat).
42
- *
43
- * Includes the following diacritical marks:
44
- * - U+064B: ً (fathatan - double fatha)
45
- * - U+064C: ٌ (dammatan - double damma)
46
- * - U+064D: ٍ (kasratan - double kasra)
47
- * - U+064E: َ (fatha - short a)
48
- * - U+064F: ُ (damma - short u)
49
- * - U+0650: ِ (kasra - short i)
50
- * - U+0651: ّ (shadda - gemination)
51
- * - U+0652: ْ (sukun - no vowel)
52
- *
53
- * @internal
15
+ * A single Arabic base letter followed by zero or more combining marks.
54
16
  */
55
- const DIACRITICS_CLASS = "[ًٌٍَُِّْ]";
17
+ const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
56
18
  /**
57
- * Groups of equivalent Arabic characters.
58
- *
59
- * Characters within the same group are considered equivalent for matching purposes.
60
- * This handles common variations in Arabic text where different characters are
61
- * used interchangeably or have the same underlying meaning.
62
- *
63
- * Equivalence groups:
64
- * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
65
- * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
66
- * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
67
- *
68
- * @internal
19
+ * One or more Arabic letters, where each letter may carry combining marks.
69
20
  */
70
- const EQUIV_GROUPS = [
71
- [
72
- "ا",
73
- "آ",
74
- "أ",
75
- "إ"
76
- ],
77
- ["ة", "ه"],
78
- ["ى", "ي"]
79
- ];
80
- /**
81
- * Escapes a string for safe inclusion in a regular expression.
82
- *
83
- * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
84
- *
85
- * @param s - Any string to escape
86
- * @returns String with regex metacharacters escaped
87
- *
88
- * @example
89
- * escapeRegex('hello.world') // → 'hello\\.world'
90
- * escapeRegex('[test]') // → '\\[test\\]'
91
- * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
92
- */
93
- const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
94
- const getEquivClass = (ch) => {
95
- const group = EQUIV_GROUPS.find((g) => g.includes(ch));
96
- return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
97
- };
98
- const normalizeArabicLight = (str) => {
99
- return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
100
- };
101
- const makeDiacriticInsensitive = (text) => {
102
- const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
103
- return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
104
- };
105
- const isCombiningMarkOrSelector = (char) => {
106
- if (!char) return false;
107
- return /\p{M}/u.test(char) || char === "︎" || char === "️";
108
- };
109
- const isJoiner = (char) => char === "‌" || char === "‍";
110
- /**
111
- * Ensures the position does not split a grapheme cluster (surrogate pairs,
112
- * combining marks, or zero-width joiners / variation selectors).
113
- *
114
- * This is only used as a last-resort fallback when we are forced to split
115
- * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
116
- */
117
- const adjustForUnicodeBoundary = (content, position) => {
118
- let adjusted = position;
119
- while (adjusted > 0) {
120
- const high = content.charCodeAt(adjusted - 1);
121
- const low = content.charCodeAt(adjusted);
122
- if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
123
- adjusted -= 1;
124
- continue;
125
- }
126
- const nextChar = content[adjusted];
127
- const prevChar = content[adjusted - 1];
128
- if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
129
- adjusted -= 1;
130
- continue;
131
- }
132
- break;
133
- }
134
- return adjusted;
135
- };
136
-
137
- //#endregion
138
- //#region src/segmentation/tokens.ts
21
+ const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
22
+ const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
139
23
  const RUMUZ_ATOM = `(?:${[
140
24
  "تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
141
25
  "خت",
@@ -166,15 +50,25 @@ const RUMUZ_ATOM = `(?:${[
166
50
  ].join("|")})`;
167
51
  const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
168
52
  const BASE_TOKENS = {
53
+ /** Chapter marker (باب). */
169
54
  bab: "باب",
55
+ /** Basmala (بسم الله). Also matches ﷽. */
170
56
  basmalah: ["بسم الله", "﷽"].join("|"),
57
+ /** Bullet point variants: `•`, `*`, `°`. */
171
58
  bullet: "[•*°]",
59
+ /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
172
60
  dash: "[-–—ـ]",
61
+ /** Section marker (فصل / مسألة). */
173
62
  fasl: ["مسألة", "فصل"].join("|"),
63
+ /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
174
64
  harf: "[أ-ي]",
175
- harfs: "[أ-ي](?:\\s+[أ-ي])*",
65
+ /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
66
+ harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
67
+ /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
176
68
  hr: "[-–—ـ_=]{5,}",
69
+ /** Book marker (كتاب). */
177
70
  kitab: "كتاب",
71
+ /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
178
72
  naql: [
179
73
  "حدثني",
180
74
  "وأخبرنا",
@@ -186,33 +80,58 @@ const BASE_TOKENS = {
186
80
  "وحدثني",
187
81
  "وحدثنيه"
188
82
  ].join("|"),
83
+ /** Newline character. Useful for breakpoints that split on line boundaries. */
189
84
  newline: "\\n",
85
+ /** Single ASCII digit (0-9). */
190
86
  num: "\\d",
87
+ /** One or more ASCII digits (0-9)+. */
191
88
  nums: "\\d+",
89
+ /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
192
90
  raqm: "[\\u0660-\\u0669]",
91
+ /** One or more Arabic-Indic digits (٠-٩)+. */
193
92
  raqms: "[\\u0660-\\u0669]+",
93
+ /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
194
94
  rumuz: RUMUZ_BLOCK,
95
+ /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
195
96
  tarqim: "[.!?؟؛]"
196
97
  };
197
98
  /** Pre-defined token constants for use in patterns. */
198
99
  const Token = {
100
+ /** Chapter marker - باب */
199
101
  BAB: "{{bab}}",
102
+ /** Basmala - بسم الله */
200
103
  BASMALAH: "{{basmalah}}",
104
+ /** Bullet point variants */
201
105
  BULLET: "{{bullet}}",
106
+ /** Dash variants (hyphen, en-dash, em-dash, tatweel) */
202
107
  DASH: "{{dash}}",
108
+ /** Section marker - فصل / مسألة */
203
109
  FASL: "{{fasl}}",
110
+ /** Single Arabic letter */
204
111
  HARF: "{{harf}}",
112
+ /** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
205
113
  HARFS: "{{harfs}}",
114
+ /** Horizontal rule / separator (repeated dashes) */
206
115
  HR: "{{hr}}",
116
+ /** Book marker - كتاب */
207
117
  KITAB: "{{kitab}}",
118
+ /** Hadith transmission phrases */
208
119
  NAQL: "{{naql}}",
120
+ /** Newline character (for breakpoints) */
209
121
  NEWLINE: "{{newline}}",
122
+ /** Single ASCII digit */
210
123
  NUM: "{{num}}",
124
+ /** Composite: {{raqms}} {{dash}} (space) */
211
125
  NUMBERED: "{{numbered}}",
126
+ /** One or more ASCII digits */
212
127
  NUMS: "{{nums}}",
128
+ /** Single Arabic-Indic digit */
213
129
  RAQM: "{{raqm}}",
130
+ /** One or more Arabic-Indic digits */
214
131
  RAQMS: "{{raqms}}",
132
+ /** Source abbreviations (rijāl/takhrīj) */
215
133
  RUMUZ: "{{rumuz}}",
134
+ /** Punctuation marks */
216
135
  TARQIM: "{{tarqim}}"
217
136
  };
218
137
  /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
@@ -222,7 +141,9 @@ const withCapture = (token, name) => {
222
141
  return `{{${match[1]}:${name}}}`;
223
142
  };
224
143
  /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
225
- const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
144
+ const COMPOSITE_TOKENS = {
145
+ /** Common hadith numbering format: Arabic-Indic digits + dash + space. */
146
+ numbered: "{{raqms}} {{dash}} " };
226
147
  /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
227
148
  const expandCompositeTokensInTemplate = (template) => {
228
149
  let out = template;
@@ -473,11 +394,11 @@ const templateToRegex = (template) => {
473
394
  * Useful for documentation, validation, or building user interfaces
474
395
  * that show available tokens.
475
396
  *
476
- * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
397
+ * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
477
398
  *
478
399
  * @example
479
400
  * getAvailableTokens()
480
- * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
401
+ * // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
481
402
  */
482
403
  const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
483
404
  /**
@@ -486,13 +407,13 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
486
407
  * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
487
408
  * without any expansion or capture group wrapping.
488
409
  *
489
- * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
490
- * @returns The regex pattern string, or `undefined` if token doesn't exist
410
+ * @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
411
+ * @returns The regex pattern string for that known token
491
412
  *
492
413
  * @example
493
414
  * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
494
415
  * getTokenPattern('dash') // → '[-–—ـ]'
495
- * getTokenPattern('unknown') // → undefined
416
+ * getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
496
417
  */
497
418
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
498
419
  /**
@@ -571,7 +492,161 @@ const applyTokenMappings = (template, mappings) => {
571
492
  const stripTokenMappings = (template) => {
572
493
  return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
573
494
  };
574
-
495
+ //#endregion
496
+ //#region src/utils/textUtils.ts
497
+ /**
498
+ * Normalizes line endings to Unix-style (`\n`).
499
+ *
500
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
501
+ * for consistent pattern matching across platforms.
502
+ *
503
+ * @param content - Raw content with potentially mixed line endings
504
+ * @returns Content with all line endings normalized to `\n`
505
+ */
506
+ const normalizeLineEndings = (content) => {
507
+ return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
508
+ };
509
+ /**
510
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
511
+ * but preserves content inside `{{...}}` token delimiters.
512
+ *
513
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
514
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
515
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
516
+ *
517
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
518
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
519
+ *
520
+ * @example
521
+ * escapeTemplateBrackets('({{harf}}): ')
522
+ * // → '\\({{harf}}\\): '
523
+ *
524
+ * @example
525
+ * escapeTemplateBrackets('[{{raqm}}] ')
526
+ * // → '\\[{{raqm}}\\] '
527
+ *
528
+ * @example
529
+ * escapeTemplateBrackets('{{harf}}')
530
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
531
+ */
532
+ const escapeTemplateBrackets = (pattern) => {
533
+ return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
534
+ };
535
+ /**
536
+ * Character class matching all Arabic diacritics (Tashkeel/Harakat).
537
+ *
538
+ * Includes the following diacritical marks:
539
+ * - U+0640: ـ (tatweel / kashida)
540
+ * - U+064B: ً (fathatan - double fatha)
541
+ * - U+064C: ٌ (dammatan - double damma)
542
+ * - U+064D: ٍ (kasratan - double kasra)
543
+ * - U+064E: َ (fatha - short a)
544
+ * - U+064F: ُ (damma - short u)
545
+ * - U+0650: ِ (kasra - short i)
546
+ * - U+0651: ّ (shadda - gemination)
547
+ * - U+0652: ْ (sukun - no vowel)
548
+ *
549
+ * @internal
550
+ */
551
+ const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
552
+ /**
553
+ * Groups of equivalent Arabic characters.
554
+ *
555
+ * Characters within the same group are considered equivalent for matching purposes.
556
+ * This handles common variations in Arabic text where different characters are
557
+ * used interchangeably or have the same underlying meaning.
558
+ *
559
+ * Equivalence groups:
560
+ * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
561
+ * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
562
+ * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
563
+ *
564
+ * @internal
565
+ */
566
+ const EQUIV_GROUPS = [
567
+ [
568
+ "ا",
569
+ "آ",
570
+ "أ",
571
+ "إ"
572
+ ],
573
+ ["ة", "ه"],
574
+ ["ى", "ي"]
575
+ ];
576
+ const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
577
+ /**
578
+ * Escapes a string for safe inclusion in a regular expression.
579
+ *
580
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
581
+ *
582
+ * @param s - Any string to escape
583
+ * @returns String with regex metacharacters escaped
584
+ *
585
+ * @example
586
+ * escapeRegex('hello.world') // → 'hello\\.world'
587
+ * escapeRegex('[test]') // → '\\[test\\]'
588
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
589
+ */
590
+ const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
591
+ const getEquivClass = (ch) => {
592
+ const group = EQUIV_GROUPS.find((g) => g.includes(ch));
593
+ return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
594
+ };
595
+ const normalizeArabicLight = (str) => {
596
+ return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
597
+ };
598
+ /**
599
+ * Normalizes Arabic text for exact comparisons while tolerating common variants.
600
+ *
601
+ * This removes Arabic diacritics, collapses whitespace, removes joiners, and
602
+ * maps common equivalent letters to a shared canonical form:
603
+ * - ا/آ/أ/إ -> ا
604
+ * - ة/ه -> ه
605
+ * - ى/ي -> ي
606
+ */
607
+ const normalizeArabicForComparison = (text) => {
608
+ return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
609
+ if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
610
+ if (ch === "ة") return "ه";
611
+ if (ch === "ى") return "ي";
612
+ return ch;
613
+ }).join("");
614
+ };
615
+ const makeDiacriticInsensitive = (text) => {
616
+ const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
617
+ return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
618
+ };
619
+ const isCombiningMarkOrSelector = (char) => {
620
+ if (!char) return false;
621
+ return /\p{M}/u.test(char) || char === "︎" || char === "️";
622
+ };
623
+ const isJoiner = (char) => char === "‌" || char === "‍";
624
+ /**
625
+ * Ensures the position does not split a grapheme cluster (surrogate pairs,
626
+ * combining marks, or zero-width joiners / variation selectors).
627
+ *
628
+ * This is only used as a last-resort fallback when we are forced to split
629
+ * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
630
+ */
631
+ const adjustForUnicodeBoundary = (content, position) => {
632
+ let adjusted = position;
633
+ while (adjusted > 0) {
634
+ const high = content.charCodeAt(adjusted - 1);
635
+ const low = content.charCodeAt(adjusted);
636
+ if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
637
+ adjusted -= 1;
638
+ continue;
639
+ }
640
+ const nextChar = content[adjusted];
641
+ const prevChar = content[adjusted - 1];
642
+ if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
643
+ adjusted -= 1;
644
+ continue;
645
+ }
646
+ break;
647
+ }
648
+ return adjusted;
649
+ };
575
650
  //#endregion
576
651
  //#region src/analysis/shared.ts
577
652
  const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
@@ -632,7 +707,6 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
632
707
  };
633
708
  const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
634
709
  const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
635
-
636
710
  //#endregion
637
711
  //#region src/analysis/line-starts.ts
638
712
  const resolveOptions$1 = (options = {}) => ({
@@ -658,65 +732,141 @@ const compareBySpecificity = (a, b) => {
658
732
  return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
659
733
  };
660
734
  const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
661
- /** Remove trailing whitespace placeholders */
662
- const trimTrailingWs = (out, mode) => {
663
- const suffix = mode === "regex" ? "\\s*" : " ";
664
- while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
665
- return out;
666
- };
667
- /** Try to extract first word for fallback */
668
- const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
669
- /** Consume prefix matchers at current position */
670
- const consumePrefixes = (s, pos, out, matchers, ws) => {
671
- let matched = false;
735
+ const appendPrefix = (s, pos, out, matchers, ws) => {
672
736
  for (const re of matchers) {
673
737
  if (pos >= s.length) break;
674
738
  const m = re.exec(s.slice(pos));
675
739
  if (!m?.index && m?.[0]) {
676
740
  out += escapeSignatureLiteral(m[0]);
677
741
  pos += m[0].length;
678
- matched = true;
679
742
  const wsm = /^[ \t]+/u.exec(s.slice(pos));
680
743
  if (wsm) {
681
744
  pos += wsm[0].length;
682
745
  out = appendWs(out, ws);
683
746
  }
747
+ return {
748
+ matched: true,
749
+ out,
750
+ pos
751
+ };
684
752
  }
685
753
  }
686
754
  return {
687
- matched,
755
+ matched: false,
688
756
  out,
689
757
  pos
690
758
  };
691
759
  };
692
- /** Try to match a token at current position and append to signature */
693
- const tryMatchToken = (s, pos, out, compiled) => {
760
+ const appendToken = (s, pos, out, compiled) => {
694
761
  const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
695
- if (!best) return {
696
- matched: false,
697
- out,
698
- pos
699
- };
700
- return {
762
+ return best ? {
701
763
  matched: true,
702
764
  out: `${out}{{${best.token}}}`,
703
765
  pos: pos + best.text.length
766
+ } : {
767
+ matched: false,
768
+ out,
769
+ pos
704
770
  };
705
771
  };
706
- /** Try to match a delimiter at current position */
707
- const tryMatchDelimiter = (s, pos, out) => {
772
+ const appendDelimiter = (s, pos, out) => {
708
773
  const ch = s[pos];
709
- if (!ch || !isCommonDelimiter(ch)) return {
774
+ return ch && isCommonDelimiter(ch) ? {
775
+ matched: true,
776
+ out: `${out}${escapeSignatureLiteral(ch)}`,
777
+ pos: pos + 1
778
+ } : {
710
779
  matched: false,
711
780
  out,
712
- pos
781
+ pos
782
+ };
783
+ };
784
+ const appendFallbackWord = (s, pos, out) => {
785
+ const word = extractFirstWord(s.slice(pos));
786
+ return word ? `${out}${escapeSignatureLiteral(word)}` : null;
787
+ };
788
+ const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
789
+ const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
790
+ if (ws.skipped) return {
791
+ done: false,
792
+ matchedAny,
793
+ matchedToken,
794
+ out: ws.out,
795
+ pos: ws.pos,
796
+ steps: 0
797
+ };
798
+ const tok = appendToken(s, pos, out, compiled);
799
+ if (tok.matched) return {
800
+ done: false,
801
+ matchedAny: true,
802
+ matchedToken: true,
803
+ out: tok.out,
804
+ pos: tok.pos,
805
+ steps: 1
806
+ };
807
+ if (matchedAny) {
808
+ const delim = appendDelimiter(s, pos, out);
809
+ if (delim.matched) return {
810
+ done: false,
811
+ matchedAny,
812
+ matchedToken,
813
+ out: delim.out,
814
+ pos: delim.pos,
815
+ steps: 0
816
+ };
817
+ if (opts.includeFirstWordFallback && !matchedToken) {
818
+ const fallback = appendFallbackWord(s, pos, out);
819
+ if (fallback) return {
820
+ done: true,
821
+ matchedAny,
822
+ matchedToken,
823
+ out: fallback,
824
+ pos,
825
+ steps: 1
826
+ };
827
+ }
828
+ return {
829
+ done: true,
830
+ matchedAny,
831
+ matchedToken,
832
+ out,
833
+ pos,
834
+ steps: 0
835
+ };
836
+ }
837
+ if (!opts.includeFirstWordFallback) return {
838
+ done: true,
839
+ matchedAny,
840
+ matchedToken,
841
+ out,
842
+ pos,
843
+ steps: 0
713
844
  };
714
- return {
715
- matched: true,
716
- out: out + escapeSignatureLiteral(ch),
717
- pos: pos + 1
845
+ const fallback = appendFallbackWord(s, pos, out);
846
+ return fallback ? {
847
+ done: true,
848
+ matchedAny: true,
849
+ matchedToken,
850
+ out: fallback,
851
+ pos,
852
+ steps: 0
853
+ } : {
854
+ done: true,
855
+ matchedAny,
856
+ matchedToken,
857
+ out,
858
+ pos,
859
+ steps: 0
718
860
  };
719
861
  };
862
+ /** Remove trailing whitespace placeholders */
863
+ const trimTrailingWs = (out, mode) => {
864
+ const suffix = mode === "regex" ? "\\s*" : " ";
865
+ while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
866
+ return out;
867
+ };
868
+ /** Try to extract first word for fallback */
869
+ const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
720
870
  /** Skip whitespace at position */
721
871
  const skipWhitespace$1 = (s, pos, out, ws) => {
722
872
  const m = /^[ \t]+/u.exec(s.slice(pos));
@@ -737,47 +887,25 @@ const tokenizeLineStart = (line, tokenNames, opts) => {
737
887
  const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
738
888
  const compiled = compileTokenRegexes(tokenNames);
739
889
  let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
740
- const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
890
+ const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
741
891
  pos = prefix.pos;
742
892
  out = prefix.out;
743
893
  matchedAny = prefix.matched;
744
894
  while (steps < 6 && pos < s.length) {
745
- const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
746
- if (ws.skipped) {
747
- pos = ws.pos;
748
- out = ws.out;
749
- continue;
750
- }
751
- const tok = tryMatchToken(s, pos, out, compiled);
752
- if (tok.matched) {
753
- pos = tok.pos;
754
- out = tok.out;
755
- matchedAny = matchedToken = true;
756
- steps++;
757
- continue;
758
- }
759
- if (matchedAny) {
760
- const delim = tryMatchDelimiter(s, pos, out);
761
- if (delim.matched) {
762
- pos = delim.pos;
763
- out = delim.out;
764
- continue;
765
- }
766
- }
767
- if (matchedAny) {
768
- if (opts.includeFirstWordFallback && !matchedToken) {
769
- const word = extractFirstWord(s.slice(pos));
770
- if (word) {
771
- out += escapeSignatureLiteral(word);
772
- steps++;
773
- }
774
- }
895
+ const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
896
+ if (next.done) {
897
+ if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
898
+ if (next.steps > 0) steps += next.steps;
899
+ matchedAny = next.matchedAny;
900
+ matchedToken = next.matchedToken;
901
+ out = next.out;
775
902
  break;
776
903
  }
777
- if (!opts.includeFirstWordFallback) return null;
778
- const word = extractFirstWord(s.slice(pos));
779
- if (!word) return null;
780
- return escapeSignatureLiteral(word);
904
+ pos = next.pos;
905
+ out = next.out;
906
+ matchedAny = next.matchedAny;
907
+ matchedToken = next.matchedToken;
908
+ steps += next.steps;
781
909
  }
782
910
  return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
783
911
  };
@@ -821,7 +949,6 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
821
949
  pattern
822
950
  })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
823
951
  };
824
-
825
952
  //#endregion
826
953
  //#region src/analysis/repeating-sequences.ts
827
954
  const resolveOptions = (options) => {
@@ -843,6 +970,7 @@ const resolveOptions = (options) => {
843
970
  const createRawCursor = (text, normalize) => {
844
971
  let rawPos = 0;
845
972
  return {
973
+ /** Advance cursor, returning the raw text chunk consumed */
846
974
  advance(normalizedLen) {
847
975
  if (!normalize) {
848
976
  const chunk = text.slice(rawPos, rawPos + normalizedLen);
@@ -947,23 +1075,27 @@ const buildExample = (page, window, contextChars) => {
947
1075
  text: page.content.slice(start, end)
948
1076
  };
949
1077
  };
1078
+ const recordPattern = (page, window, opts, stats) => {
1079
+ if (opts.requireToken && !hasTokenInWindow(window)) return;
1080
+ const pattern = buildPattern(window, opts.whitespace);
1081
+ let entry = stats.get(pattern);
1082
+ if (!entry) {
1083
+ if (stats.size >= opts.maxUniquePatterns) return;
1084
+ entry = {
1085
+ count: 0,
1086
+ examples: [],
1087
+ ...computeWindowStats(window)
1088
+ };
1089
+ stats.set(pattern, entry);
1090
+ }
1091
+ entry.count++;
1092
+ if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
1093
+ };
950
1094
  /** Extract N-grams from a single page */
951
1095
  const extractPageNgrams = (page, items, opts, stats) => {
952
- for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
953
- const window = items.slice(i, i + n);
954
- if (opts.requireToken && !hasTokenInWindow(window)) continue;
955
- const pattern = buildPattern(window, opts.whitespace);
956
- if (!stats.has(pattern)) {
957
- if (stats.size >= opts.maxUniquePatterns) continue;
958
- stats.set(pattern, {
959
- count: 0,
960
- examples: [],
961
- ...computeWindowStats(window)
962
- });
963
- }
964
- const entry = stats.get(pattern);
965
- entry.count++;
966
- if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
1096
+ for (let i = 0; i <= items.length - opts.minElements; i++) {
1097
+ const maxWindowSize = Math.min(opts.maxElements, items.length - i);
1098
+ for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
967
1099
  }
968
1100
  };
969
1101
  /**
@@ -985,7 +1117,6 @@ const analyzeRepeatingSequences = (pages, options) => {
985
1117
  pattern
986
1118
  }));
987
1119
  };
988
-
989
1120
  //#endregion
990
1121
  //#region src/detection.ts
991
1122
  /**
@@ -1147,7 +1278,6 @@ const analyzeTextForRule = (text) => {
1147
1278
  ...suggestPatternConfig(detected)
1148
1279
  };
1149
1280
  };
1150
-
1151
1281
  //#endregion
1152
1282
  //#region src/types/rules.ts
1153
1283
  /**
@@ -1172,7 +1302,6 @@ const PATTERN_TYPE_KEYS = [
1172
1302
  "template",
1173
1303
  "regex"
1174
1304
  ];
1175
-
1176
1305
  //#endregion
1177
1306
  //#region src/optimization/optimize-rules.ts
1178
1307
  const MERGEABLE_KEYS = new Set([
@@ -1231,7 +1360,6 @@ const optimizeRules = (rules) => {
1231
1360
  rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
1232
1361
  };
1233
1362
  };
1234
-
1235
1363
  //#endregion
1236
1364
  //#region src/preprocessing/transforms.ts
1237
1365
  /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
@@ -1340,170 +1468,89 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
1340
1468
  }
1341
1469
  return result;
1342
1470
  };
1343
-
1344
1471
  //#endregion
1345
- //#region src/segmentation/rule-regex.ts
1346
- /**
1347
- * Checks if a regex pattern contains standard (anonymous) capturing groups.
1348
- *
1349
- * Detects standard capturing groups `(...)` while excluding:
1350
- * - Non-capturing groups `(?:...)`
1351
- * - Lookahead assertions `(?=...)` and `(?!...)`
1352
- * - Lookbehind assertions `(?<=...)` and `(?<!...)`
1353
- * - Named groups `(?<name>...)` (start with `(?` so excluded here)
1354
- *
1355
- * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
1356
- */
1357
- const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
1358
- /**
1359
- * Extracts named capture group names from a regex pattern.
1360
- *
1361
- * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
1362
- *
1363
- * @example
1364
- * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
1365
- * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
1366
- * extractNamedCaptureNames('^\\d+') // []
1367
- */
1368
- const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([^>]+)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
1369
- /**
1370
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
1371
- */
1372
- const compileRuleRegex = (pattern) => {
1373
- try {
1374
- return new RegExp(pattern, "gmu");
1375
- } catch (error) {
1376
- throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
1472
+ //#region src/segmentation/arabic-dictionary-rule.ts
1473
+ const uniqueNormalizedWords = (words) => {
1474
+ const seen = /* @__PURE__ */ new Set();
1475
+ const result = [];
1476
+ for (const word of words) {
1477
+ const normalized = normalizeArabicForComparison(word);
1478
+ if (!normalized || seen.has(normalized)) continue;
1479
+ seen.add(normalized);
1480
+ result.push(normalized);
1377
1481
  }
1482
+ return result;
1378
1483
  };
1379
- /**
1380
- * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
1381
- *
1382
- * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
1383
- */
1384
- const processPattern = (pattern, fuzzy, capturePrefix) => {
1385
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
1386
- return {
1387
- captureNames,
1388
- pattern: expanded
1389
- };
1390
- };
1391
- /**
1392
- * Processes a breakpoint pattern by expanding tokens only.
1393
- *
1394
- * Unlike `processPattern`, this does NOT escape brackets because breakpoints
1395
- * are treated as raw regex patterns (like the `regex` rule type).
1396
- * Users have full control over regex syntax including `(?:...)` groups.
1397
- */
1398
- const processBreakpointPattern = (pattern) => {
1399
- const { pattern: expanded } = expandTokensWithCaptures(pattern);
1400
- return expanded;
1401
- };
1402
- const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
1403
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1404
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1405
- return {
1406
- captureNames: processed.flatMap((p) => p.captureNames),
1407
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
1408
- };
1409
- };
1410
- const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1411
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1412
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1413
- return {
1414
- captureNames: processed.flatMap((p) => p.captureNames),
1415
- regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
1416
- };
1484
+ const buildStopAlternation = (stopWords) => {
1485
+ const unique = uniqueNormalizedWords(stopWords);
1486
+ if (unique.length === 0) return "";
1487
+ return unique.map((word) => makeDiacriticInsensitive(word)).join("|");
1417
1488
  };
1418
- const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
1419
- const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
1420
- const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
1421
- return {
1422
- captureNames: processed.flatMap((p) => p.captureNames),
1423
- regex: `(?:${alternatives})$`
1424
- };
1489
+ const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
1490
+ if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
1491
+ const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
1492
+ return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
1425
1493
  };
1426
- const buildTemplateRegexSource = (template, capturePrefix) => {
1427
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
1428
- return {
1429
- captureNames,
1430
- regex: pattern
1431
- };
1494
+ const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
1495
+ const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
1496
+ const withCapture = captureName ? `(?<${captureName}>${headwordBody})` : `(?:${headwordBody})`;
1497
+ if (!allowParenthesized) return `${withCapture}${colon}`;
1498
+ return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
1432
1499
  };
1433
1500
  /**
1434
- * Builds a compiled regex and metadata from a split rule.
1501
+ * Creates a reusable split rule for Arabic dictionary entries.
1435
1502
  *
1436
- * Behavior mirrors the previous implementation in `segmenter.ts`.
1503
+ * The generated rule:
1504
+ * - keeps the lemma marker in `segment.content`
1505
+ * - stores the lemma in `segment.meta[captureName]`
1506
+ * - matches root entries at true line/page starts
1507
+ * - matches mid-line subentries conservatively when they begin with `و`
1508
+ * - can optionally support parenthesized headwords like `(عنبر) :`
1509
+ * - can optionally support comma-separated headword lists like `سبد، دبس:`
1510
+ *
1511
+ * @example
1512
+ * createArabicDictionaryEntryRule({
1513
+ * stopWords: ['وقيل', 'ويقال', 'قال'],
1514
+ * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1515
+ * })
1516
+ *
1517
+ * @example
1518
+ * createArabicDictionaryEntryRule({
1519
+ * allowParenthesized: true,
1520
+ * allowWhitespaceBeforeColon: true,
1521
+ * allowCommaSeparated: true,
1522
+ * stopWords: ['الليث', 'العجاج'],
1523
+ * })
1437
1524
  */
1438
- const buildRuleRegex = (rule, capturePrefix) => {
1439
- const { lineStartsWith, lineStartsAfter, lineEndsWith, template, regex } = rule;
1440
- const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy([
1441
- ...lineStartsWith ?? [],
1442
- ...lineStartsAfter ?? [],
1443
- ...lineEndsWith ?? []
1444
- ]);
1445
- if (lineStartsAfter?.length) {
1446
- const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(lineStartsAfter, fuzzy, capturePrefix);
1447
- return {
1448
- captureNames,
1449
- regex: compileRuleRegex(lsaRegex),
1450
- usesCapture: true,
1451
- usesLineStartsAfter: true
1452
- };
1453
- }
1454
- let finalRegex = regex;
1455
- let allCaptureNames = [];
1456
- if (lineStartsWith?.length) {
1457
- const res = buildLineStartsWithRegexSource(lineStartsWith, fuzzy, capturePrefix);
1458
- finalRegex = res.regex;
1459
- allCaptureNames = res.captureNames;
1460
- }
1461
- if (lineEndsWith?.length) {
1462
- const res = buildLineEndsWithRegexSource(lineEndsWith, fuzzy, capturePrefix);
1463
- finalRegex = res.regex;
1464
- allCaptureNames = res.captureNames;
1465
- }
1466
- if (template) {
1467
- const res = buildTemplateRegexSource(template, capturePrefix);
1468
- finalRegex = res.regex;
1469
- allCaptureNames = [...allCaptureNames, ...res.captureNames];
1470
- }
1471
- if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
1472
- if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
1525
+ const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
1526
+ if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
1527
+ if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
1528
+ if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
1529
+ const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
1530
+ const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
1531
+ const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
1532
+ const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
1533
+ const stopAlternation = buildStopAlternation(stopWords);
1534
+ const lemmaBody = buildHeadwordBody({
1535
+ allowCommaSeparated,
1536
+ colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
1537
+ stopAlternation,
1538
+ stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
1539
+ unit: lemmaUnit
1540
+ });
1473
1541
  return {
1474
- captureNames: allCaptureNames,
1475
- regex: compileRuleRegex(finalRegex),
1476
- usesCapture: hasCapturingGroup(finalRegex),
1477
- usesLineStartsAfter: false
1542
+ meta,
1543
+ pageStartPrevWordStoplist,
1544
+ regex: `(?:${`(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`}|${allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`})` + buildBalancedMarker({
1545
+ allowParenthesized,
1546
+ allowWhitespaceBeforeColon,
1547
+ captureName,
1548
+ headwordBody: lemmaBody
1549
+ }),
1550
+ samePagePrevWordStoplist,
1551
+ split: "at"
1478
1552
  };
1479
1553
  };
1480
-
1481
- //#endregion
1482
- //#region src/segmentation/breakpoint-constants.ts
1483
- /**
1484
- * Shared constants for segmentation breakpoint processing.
1485
- */
1486
- /**
1487
- * Threshold for using offset-based fast path in boundary processing.
1488
- *
1489
- * Below this: accurate string-search (handles offset drift from structural rules).
1490
- * At or above this: O(n) arithmetic (performance critical for large books).
1491
- *
1492
- * The value of 1000 is chosen based on typical Arabic book sizes:
1493
- * - Sahih al-Bukhari: ~1000-3000 pages
1494
- * - Standard hadith collections: 1000-7000 pages
1495
- * - Large aggregated corpora: 10k-50k pages
1496
- *
1497
- * For segments ≥1000 pages, the performance gain from offset-based slicing
1498
- * outweighs the minor accuracy loss from potential offset drift.
1499
- *
1500
- * @remarks
1501
- * Fast path is skipped when:
1502
- * - `maxContentLength` is set (requires character-accurate splitting)
1503
- * - `debugMetaKey` is set (requires proper provenance tracking)
1504
- * - Content was structurally modified by marker stripping (offsets may drift)
1505
- */
1506
- const FAST_PATH_THRESHOLD = 1e3;
1507
1554
  const WINDOW_PREFIX_LENGTHS = [
1508
1555
  80,
1509
1556
  60,
@@ -1530,23 +1577,6 @@ const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۝۞]/;
1530
1577
  * Matches outside this range are rejected unless `ignoreDeviation` is active.
1531
1578
  */
1532
1579
  const MAX_DEVIATION = 2e3;
1533
- /**
1534
- * Penalty score applied to non-newline anchor candidates.
1535
- *
1536
- * Designed to prioritize newline-aligned boundaries unless a whitespace match is
1537
- * significantly closer (within 20 chars). Handles cases where marker stripping
1538
- * shifts the boundary slightly.
1539
- */
1540
- const NON_NEWLINE_PENALTY = 20;
1541
- /**
1542
- * Limit for inferring start offset from a relaxed search (characters).
1543
- *
1544
- * If the relaxed search finds a match more than this distance away from the
1545
- * expected position, we assume it's a false positive (e.g. repeated content)
1546
- * and do not use it to infer the start offset.
1547
- */
1548
- const INFERENCE_PROXIMITY_LIMIT = 500;
1549
-
1550
1580
  //#endregion
1551
1581
  //#region src/segmentation/match-utils.ts
1552
1582
  /**
@@ -1665,7 +1695,6 @@ const extractDebugIndex = (groups, prefix) => {
1665
1695
  if (!Number.isNaN(idx)) return idx;
1666
1696
  }
1667
1697
  };
1668
-
1669
1698
  //#endregion
1670
1699
  //#region src/segmentation/breakpoint-utils.ts
1671
1700
  /**
@@ -2067,8 +2096,8 @@ const findAnchorCandidates = (content, prefix, start, end) => {
2067
2096
  /** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
2068
2097
  const selectBestAnchor = (candidates, expectedBoundary) => {
2069
2098
  return candidates.reduce((best, curr) => {
2070
- const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : NON_NEWLINE_PENALTY);
2071
- return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : NON_NEWLINE_PENALTY) < bestScore ? curr : best;
2099
+ const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
2100
+ return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
2072
2101
  });
2073
2102
  };
2074
2103
  /**
@@ -2122,7 +2151,7 @@ const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetI
2122
2151
  if (relaxedPos > 0) {
2123
2152
  const inferredStartOffset = rawBoundary - relaxedPos;
2124
2153
  const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
2125
- if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < INFERENCE_PROXIMITY_LIMIT) {
2154
+ if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
2126
2155
  startOffsetInFromPage = inferredStartOffset;
2127
2156
  expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
2128
2157
  pos = relaxedPos;
@@ -2196,7 +2225,7 @@ const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCoun
2196
2225
  const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
2197
2226
  const pageCount = toIdx - fromIdx + 1;
2198
2227
  const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
2199
- if (pageCount >= FAST_PATH_THRESHOLD && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
2228
+ if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
2200
2229
  return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
2201
2230
  };
2202
2231
  /**
@@ -2428,7 +2457,6 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
2428
2457
  }
2429
2458
  return -1;
2430
2459
  };
2431
-
2432
2460
  //#endregion
2433
2461
  //#region src/segmentation/debug-meta.ts
2434
2462
  const resolveDebugConfig = (debug) => {
@@ -2470,59 +2498,197 @@ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
2470
2498
  ...word !== void 0 ? { word } : {}
2471
2499
  } };
2472
2500
  };
2473
- const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
2474
- index: breakpointIndex,
2475
- kind: rule.pattern === "" ? "pageBoundary" : "pattern",
2476
- pattern: rule.pattern ?? rule.regex,
2477
- ...wordIndex !== void 0 ? { wordIndex } : {},
2478
- ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
2479
- } });
2501
+ const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
2502
+ index: breakpointIndex,
2503
+ kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
2504
+ pattern: rule.pattern ?? rule.regex,
2505
+ ...wordIndex !== void 0 ? { wordIndex } : {},
2506
+ ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
2507
+ } });
2508
+ /**
2509
+ * Helper to format the debug info into a human-readable string.
2510
+ * @param meta - The segment metadata object
2511
+ * @param options - Formatting options
2512
+ */
2513
+ const formatRuleReason = (rule, concise) => {
2514
+ const { index, patternType, wordIndex, word } = rule;
2515
+ if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
2516
+ const wordInfo = word ? ` (Matched: "${word}")` : "";
2517
+ return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
2518
+ };
2519
+ const formatBreakpointReason = (breakpoint, concise) => {
2520
+ const { index, kind, pattern, wordIndex, word } = breakpoint;
2521
+ if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
2522
+ if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
2523
+ if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
2524
+ return `Breakpoint #${index} (${kind}) - "${pattern}"`;
2525
+ };
2526
+ const formatContentLengthReason = (split, concise) => {
2527
+ const { maxContentLength, splitReason } = split;
2528
+ if (concise) return `> ${maxContentLength} (${splitReason})`;
2529
+ return `Safety Split (${splitReason}) > ${maxContentLength}`;
2530
+ };
2531
+ /**
2532
+ * Helper to format the debug info into a human-readable string.
2533
+ * @param meta - The segment metadata object
2534
+ * @param options - Formatting options
2535
+ */
2536
+ const getDebugReason = (meta, options) => {
2537
+ const debug = meta?._flappa;
2538
+ if (!debug) return "-";
2539
+ const concise = options?.concise;
2540
+ if (debug.rule) return formatRuleReason(debug.rule, concise);
2541
+ if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
2542
+ if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
2543
+ return "Unknown";
2544
+ };
2545
+ /**
2546
+ * Convenience helper to get the formatted debug reason directly from a segment.
2547
+ * @param segment - The segment object
2548
+ * @param options - Formatting options
2549
+ */
2550
+ const getSegmentDebugReason = (segment, options) => {
2551
+ return getDebugReason(segment.meta, options);
2552
+ };
2553
+ //#endregion
2554
+ //#region src/segmentation/pattern-validator.ts
2555
+ const KNOWN_TOKENS = new Set(getAvailableTokens());
2556
+ const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
2557
+ const buildBareTokenRegex = () => {
2558
+ const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
2559
+ return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
2560
+ };
2561
+ /**
2562
+ * Validates a single pattern for common issues.
2563
+ */
2564
+ const validatePattern = (pattern, seenPatterns) => {
2565
+ if (!pattern.trim()) return {
2566
+ message: "Empty pattern is not allowed",
2567
+ type: "empty_pattern"
2568
+ };
2569
+ if (seenPatterns.has(pattern)) return {
2570
+ message: `Duplicate pattern: "${pattern}"`,
2571
+ pattern,
2572
+ type: "duplicate"
2573
+ };
2574
+ seenPatterns.add(pattern);
2575
+ TOKEN_INSIDE_BRACES.lastIndex = 0;
2576
+ for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
2577
+ const name = match[1];
2578
+ if (!KNOWN_TOKENS.has(name)) return {
2579
+ message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
2580
+ suggestion: "Check spelling or use a known token",
2581
+ token: name,
2582
+ type: "unknown_token"
2583
+ };
2584
+ }
2585
+ for (const match of pattern.matchAll(buildBareTokenRegex())) {
2586
+ const [full, name] = match;
2587
+ const idx = match.index;
2588
+ if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
2589
+ message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
2590
+ suggestion: `{{${full}}}`,
2591
+ token: name,
2592
+ type: "missing_braces"
2593
+ };
2594
+ }
2595
+ };
2480
2596
  /**
2481
- * Helper to format the debug info into a human-readable string.
2482
- * @param meta - The segment metadata object
2483
- * @param options - Formatting options
2597
+ * Validates an array of patterns, returning parallel array of issues.
2484
2598
  */
2485
- const formatRuleReason = (rule, concise) => {
2486
- const { index, patternType, wordIndex, word } = rule;
2487
- if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
2488
- const wordInfo = word ? ` (Matched: "${word}")` : "";
2489
- return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
2599
+ const validatePatternArray = (patterns) => {
2600
+ const seen = /* @__PURE__ */ new Set();
2601
+ const issues = patterns.map((p) => validatePattern(p, seen));
2602
+ return issues.some(Boolean) ? issues : void 0;
2490
2603
  };
2491
- const formatBreakpointReason = (breakpoint, concise) => {
2492
- const { index, kind, pattern, wordIndex, word } = breakpoint;
2493
- if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
2494
- if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
2495
- if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
2496
- return `Breakpoint #${index} (${kind}) - "${pattern}"`;
2604
+ const applyRulePatternValidation = (result, key, patterns) => {
2605
+ if (!patterns) return false;
2606
+ const issues = validatePatternArray(patterns);
2607
+ if (!issues) return false;
2608
+ result[key] = issues;
2609
+ return true;
2497
2610
  };
2498
- const formatContentLengthReason = (split, concise) => {
2499
- const { maxContentLength, splitReason } = split;
2500
- if (concise) return `> ${maxContentLength} (${splitReason})`;
2501
- return `Safety Split (${splitReason}) > ${maxContentLength}`;
2611
+ const validateTemplateRule = (rule, result) => {
2612
+ if (rule.template === void 0) return false;
2613
+ const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
2614
+ if (!issue) return false;
2615
+ result.template = issue;
2616
+ return true;
2617
+ };
2618
+ const validateRegexRule = (rule, result) => {
2619
+ if (rule.regex === void 0) return false;
2620
+ if (!rule.regex.trim()) {
2621
+ result.regex = {
2622
+ message: "Empty pattern is not allowed",
2623
+ type: "empty_pattern"
2624
+ };
2625
+ return true;
2626
+ }
2627
+ try {
2628
+ new RegExp(rule.regex, "u");
2629
+ return false;
2630
+ } catch (error) {
2631
+ result.regex = {
2632
+ message: error instanceof Error ? error.message : String(error),
2633
+ pattern: rule.regex,
2634
+ type: "invalid_regex"
2635
+ };
2636
+ return true;
2637
+ }
2638
+ };
2639
+ const formatValidationIssue = (_type, issue, loc) => {
2640
+ if (!issue) return null;
2641
+ if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
2642
+ if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
2643
+ if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
2644
+ if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
2645
+ return `${loc}: ${issue.message || issue.type}`;
2502
2646
  };
2503
2647
  /**
2504
- * Helper to format the debug info into a human-readable string.
2505
- * @param meta - The segment metadata object
2506
- * @param options - Formatting options
2648
+ * Validates split rules for common pattern issues.
2649
+ *
2650
+ * Checks for:
2651
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
2652
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
2653
+ * - Duplicate patterns within the same rule
2654
+ *
2655
+ * @param rules - Array of split rules to validate
2656
+ * @returns Array parallel to input with validation results (undefined if no issues)
2657
+ *
2658
+ * @example
2659
+ * const issues = validateRules([
2660
+ * { lineStartsAfter: ['raqms:num'] }, // Missing braces
2661
+ * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
2662
+ * ]);
2663
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
2664
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
2507
2665
  */
2508
- const getDebugReason = (meta, options) => {
2509
- const debug = meta?._flappa;
2510
- if (!debug) return "-";
2511
- const concise = options?.concise;
2512
- if (debug.rule) return formatRuleReason(debug.rule, concise);
2513
- if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
2514
- if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
2515
- return "Unknown";
2516
- };
2666
+ const validateRules = (rules) => rules.map((rule) => {
2667
+ const result = {};
2668
+ const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
2669
+ const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
2670
+ const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
2671
+ const templateIssues = validateTemplateRule(rule, result);
2672
+ const regexIssues = validateRegexRule(rule, result);
2673
+ return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues ? result : void 0;
2674
+ });
2517
2675
  /**
2518
- * Convenience helper to get the formatted debug reason directly from a segment.
2519
- * @param segment - The segment object
2520
- * @param options - Formatting options
2676
+ * Formats a validation result array into a list of human-readable error messages.
2677
+ *
2678
+ * Useful for displaying validation errors in UIs.
2679
+ *
2680
+ * @param results - The result array from `validateRules()`
2681
+ * @returns Array of formatted error strings
2682
+ *
2683
+ * @example
2684
+ * const issues = validateRules(rules);
2685
+ * const errors = formatValidationReport(issues);
2686
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
2521
2687
  */
2522
- const getSegmentDebugReason = (segment, options) => {
2523
- return getDebugReason(segment.meta, options);
2524
- };
2525
-
2688
+ const formatValidationReport = (results) => results.flatMap((result, i) => {
2689
+ if (!result) return [];
2690
+ return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${i + 1}, ${type}`)).filter((msg) => msg !== null));
2691
+ });
2526
2692
  //#endregion
2527
2693
  //#region src/segmentation/breakpoint-processor.ts
2528
2694
  const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
@@ -2650,7 +2816,7 @@ const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx,
2650
2816
  const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
2651
2817
  const driftTolerance = Math.max(100, fullContent.length * .01);
2652
2818
  const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
2653
- if (!isAligned && pageCount >= FAST_PATH_THRESHOLD) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
2819
+ if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
2654
2820
  actualLength: fullContent.length,
2655
2821
  drift: Math.abs(expectedLength - fullContent.length),
2656
2822
  expectedLength,
@@ -2791,8 +2957,7 @@ const computeWindowEndPositionForIteration = (remainingContent, cursorPos, curre
2791
2957
  if (maxPages === 0) {
2792
2958
  const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
2793
2959
  const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
2794
- const capped = maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage;
2795
- return Math.min(capped, remainingContent.length);
2960
+ return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
2796
2961
  }
2797
2962
  const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
2798
2963
  return Math.min(pos, remainingContent.length);
@@ -2847,7 +3012,7 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
2847
3012
  const pageCount = toIdx - fromIdx + 1;
2848
3013
  const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
2849
3014
  const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
2850
- if (pageCount < FAST_PATH_THRESHOLD || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
3015
+ if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
2851
3016
  if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
2852
3017
  return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
2853
3018
  };
@@ -3030,7 +3195,178 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
3030
3195
  logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
3031
3196
  return result;
3032
3197
  };
3033
-
3198
+ //#endregion
3199
+ //#region src/segmentation/rule-regex.ts
3200
+ /**
3201
+ * Checks if a regex pattern contains standard (anonymous) capturing groups.
3202
+ *
3203
+ * Detects standard capturing groups `(...)` while excluding:
3204
+ * - Non-capturing groups `(?:...)`
3205
+ * - Lookahead assertions `(?=...)` and `(?!...)`
3206
+ * - Lookbehind assertions `(?<=...)` and `(?<!...)`
3207
+ * - Named groups `(?<name>...)` (start with `(?` so excluded here)
3208
+ *
3209
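+ * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`. The check is purely textual: any `(` not immediately followed by `?` counts, so an escaped `\(` or a `(` inside a character class is also reported as a capturing group.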
3210
+ */
3211
+ const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
3212
+ /**
3213
+ * Extracts named capture group names from a regex pattern, omitting any name that starts with `_r` or `_w`.
3214
+ *
3215
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
3216
+ *
3217
+ * @example
3218
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
3219
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
3220
+ * extractNamedCaptureNames('^\\d+') // []
3221
+ */
3222
+ const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
3223
+ /**
3224
+ * Compiles a regex pattern with the `g`, `m`, and `u` flags; an invalid pattern is rethrown as an `Error` whose message includes the pattern and the underlying cause.
3225
+ */
3226
+ const compileRuleRegex = (pattern) => {
3227
+ try {
3228
+ return new RegExp(pattern, "gmu");
3229
+ } catch (error) {
3230
+ throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
3231
+ }
3232
+ };
3233
+ /**
3234
+ * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
3235
+ *
3236
+ * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
3237
+ */
3238
+ const processPattern = (pattern, fuzzy, capturePrefix) => {
3239
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
3240
+ return {
3241
+ captureNames,
3242
+ pattern: expanded
3243
+ };
3244
+ };
3245
+ /**
3246
+ * Processes a breakpoint pattern by expanding tokens only.
3247
+ *
3248
+ * Unlike `processPattern`, this does NOT escape brackets because breakpoints
3249
+ * are treated as raw regex patterns (like the `regex` rule type).
3250
+ * Users have full control over regex syntax including `(?:...)` groups.
3251
+ */
3252
+ const processBreakpointPattern = (pattern) => {
3253
+ const { pattern: expanded } = expandTokensWithCaptures(pattern);
3254
+ return expanded;
3255
+ };
3256
+ /**
3257
+ * Builds the raw regex source for a `lineStartsAfter` rule.
3258
+ *
3259
+ * Expands each pattern through `processPattern()`, combines them into an
3260
+ * alternation at the start of a line, and appends a trailing content capture.
3261
+ *
3262
+ * @param patterns - Template-like line-start markers to match
3263
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3264
+ * @param capturePrefix - Optional prefix used for internal named captures
3265
+ * @returns Regex source plus the named captures extracted from the patterns
3266
+ */
3267
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
3268
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3269
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3270
+ return {
3271
+ captureNames: processed.flatMap((p) => p.captureNames),
3272
+ regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
3273
+ };
3274
+ };
3275
+ /**
3276
+ * Builds the raw regex source for a `lineStartsWith` rule.
3277
+ *
3278
+ * Expands each pattern through `processPattern()` and combines them into an
3279
+ * alternation anchored at the start of a line.
3280
+ *
3281
+ * @param patterns - Template-like line-start markers to match
3282
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3283
+ * @param capturePrefix - Optional prefix used for internal named captures
3284
+ * @returns Regex source plus the named captures extracted from the patterns
3285
+ */
3286
+ const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3287
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3288
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3289
+ return {
3290
+ captureNames: processed.flatMap((p) => p.captureNames),
3291
+ regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
3292
+ };
3293
+ };
3294
+ /**
3295
+ * Builds the raw regex source for a `lineEndsWith` rule.
3296
+ *
3297
+ * Expands each pattern through `processPattern()` and combines them into an
3298
+ * end-anchored alternation.
3299
+ *
3300
+ * @param patterns - Template-like line-end markers to match
3301
+ * @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
3302
+ * @param capturePrefix - Optional prefix used for internal named captures
3303
+ * @returns Regex source plus the named captures extracted from the patterns
3304
+ */
3305
+ const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
3306
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
3307
+ const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
3308
+ return {
3309
+ captureNames: processed.flatMap((p) => p.captureNames),
3310
+ regex: `(?:${alternatives})$`
3311
+ };
3312
+ };
3313
+ /**
3314
+ * Builds the raw regex source for a `template` rule.
3315
+ *
3316
+ * Expands tokens and named captures via `expandTokensWithCaptures()` after
3317
+ * applying `escapeTemplateBrackets()` to non-token brackets.
3318
+ *
3319
+ * @param template - Template string containing optional `{{token}}` markers
3320
+ * @param capturePrefix - Optional prefix used for internal named captures
3321
+ * @returns Regex source plus the named captures extracted from the template
3322
+ */
3323
+ const buildTemplateRegexSource = (template, capturePrefix) => {
3324
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
3325
+ return {
3326
+ captureNames,
3327
+ regex: pattern
3328
+ };
3329
+ };
3330
+ const getFuzzyCandidatePatterns = (rule) => [
3331
+ ..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
3332
+ ..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
3333
+ ..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
3334
+ ];
3335
+ const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
3336
+ if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
3337
+ if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
3338
+ if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
3339
+ return null;
3340
+ };
3341
+ /**
3342
+ * Builds a compiled regex and metadata from a split rule.
3343
+ *
3344
+ * Behavior mirrors the previous implementation in `segmenter.ts`.
3345
+ */
3346
+ const buildRuleRegex = (rule, capturePrefix) => {
3347
+ const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
3348
+ if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
3349
+ const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
3350
+ return {
3351
+ captureNames,
3352
+ regex: compileRuleRegex(lsaRegex),
3353
+ usesCapture: true,
3354
+ usesLineStartsAfter: true
3355
+ };
3356
+ }
3357
+ const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
3358
+ let finalRegex = ruleRegexSource?.regex;
3359
+ let allCaptureNames = ruleRegexSource?.captureNames ?? [];
3360
+ if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
3361
+ if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
3362
+ if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
3363
+ return {
3364
+ captureNames: allCaptureNames,
3365
+ regex: compileRuleRegex(finalRegex),
3366
+ usesCapture: hasCapturingGroup(finalRegex),
3367
+ usesLineStartsAfter: false
3368
+ };
3369
+ };
3034
3370
  //#endregion
3035
3371
  //#region src/segmentation/fast-fuzzy-prefix.ts
3036
3372
  /**
@@ -3078,9 +3414,8 @@ const compileFastFuzzyTokenRule = (tokenTemplate) => {
3078
3414
  const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
3079
3415
  if (!m) return null;
3080
3416
  const token = m[1];
3081
- const tokenPattern = getTokenPattern(token);
3082
- if (!tokenPattern) return null;
3083
- const compiled = compileLiteralAlternation(tokenPattern);
3417
+ if (!(token in TOKEN_PATTERNS)) return null;
3418
+ const compiled = compileLiteralAlternation(getTokenPattern(token));
3084
3419
  return compiled ? {
3085
3420
  alternatives: compiled.alternatives,
3086
3421
  token
@@ -3093,11 +3428,11 @@ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
3093
3428
  }
3094
3429
  return null;
3095
3430
  };
3096
-
3097
3431
  //#endregion
3098
3432
  //#region src/segmentation/segmenter-rule-utils.ts
3099
3433
  const tryCompileFastFuzzyRule = (rule) => {
3100
- if (!rule.fuzzy) return null;
3434
+ const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
3435
+ if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
3101
3436
  if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
3102
3437
  const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
3103
3438
  if (compiled) return {
@@ -3139,7 +3474,10 @@ const partitionRulesForMatching = (rules) => {
3139
3474
  prefix: `r${index}_`,
3140
3475
  rule
3141
3476
  });
3142
- else standaloneRules.push(rule);
3477
+ else standaloneRules.push({
3478
+ index,
3479
+ rule
3480
+ });
3143
3481
  }
3144
3482
  return {
3145
3483
  combinableRules,
@@ -3147,9 +3485,37 @@ const partitionRulesForMatching = (rules) => {
3147
3485
  standaloneRules
3148
3486
  };
3149
3487
  };
3488
+ const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
3489
+ const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
3490
+ const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
3491
+ const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
3492
+ const trimTrailingPageWrapNoise = (text) => {
3493
+ let trimmed = text.trimEnd();
3494
+ while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
3495
+ return trimmed;
3496
+ };
3497
+ const endsWithStrongSentenceTerminator = (pageContent) => {
3498
+ return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
3499
+ };
3500
+ const extractLastArabicWord = (pageContent) => {
3501
+ return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
3502
+ };
3503
+ const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
3504
+ if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
3505
+ const lastWord = extractLastArabicWord(previousPageContent);
3506
+ return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
3507
+ };
3508
+ const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
3509
+ if (!stoplist) return true;
3510
+ const lastWord = extractLastArabicWord(contentBeforeMatch);
3511
+ return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
3512
+ };
3150
3513
  const createPageStartGuardChecker = (matchContent, pageMap) => {
3151
3514
  const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
3152
3515
  const compiledPageStartPrev = /* @__PURE__ */ new Map();
3516
+ const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
3517
+ const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
3518
+ const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
3153
3519
  const getPageStartPrevRegex = (rule, ruleIndex) => {
3154
3520
  if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
3155
3521
  const pattern = rule.pageStartGuard;
@@ -3161,6 +3527,33 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
3161
3527
  compiledPageStartPrev.set(ruleIndex, re);
3162
3528
  return re;
3163
3529
  };
3530
+ const getPrevWordStoplist = (rule, ruleIndex) => {
3531
+ if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
3532
+ const stoplist = rule.pageStartPrevWordStoplist;
3533
+ if (!stoplist?.length) {
3534
+ compiledPrevWordStoplists.set(ruleIndex, null);
3535
+ return null;
3536
+ }
3537
+ const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3538
+ compiledPrevWordStoplists.set(ruleIndex, normalized);
3539
+ return normalized;
3540
+ };
3541
+ const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
3542
+ if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
3543
+ const stoplist = rule.samePagePrevWordStoplist;
3544
+ if (!stoplist?.length) {
3545
+ compiledSamePagePrevWordStoplists.set(ruleIndex, null);
3546
+ return null;
3547
+ }
3548
+ const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
3549
+ compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
3550
+ return normalized;
3551
+ };
3552
+ const getPreviousPageContent = (boundaryIndex) => {
3553
+ if (boundaryIndex <= 0) return "";
3554
+ const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
3555
+ return matchContent.slice(prevBoundary.start, prevBoundary.end);
3556
+ };
3164
3557
  const getPrevPageLastNonWsChar = (boundaryIndex) => {
3165
3558
  if (boundaryIndex <= 0) return "";
3166
3559
  const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
@@ -3170,13 +3563,24 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
3170
3563
  }
3171
3564
  return "";
3172
3565
  };
3566
+ const getCurrentPageContentBeforeMatch = (matchStart) => {
3567
+ const pageId = pageMap.getId(matchStart);
3568
+ const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
3569
+ if (boundaryIndex === void 0) return "";
3570
+ const boundary = pageMap.boundaries[boundaryIndex];
3571
+ return matchContent.slice(boundary.start, matchStart);
3572
+ };
3173
3573
  return (rule, ruleIndex, matchStart) => {
3174
3574
  const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
3175
- if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
3176
- const prevReq = getPageStartPrevRegex(rule, ruleIndex);
3177
- if (!prevReq) return true;
3178
- const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
3179
- return lastChar ? prevReq.test(lastChar) : false;
3575
+ if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
3576
+ const prevReq = getPageStartPrevRegex(rule, ruleIndex);
3577
+ if (prevReq) {
3578
+ const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
3579
+ if (!lastChar || !prevReq.test(lastChar)) return false;
3580
+ }
3581
+ return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
3582
+ }
3583
+ return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
3180
3584
  };
3181
3585
  };
3182
3586
  /**
@@ -3212,10 +3616,10 @@ const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule,
3212
3616
  /**
3213
3617
  * Processes matches for all fast-fuzzy rules at a specific line start.
3214
3618
  */
3215
- const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart, splitPointsByRule) => {
3619
+ const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
3216
3620
  for (const ffRule of fastFuzzyRules) {
3217
3621
  if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
3218
- if (isPageStart && !passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
3622
+ if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
3219
3623
  attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
3220
3624
  }
3221
3625
  };
@@ -3230,19 +3634,17 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
3230
3634
  currentBoundary = pageMap.boundaries[boundaryIdx];
3231
3635
  }
3232
3636
  };
3233
- const isPageStart = (offset) => offset === currentBoundary?.start;
3234
3637
  for (let lineStart = 0; lineStart <= matchContent.length;) {
3235
3638
  advanceBoundaryTo(lineStart);
3236
3639
  const pageId = currentBoundary?.id ?? 0;
3237
3640
  if (lineStart >= matchContent.length) break;
3238
- processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart(lineStart), splitPointsByRule);
3641
+ processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
3239
3642
  const nextNl = matchContent.indexOf("\n", lineStart);
3240
3643
  if (nextNl === -1) break;
3241
3644
  lineStart = nextNl + 1;
3242
3645
  }
3243
3646
  return splitPointsByRule;
3244
3647
  };
3245
-
3246
3648
  //#endregion
3247
3649
  //#region src/segmentation/split-point-helpers.ts
3248
3650
  const MAX_REGEX_ITERATIONS = 1e5;
@@ -3256,7 +3658,7 @@ const buildContentOffsets = (match, ruleInfo) => {
3256
3658
  if (!ruleInfo.usesLineStartsAfter) return {};
3257
3659
  const captured = match.groups?.[`${ruleInfo.prefix}__content`];
3258
3660
  if (captured === void 0) return {};
3259
- return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
3661
+ return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
3260
3662
  };
3261
3663
  const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
3262
3664
  const createSplitPointFromMatch = (match, rule, ruleInfo) => {
@@ -3271,7 +3673,32 @@ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
3271
3673
  wordIndex
3272
3674
  };
3273
3675
  };
3676
+ const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
3677
+ const arr = splitPointsByRule.get(originalIndex);
3678
+ if (!arr) {
3679
+ splitPointsByRule.set(originalIndex, [point]);
3680
+ return;
3681
+ }
3682
+ arr.push(point);
3683
+ };
3684
+ /**
3685
+ * Executes a combined regex over the content for combinable rules and records
3686
+ * any resulting split points into `splitPointsByRule`.
3687
+ *
3688
+ * This function mutates `splitPointsByRule` in place and throws if the regex
3689
+ * iteration guard is exceeded.
3690
+ *
3691
+ * @param matchContent - Concatenated content being segmented
3692
+ * @param combinableRules - Rules that can be combined into a single alternation
3693
+ * @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
3694
+ * @param pageMap - Page boundary mapping utilities for the content
3695
+ * @param passesPageStartGuard - Callback that decides whether a match is allowed
3696
+ * @param splitPointsByRule - Mutable map collecting split points by rule index
3697
+ * @param logger - Optional logger for iteration diagnostics
3698
+ * @returns Nothing; results are written into `splitPointsByRule`
3699
+ */
3274
3700
  const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
3701
+ assertCombinedRuleAlignment(combinableRules, ruleRegexes);
3275
3702
  const combinedSource = ruleRegexes.map((r) => r.source).join("|");
3276
3703
  const combinedRegex = new RegExp(combinedSource, "gm");
3277
3704
  logger?.debug?.("[segmenter] combined regex built", {
@@ -3286,19 +3713,29 @@ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, page
3286
3713
  iterations,
3287
3714
  position: m.index
3288
3715
  });
3289
- const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
3290
- if (matchedIndex !== -1) {
3291
- const { rule, index: originalIndex } = combinableRules[matchedIndex];
3292
- if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
3293
- const arr = splitPointsByRule.get(originalIndex);
3294
- if (!arr) splitPointsByRule.set(originalIndex, [createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex])]);
3295
- else arr.push(createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex]));
3296
- }
3297
- }
3716
+ processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
3298
3717
  if (m[0].length === 0) combinedRegex.lastIndex++;
3299
3718
  m = combinedRegex.exec(matchContent);
3300
3719
  }
3301
3720
  };
3721
+ const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
3722
+ if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
3723
+ for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
3724
+ };
3725
+ const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
3726
+ const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
3727
+ if (matchedIndex === -1) return;
3728
+ const { rule, index: originalIndex } = combinableRules[matchedIndex];
3729
+ if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
3730
+ addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
3731
+ };
3732
+ /**
3733
+ * Builds compiled regex metadata for each combinable rule while preserving the
3734
+ * prefix used to identify the matching branch inside a combined alternation.
3735
+ *
3736
+ * @param combinableRules - Rules eligible for combined-regex processing
3737
+ * @returns Rule regex metadata aligned with the input order
3738
+ */
3302
3739
  const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
3303
3740
  const built = buildRuleRegex(rule, prefix);
3304
3741
  return {
@@ -3307,6 +3744,18 @@ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefi
3307
3744
  source: `(?<${prefix}>${built.regex.source})`
3308
3745
  };
3309
3746
  });
3747
+ /**
3748
+ * Processes a standalone rule by matching it independently and appending its
3749
+ * resulting split points into `splitPointsByRule`.
3750
+ *
3751
+ * @param rule - The standalone split rule to evaluate
3752
+ * @param ruleIndex - Original rule index in the caller's rules array
3753
+ * @param matchContent - Concatenated content being segmented
3754
+ * @param pageMap - Page boundary mapping utilities for the content
3755
+ * @param passesPageStartGuard - Callback that decides whether a match is allowed
3756
+ * @param splitPointsByRule - Mutable map collecting split points by rule index
3757
+ * @returns Nothing; results are written into `splitPointsByRule`
3758
+ */
3310
3759
  const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
3311
3760
  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
3312
3761
  const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
@@ -3341,6 +3790,15 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
3341
3790
  }
3342
3791
  return matches;
3343
3792
  };
3793
+ /**
3794
+ * Applies per-rule occurrence filtering and optional debug metadata patches to
3795
+ * the collected split points.
3796
+ *
3797
+ * @param rules - Full rule list in original order
3798
+ * @param splitPointsByRule - Split points grouped by originating rule index
3799
+ * @param debugMetaKey - Optional metadata key used for debug provenance patches
3800
+ * @returns Flattened split points after occurrence filtering and debug merging
3801
+ */
3344
3802
  const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
3345
3803
  const result = [];
3346
3804
  rules.forEach((rule, index) => {
@@ -3358,7 +3816,6 @@ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
3358
3816
  });
3359
3817
  return result;
3360
3818
  };
3361
-
3362
3819
  //#endregion
3363
3820
  //#region src/segmentation/segmenter.ts
3364
3821
  /**
@@ -3432,10 +3889,30 @@ const dedupeSplitPoints = (splitPoints) => {
3432
3889
  const byIndex = /* @__PURE__ */ new Map();
3433
3890
  for (const p of splitPoints) {
3434
3891
  const existing = byIndex.get(p.index);
3435
- if (!existing || p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
3892
+ if (!existing) {
3893
+ byIndex.set(p.index, p);
3894
+ continue;
3895
+ }
3896
+ byIndex.set(p.index, mergeSplitPoints(existing, p));
3436
3897
  }
3437
3898
  return [...byIndex.values()].sort((a, b) => a.index - b.index);
3438
3899
  };
3900
+ const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
3901
+ const mergeRecord = (existing, incoming) => existing || incoming ? {
3902
+ ...existing ?? {},
3903
+ ...incoming ?? {}
3904
+ } : void 0;
3905
+ const mergeSplitPoints = (existing, incoming) => {
3906
+ const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
3907
+ const fallback = preferred === incoming ? existing : incoming;
3908
+ return {
3909
+ ...fallback,
3910
+ ...preferred,
3911
+ contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
3912
+ meta: mergeRecord(existing.meta, incoming.meta),
3913
+ namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
3914
+ };
3915
+ };
3439
3916
  /**
3440
3917
  * If no structural rules produced segments, create a single segment spanning all pages.
3441
3918
  * This allows breakpoint processing to still run.
@@ -3468,7 +3945,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey,
3468
3945
  });
3469
3946
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
3470
3947
  if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
3471
- for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
3948
+ for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
3472
3949
  return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
3473
3950
  };
3474
3951
  /**
@@ -3508,7 +3985,7 @@ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
3508
3985
  * @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
3509
3986
  */
3510
3987
  const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
3511
- if (!content || !content.includes("\n")) return content;
3988
+ if (!content?.includes("\n")) return content;
3512
3989
  if (pageJoiner === "newline") return content;
3513
3990
  const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
3514
3991
  if (breaksInRange.length === 0) return content;
@@ -3616,16 +4093,23 @@ const segmentPages = (pages, options) => {
3616
4093
  * @returns Array of segment objects
3617
4094
  */
3618
4095
  const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
4096
+ const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
4097
+ const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
4098
+ const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
4099
+ const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
4100
+ ...meta,
4101
+ ...namedCaptures
4102
+ } : void 0;
3619
4103
  /**
3620
4104
  * Creates a single segment from a content range.
3621
4105
  */
3622
4106
  const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
3623
- const actualStart = start + (contentStartOffset ?? 0);
4107
+ const actualStart = getActualStart(start, contentStartOffset);
3624
4108
  const sliced = content.slice(actualStart, end);
3625
- let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
4109
+ let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
3626
4110
  if (!text) return null;
3627
4111
  if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
3628
- const adjustedStart = actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
4112
+ const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
3629
4113
  const from = pageMap.getId(adjustedStart);
3630
4114
  const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
3631
4115
  const seg = {
@@ -3633,10 +4117,8 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
3633
4117
  from
3634
4118
  };
3635
4119
  if (to !== from) seg.to = to;
3636
- if (meta || namedCaptures) seg.meta = {
3637
- ...meta,
3638
- ...namedCaptures
3639
- };
4120
+ const mergedMeta = applyMeta(meta, namedCaptures);
4121
+ if (mergedMeta) seg.meta = mergedMeta;
3640
4122
  return seg;
3641
4123
  };
3642
4124
  /**
@@ -3668,659 +4150,6 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
3668
4150
  }
3669
4151
  return [...segments, ...createSegmentsFromSplitPoints()];
3670
4152
  };
3671
-
3672
- //#endregion
3673
- //#region src/recovery.ts
3674
/**
 * Shortens a string for display in reports.
 *
 * @param s - Text to preview
 * @param max - Maximum number of UTF-16 code units kept (default 40)
 * @returns `s` unchanged when short enough, otherwise its first `max` units followed by `…`
 */
const preview = (s, max = 40) => {
    if (s.length > max) {
        return `${s.slice(0, max)}…`;
    }
    return s;
};
3675
/**
 * Normalizes text before comparison.
 *
 * - `"none"`: returns the input untouched.
 * - `"whitespace_and_nfkc"`: applies NFKC and strips U+200C, U+200D and U+FEFF,
 *   then performs the whitespace normalization below.
 * - any other mode: unifies line endings, collapses whitespace runs to one
 *   space, and trims.
 *
 * @param s - Text to normalize
 * @param mode - Normalization mode
 * @returns The normalized text
 */
const normalizeForCompare = (s, mode) => {
    if (mode === "none") {
        return s;
    }
    const base = mode === "whitespace_and_nfkc"
        ? s.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "")
        : s;
    return base.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
};
3682
/**
 * Builds a lookup key from a segment's page range.
 * A segment without `to` is treated as ending on its `from` page.
 *
 * @param s - Segment with `from` and optional `to`
 * @returns A key of the form `from|to`
 */
const segmentRangeKey = (s) => {
    const lastPage = s.to ?? s.from;
    return `${s.from}|${lastPage}`;
};
3683
/**
 * Returns a copy of `options` where each selected rule that has a truthy
 * `lineStartsAfter` has that key replaced by `lineStartsWith` (same patterns).
 * Other rules are passed through by reference.
 *
 * @param options - Segmentation options (may lack `rules`)
 * @param selectedRuleIndices - Set of rule indices to convert
 * @returns New options object with the rewritten `rules` array
 */
const buildFixedOptions = (options, selectedRuleIndices) => {
    const rules = (options.rules ?? []).map((rule, position) => {
        if (!selectedRuleIndices.has(position)) {
            return rule;
        }
        if (!("lineStartsAfter" in rule && rule.lineStartsAfter)) {
            return rule;
        }
        const { lineStartsAfter: lineStartsWith, ...others } = rule;
        return { ...others, lineStartsWith };
    });
    return { ...options, rules };
};
3698
/**
 * Indexes pages by id.
 *
 * @param pages - Array of pages, each with an `id`
 * @returns Map from page id to its position in `pages` (last one wins on duplicate ids)
 */
const buildPageIdToIndex = (pages) => {
    const lookup = new Map();
    pages.forEach((page, position) => {
        lookup.set(page.id, position);
    });
    return lookup;
};
3699
/**
 * Joins the content of pages `fromIdx..toIdx` (inclusive) in two forms.
 *
 * @param processedPages - Pages whose `content` is read
 * @param fromIdx - Index of the first page
 * @param toIdx - Index of the last page
 * @param pageJoiner - `"newline"` makes both forms identical; anything else joins the output form with a space
 * @returns `matchContent` (pages joined with `\n`) and `outputContent`
 */
const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
    const pageCount = Math.max(0, toIdx - fromIdx + 1);
    const parts = Array.from({ length: pageCount }, (_, step) => normalizeLineEndings(processedPages[fromIdx + step].content));
    const matchContent = parts.join("\n");
    // Both joiners are a single character, so offsets line up between the two forms.
    const outputContent = pageJoiner === "newline" ? matchContent : parts.join(" ");
    return { matchContent, outputContent };
};
3712
/**
 * Compiles each selected `lineStartsAfter` rule as if it were a
 * `lineStartsWith` rule, so the marker text itself can be matched.
 * Selected indices without a non-empty `lineStartsAfter` are ignored.
 *
 * @param options - Segmentation options (may lack `rules`)
 * @param selectedRuleIndices - Iterable of rule indices to compile
 * @returns Array of `{ ruleIndex, startsWithRegex }`, the regex rebuilt with flags `mu`
 */
const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
    const allRules = options.rules ?? [];
    return [...selectedRuleIndices].flatMap((ruleIndex) => {
        const rule = allRules[ruleIndex];
        if (!rule || !("lineStartsAfter" in rule) || !rule.lineStartsAfter?.length) {
            return [];
        }
        const { lineStartsAfter, ...others } = rule;
        const built = buildRuleRegex({ ...others, lineStartsWith: lineStartsAfter });
        return [{ ruleIndex, startsWithRegex: new RegExp(built.regex.source, "mu") }];
    });
};
3730
/**
 * Finds where a segment starts inside a larger text by searching for a
 * prefix of the segment that occurs exactly once. Prefix lengths are tried
 * from longest to shortest (80, 60, 40, 30, 20, 15).
 *
 * @param outputContent - Text to search in
 * @param segmentContent - Segment text whose prefix is used as the needle
 * @returns Index of the unique occurrence, or `null` when no tried prefix is unique
 */
const findUniqueAnchorPos = (outputContent, segmentContent) => {
    const probeLengths = [80, 60, 40, 30, 20, 15];
    for (const probeLength of probeLengths) {
        const needle = segmentContent.slice(0, Math.min(probeLength, segmentContent.length));
        // Whitespace-only needles cannot anchor anything meaningful.
        if (needle.trim() === "") {
            continue;
        }
        const firstHit = outputContent.indexOf(needle);
        const isUnique = firstHit !== -1 && outputContent.indexOf(needle, firstHit + 1) === -1;
        if (isUnique) {
            return firstHit;
        }
    }
    return null;
};
3747
/**
 * Tries each compiled marker regex at the start of the anchored line and
 * works out which prefix was stripped from the segment.
 *
 * @param segmentContent - Current segment text
 * @param matchContent - Joined page content the regexes run against
 * @param lineStart - Index in `matchContent` where the anchored line begins
 * @param anchorPos - Index in `matchContent` where the segment text begins
 * @param compiledMistaken - Array of `{ startsWithRegex }` entries
 * @returns `{ prefix }` when a marker is recovered, otherwise `{ reason }`
 */
const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
    const lineText = matchContent.slice(lineStart);
    for (const entry of compiledMistaken) {
        entry.startsWithRegex.lastIndex = 0;
        const hit = entry.startsWithRegex.exec(lineText);
        // Only a match sitting exactly at the line start counts.
        if (!hit || hit.index !== 0) {
            continue;
        }
        const [marker] = hit;
        const markerEnd = lineStart + marker.length;
        // The marker must end at or before the place the segment text begins.
        if (markerEnd > anchorPos) {
            continue;
        }
        const gap = matchContent.slice(markerEnd, anchorPos);
        // Whitespace between marker and segment is restored along with the marker.
        const prefix = /^\s*$/u.test(gap) ? `${marker}${gap}` : marker;
        const alreadyPresent = segmentContent.startsWith(marker) || segmentContent.startsWith(prefix);
        return alreadyPresent ? { reason: "content already starts with selected marker" } : { prefix };
    }
    return { reason: "no selected marker pattern matched at anchored line start" };
};
3763
/**
 * Attempts to restore a stripped marker on a single segment by locating the
 * segment inside its page range and matching the selected marker patterns at
 * the start of the line it sits on.
 *
 * @param segment - Segment with `from`, optional `to`, and `content`
 * @param processedPages - Pages used to rebuild the range content
 * @param pageIdToIndex - Map from page id to index in `processedPages`
 * @param compiledMistaken - Compiled marker regexes
 * @param pageJoiner - Joiner mode used for the output form of the content
 * @returns `{ kind: "recovered", recoveredContent, recoveredPrefix }`,
 *   `{ kind: "skipped_idempotent" }`, or `{ kind: "unresolved", reason }`
 */
const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
    const unresolved = (reason) => ({ kind: "unresolved", reason });
    const fromIdx = pageIdToIndex.get(segment.from);
    // An unknown `to` page falls back to the `from` page index.
    const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
    if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) {
        return unresolved("segment page range not found in pages");
    }
    const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
    if (!segment.content) {
        return unresolved("empty segment content");
    }
    const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
    if (anchorPos === null) {
        return unresolved("could not uniquely anchor segment content in page range");
    }
    // Start of the line containing the anchor: one past the preceding newline, or 0.
    const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
    const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
    if (!("reason" in found)) {
        return {
            kind: "recovered",
            recoveredContent: `${found.prefix}${segment.content}`,
            recoveredPrefix: found.prefix,
        };
    }
    if (found.reason.includes("already starts")) {
        return { kind: "skipped_idempotent" };
    }
    return unresolved(found.reason);
};
3792
/**
 * Resolves an explicit list of rule indices to the set of usable
 * `lineStartsAfter` rules.
 *
 * An index is rejected (with an error message) when it is not an integer,
 * is out of range, or points at a rule that has no non-empty
 * `lineStartsAfter` array. The non-empty requirement matches the predicate
 * and pattern selectors, which also ignore rules whose `lineStartsAfter` is
 * missing or empty; such rules carry no marker to recover.
 *
 * @param rules - All configured rules
 * @param indicesIn - Iterable of candidate rule indices
 * @returns `{ errors, indices, warnings }`; `warnings` is always empty here
 */
const resolveRuleIndicesSelector = (rules, indicesIn) => {
    const errors = [];
    const indices = /* @__PURE__ */ new Set();
    for (const idx of indicesIn) {
        if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
            errors.push(`Selector index out of range: ${idx}`);
            continue;
        }
        const rule = rules[idx];
        if (!rule || !("lineStartsAfter" in rule) || !rule.lineStartsAfter?.length) {
            errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
            continue;
        }
        indices.add(idx);
    }
    return {
        errors,
        indices,
        warnings: [],
    };
};
3813
/**
 * Resolves a predicate selector to the set of matching rule indices.
 *
 * Rules accepted by the predicate but lacking a non-empty `lineStartsAfter`
 * produce a warning; a predicate that throws produces an error for that rule
 * and evaluation continues with the next one.
 *
 * @param rules - All configured rules
 * @param predicate - Called as `predicate(rule, index)`
 * @returns `{ errors, indices, warnings }`
 */
const resolvePredicateSelector = (rules, predicate) => {
    const errors = [];
    const warnings = [];
    const indices = /* @__PURE__ */ new Set();
    rules.forEach((rule, position) => {
        try {
            if (!predicate(rule, position)) {
                return;
            }
            const isMarkerRule = "lineStartsAfter" in rule && rule.lineStartsAfter?.length;
            if (isMarkerRule) {
                indices.add(position);
            } else {
                warnings.push(`Predicate selected rule ${position}, but it is not a lineStartsAfter rule; skipping`);
            }
        } catch (e) {
            const detail = e instanceof Error ? e.message : String(e);
            errors.push(`Predicate threw at rule ${position}: ${detail}`);
        }
    });
    if (indices.size === 0) {
        warnings.push("Predicate did not select any lineStartsAfter rules");
    }
    return { errors, indices, warnings };
};
3837
/**
 * Resolves a list of pattern strings to the indices of the `lineStartsAfter`
 * rules that contain them.
 *
 * A pattern that matches no rule is reported as an error; a pattern that
 * matches more than one rule is reported as a warning and all matches are
 * selected.
 *
 * @param rules - All configured rules
 * @param patterns - Pattern strings to look up
 * @param matchMode - `"normalized"` compares after whitespace/NFKC normalization; anything else (default `"exact"`) compares raw strings
 * @returns `{ errors, indices, warnings }`
 */
const resolvePatternsSelector = (rules, patterns, matchMode) => {
    const errors = [];
    const warnings = [];
    const indices = /* @__PURE__ */ new Set();
    const canonical = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
    const targets = patterns.map(canonical);
    for (let pi = 0; pi < patterns.length; pi++) {
        const rawPattern = patterns[pi];
        const wanted = targets[pi];
        const matched = [];
        for (const [ruleIndex, rule] of rules.entries()) {
            const hasMarkers = "lineStartsAfter" in rule && rule.lineStartsAfter?.length;
            if (hasMarkers && rule.lineStartsAfter.some((candidate) => canonical(candidate) === wanted)) {
                matched.push(ruleIndex);
            }
        }
        if (matched.length === 0) {
            errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
            continue;
        }
        if (matched.length > 1) {
            warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
        }
        for (const ruleIndex of matched) {
            indices.add(ruleIndex);
        }
    }
    return { errors, indices, warnings };
};
3867
/**
 * Dispatches a selector to the matching resolver based on `selector.type`.
 * `"rule_indices"` and `"predicate"` have dedicated resolvers; any other
 * type is handled as a patterns selector.
 *
 * @param options - Segmentation options (may lack `rules`)
 * @param selector - Selector object with a `type` discriminator
 * @returns `{ errors, indices, warnings }` from the chosen resolver
 */
const resolveSelectorToRuleIndices = (options, selector) => {
    const rules = options.rules ?? [];
    switch (selector.type) {
        case "rule_indices":
            return resolveRuleIndicesSelector(rules, selector.indices);
        case "predicate":
            return resolvePredicateSelector(rules, selector.predicate);
        default:
            return resolvePatternsSelector(rules, selector.patterns, selector.match);
    }
};
3873
/**
 * Counts how many trailing UTF-16 code units two strings share.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Length of the longest common suffix
 */
const longestCommonSuffixLength = (a, b) => {
    const limit = Math.min(a.length, b.length);
    let shared = 0;
    // Walk both strings backwards in lockstep until they diverge.
    for (let ai = a.length - 1, bi = b.length - 1; shared < limit && a[ai] === b[bi]; ai--, bi--) {
        shared++;
    }
    return shared;
};
3882
- const AMBIGUITY_SCORE_GAP = 5;
3883
/**
 * Scores how well a rerun ("fixed") segment corresponds to an original one.
 *
 * - identical content: kind `"exact"`, score 100
 * - fixed content ends with the original: kind `"exact_suffix"`,
 *   score 90 plus the prefix length capped at 30
 * - same suffix relation after normalization (skipped when
 *   `normalizeMode` is `"none"`): kind `"normalized_suffix"`
 *
 * @param orig - Original segment (`content` is read)
 * @param fixed - Candidate segment from the rerun (`content` is read)
 * @param normalizeMode - Mode passed to `normalizeForCompare`
 * @returns `{ fixedIndex: -1, kind, score }`, or `null` when unrelated
 */
const scoreCandidate = (orig, fixed, normalizeMode) => {
    const scored = (kind, score) => ({ fixedIndex: -1, kind, score });
    if (fixed.content === orig.content) {
        return scored("exact", 100);
    }
    if (fixed.content.endsWith(orig.content)) {
        const markerLen = fixed.content.length - orig.content.length;
        return scored("exact_suffix", 90 + Math.min(30, markerLen));
    }
    if (normalizeMode === "none") {
        return null;
    }
    const normFixed = normalizeForCompare(fixed.content, normalizeMode);
    const normOrig = normalizeForCompare(orig.content, normalizeMode);
    if (normOrig.length === 0 || !normFixed.endsWith(normOrig)) {
        return null;
    }
    // normFixed ends with normOrig here, so the shared suffix covers all of normOrig.
    const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
    return scored("normalized_suffix", 70 + Math.floor(overlap * 20));
};
3911
/**
 * Builds the result returned when no `lineStartsAfter` rule was selected:
 * segments are passed through untouched and every segment gets a report
 * detail with strategy `"none"`.
 *
 * @param segments - Original segments
 * @param reportBase - Base report fields (`warnings` is read and extended in a copy)
 * @param mode - Recovery mode echoed into the summary
 * @param selectorErrors - Errors from selector resolution; when non-empty, segments are reported as `"unresolved_selector"`
 * @returns `{ report, segments }`
 */
const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
    const warnings = [...reportBase.warnings, "No lineStartsAfter rules selected for recovery; returning segments unchanged"];
    const details = segments.map((segment, segmentIndex) => {
        const selectorFailed = Boolean(selectorErrors.length);
        return {
            from: segment.from,
            notes: selectorFailed ? ["selector did not resolve"] : void 0,
            originalStartPreview: preview(segment.content),
            segmentIndex,
            status: selectorFailed ? "unresolved_selector" : "unchanged",
            strategy: "none",
            to: segment.to,
        };
    });
    const summary = {
        mode,
        recovered: 0,
        totalSegments: segments.length,
        unchanged: segments.length,
        unresolved: selectorErrors.length ? segments.length : 0,
    };
    return {
        report: { ...reportBase, details, summary, warnings },
        segments,
    };
};
3942
/**
 * Runs the per-segment best-effort recovery pass, but only in
 * `"best_effort_then_rerun"` mode; in any other mode both maps stay empty.
 *
 * @param pages - Pages the segments were produced from
 * @param segments - Original segments
 * @param options - Segmentation options (`pageJoiner` defaults to `"space"`)
 * @param selectedRuleIndices - Indices of the rules to recover markers for
 * @param mode - Recovery mode
 * @returns `recoveredAtIndex` (segment index → recovered segment) and
 *   `recoveredDetailAtIndex` (segment index → report detail)
 */
const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
    const recoveredAtIndex = /* @__PURE__ */ new Map();
    const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
    const outcome = { recoveredAtIndex, recoveredDetailAtIndex };
    if (mode !== "best_effort_then_rerun") {
        return outcome;
    }
    const pageIdToIndex = buildPageIdToIndex(pages);
    const pageJoiner = options.pageJoiner ?? "space";
    const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
    for (const [segmentIndex, orig] of segments.entries()) {
        const attempt = tryBestEffortRecoverOneSegment(orig, pages, pageIdToIndex, compiledMistaken, pageJoiner);
        if (attempt.kind !== "recovered") {
            continue;
        }
        const seg = { ...orig, content: attempt.recoveredContent };
        recoveredAtIndex.set(segmentIndex, seg);
        recoveredDetailAtIndex.set(segmentIndex, {
            from: orig.from,
            originalStartPreview: preview(orig.content),
            recoveredPrefixPreview: preview(attempt.recoveredPrefix),
            recoveredStartPreview: preview(seg.content),
            segmentIndex,
            status: "recovered",
            strategy: "stage1",
            to: orig.to,
        });
    }
    return outcome;
};
3977
/**
 * Groups rerun segment indices by their page-range key.
 *
 * @param fixedSegments - Segments produced by the rerun
 * @returns Map from range key to the indices (ascending) of segments with that range
 */
const buildFixedBuckets = (fixedSegments) => {
    const buckets = /* @__PURE__ */ new Map();
    for (const [position, segment] of fixedSegments.entries()) {
        const key = segmentRangeKey(segment);
        const bucket = buckets.get(key);
        if (bucket) {
            bucket.push(position);
        } else {
            buckets.set(key, [position]);
        }
    }
    return buckets;
};
3987
/**
 * Picks the best-scoring unused rerun segment for an original segment.
 *
 * @param orig - Original segment
 * @param candidates - Indices into `fixedSegments` to consider
 * @param fixedSegments - Segments produced by the rerun
 * @param usedFixed - Set of rerun indices already claimed
 * @param normalizeCompare - Normalization mode passed to the scorer
 * @returns `{ kind: "none" }` when nothing scores, `{ kind: "ambiguous" }`
 *   when the top two scores are closer than `AMBIGUITY_SCORE_GAP` (and more
 *   than one candidate was supplied), otherwise `{ fixedIdx, kind: "match" }`
 */
const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
    let best = null;
    let runnerUpScore = -Infinity;
    for (const fixedIdx of candidates) {
        if (usedFixed.has(fixedIdx)) {
            continue;
        }
        const scored = scoreCandidate(orig, fixedSegments[fixedIdx], normalizeCompare);
        if (!scored) {
            continue;
        }
        const { score } = scored;
        if (best === null || score > best.score) {
            // The previous leader (if any) becomes the runner-up.
            runnerUpScore = best === null ? -Infinity : best.score;
            best = { fixedIdx, score };
        } else if (score > runnerUpScore) {
            runnerUpScore = score;
        }
    }
    if (best === null) {
        return { kind: "none" };
    }
    const tooClose = best.score - runnerUpScore < AMBIGUITY_SCORE_GAP && candidates.length > 1;
    if (tooClose) {
        return { kind: "ambiguous" };
    }
    return { fixedIdx: best.fixedIdx, kind: "match" };
};
4011
/**
 * Builds the report detail for a segment that could not be aligned with the
 * rerun output.
 *
 * @param orig - Original segment
 * @param segmentIndex - Index of the segment in the original list
 * @param notes - Explanatory notes
 * @returns Detail with status `"unresolved_alignment"` and strategy `"rerun"`
 */
const detailUnresolved = (orig, segmentIndex, notes) => {
    const { from, to, content } = orig;
    return {
        from,
        notes,
        originalStartPreview: preview(content),
        segmentIndex,
        status: "unresolved_alignment",
        strategy: "rerun",
        to,
    };
};
4020
/**
 * Builds the report detail for a segment whose content already matches the
 * rerun output.
 *
 * @param orig - Original segment
 * @param segmentIndex - Index of the segment in the original list
 * @param notes - Explanatory notes
 * @returns Detail with status `"skipped_idempotent"` and strategy `"rerun"`
 */
const detailSkippedIdempotent = (orig, segmentIndex, notes) => {
    const { from, to, content } = orig;
    return {
        from,
        notes,
        originalStartPreview: preview(content),
        segmentIndex,
        status: "skipped_idempotent",
        strategy: "rerun",
        to,
    };
};
4029
/**
 * Builds the report detail for a segment whose content was replaced by the
 * rerun output.
 *
 * @param orig - Original segment
 * @param fixed - Matching segment from the rerun
 * @param segmentIndex - Index of the segment in the original list
 * @returns Detail with status `"recovered"`; `recoveredPrefixPreview` is only
 *   set when the fixed content ends with the original content
 */
const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
    const prefixLength = fixed.content.length - orig.content.length;
    const recoveredPrefixPreview = fixed.content.endsWith(orig.content)
        ? preview(fixed.content.slice(0, prefixLength))
        : void 0;
    return {
        from: orig.from,
        originalStartPreview: preview(orig.content),
        recoveredPrefixPreview,
        recoveredStartPreview: preview(fixed.content),
        segmentIndex,
        status: "recovered",
        strategy: "rerun",
        to: orig.to,
    };
};
4043
/**
 * Merges original segments with the rerun output, one original at a time.
 *
 * For each original segment, in order:
 * 1. a stage-1 recovery, when present, is used as-is;
 * 2. otherwise the best unused rerun segment with the same page range is
 *    looked up; no match or an ambiguous match keeps the original and counts
 *    as unresolved;
 * 3. a match with identical content keeps the original and counts as unchanged;
 * 4. any other match takes the rerun content and counts as recovered.
 *
 * @param params - `{ fixedBuckets, fixedSegments, normalizeCompare,
 *   originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex }`
 * @returns `{ details, segments, summary: { recovered, unchanged, unresolved } }`
 */
const mergeWithRerun = (params) => {
    const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
    const usedFixed = /* @__PURE__ */ new Set();
    const merged = [];
    const details = [];
    const summary = { recovered: 0, unchanged: 0, unresolved: 0 };
    // Appends one outcome: the emitted segment, its report detail, and the counter it bumps.
    const record = (segment, counter, detail) => {
        merged.push(segment);
        summary[counter]++;
        details.push(detail);
    };
    for (let i = 0; i < originalSegments.length; i++) {
        const orig = originalSegments[i];
        const stage1Recovered = stage1RecoveredAtIndex.get(i);
        if (stage1Recovered) {
            record(stage1Recovered, "recovered", recoveredDetailAtIndex.get(i) ?? {
                from: stage1Recovered.from,
                originalStartPreview: preview(orig.content),
                recoveredStartPreview: preview(stage1Recovered.content),
                segmentIndex: i,
                status: "recovered",
                strategy: "stage1",
                to: stage1Recovered.to,
            });
            continue;
        }
        const candidates = fixedBuckets.get(segmentRangeKey(orig)) ?? [];
        const best = findBestFixedMatch(orig, candidates, fixedSegments, usedFixed, normalizeCompare);
        if (best.kind === "none") {
            record(orig, "unresolved", detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
            continue;
        }
        if (best.kind === "ambiguous") {
            record(orig, "unresolved", detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
            continue;
        }
        // Claim the rerun segment so no later original can align to it.
        usedFixed.add(best.fixedIdx);
        const fixed = fixedSegments[best.fixedIdx];
        if (fixed.content === orig.content) {
            record(orig, "unchanged", detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
            continue;
        }
        record({ ...orig, content: fixed.content }, "recovered", detailRecoveredRerun(orig, fixed, i));
    }
    return { details, segments: merged, summary };
};
4106
/**
 * Recovers markers that were stripped because a rule used `lineStartsAfter`
 * where `lineStartsWith` was intended.
 *
 * Resolves the selector to rule indices, optionally runs the best-effort
 * stage-1 pass, re-segments the pages with the selected rules converted to
 * `lineStartsWith`, and merges that rerun with the original segments.
 *
 * @param pages - Pages the segments were produced from
 * @param segments - Original segments
 * @param options - Segmentation options used for the original run
 * @param selector - Selector describing which rules to fix
 * @param opts - Optional `{ mode, normalizeCompare }`; defaults are `"rerun_only"` and `"whitespace"`
 * @returns `{ report, segments }`
 */
function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
    const mode = opts?.mode ?? "rerun_only";
    const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
    const { errors, warnings, indices } = resolveSelectorToRuleIndices(options, selector);
    const reportBase = { byRun: void 0, errors, warnings };
    if (indices.size === 0) {
        return buildNoSelectionResult(segments, reportBase, mode, errors);
    }
    const stage1 = runStage1IfEnabled(pages, segments, options, indices, mode);
    const fixedSegments = segmentPages(pages, buildFixedOptions(options, indices));
    const merged = mergeWithRerun({
        fixedBuckets: buildFixedBuckets(fixedSegments),
        fixedSegments,
        normalizeCompare,
        originalSegments: segments,
        recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
        stage1RecoveredAtIndex: stage1.recoveredAtIndex,
    });
    const { recovered, unchanged, unresolved } = merged.summary;
    const summary = { mode, recovered, totalSegments: segments.length, unchanged, unresolved };
    return {
        report: { ...reportBase, details: merged.details, summary },
        segments: merged.segments,
    };
}
4141
/**
 * Runs marker recovery over several independent runs and concatenates the
 * results into one segment list and one combined report.
 *
 * Detail `segmentIndex` values are shifted so they index into the combined
 * segment list rather than the per-run list.
 *
 * @param runs - Array of `{ pages, segments, options, selector }`
 * @param opts - Optional recovery options forwarded to each run (`mode` defaults to `"rerun_only"` in the summary)
 * @returns `{ report, segments }` where `report.byRun` holds per-run counts
 */
function recoverMistakenMarkersForRuns(runs, opts) {
    const allSegments = [];
    const byRun = [];
    const details = [];
    const warnings = [];
    const errors = [];
    let recovered = 0;
    let unchanged = 0;
    let unresolved = 0;
    let offset = 0;
    for (let i = 0; i < runs.length; i++) {
        const run = runs[i];
        const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
        // Append element by element: `push(...array)` passes every element as a
        // call argument and can throw RangeError for very large arrays.
        for (const segment of res.segments) {
            allSegments.push(segment);
        }
        for (const d of res.report.details) {
            details.push({
                ...d,
                segmentIndex: d.segmentIndex + offset,
            });
        }
        offset += run.segments.length;
        recovered += res.report.summary.recovered;
        unchanged += res.report.summary.unchanged;
        unresolved += res.report.summary.unresolved;
        for (const warning of res.report.warnings) {
            warnings.push(warning);
        }
        for (const error of res.report.errors) {
            errors.push(error);
        }
        byRun.push({
            recovered: res.report.summary.recovered,
            runIndex: i,
            totalSegments: run.segments.length,
            unresolved: res.report.summary.unresolved,
        });
    }
    return {
        report: {
            byRun,
            details,
            errors,
            summary: {
                mode: opts?.mode ?? "rerun_only",
                recovered,
                totalSegments: offset,
                unchanged,
                unresolved,
            },
            warnings,
        },
        segments: allSegments,
    };
}
4189
-
4190
- //#endregion
4191
- //#region src/segmentation/pattern-validator.ts
4192
- const KNOWN_TOKENS = new Set(getAvailableTokens());
4193
- const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
4194
/**
 * Builds a global regex that finds known token names (optionally followed by
 * `:name`) that are not immediately preceded by `{{` and not immediately
 * followed by `}}`.
 *
 * Longer token names are placed first in the alternation so they are
 * preferred over shorter names that are their prefixes.
 *
 * @returns A fresh `RegExp` with the `g` flag
 */
const buildBareTokenRegex = () => {
    const alternation = Array.from(KNOWN_TOKENS)
        .sort((left, right) => right.length - left.length)
        .join("|");
    return new RegExp(`(?<!\\{\\{)(${alternation})(?::\\w+)?(?!\\}\\})`, "g");
};
4198
/**
 * Validates a single pattern and reports the first issue found.
 *
 * Checks, in order: empty pattern, duplicate of an already seen pattern,
 * unknown token name inside `{{...}}`, and a known token name written
 * without its surrounding braces.
 *
 * @param pattern - Pattern string to validate
 * @param seenPatterns - Set of patterns seen so far; the pattern is added to it once it passes the duplicate check
 * @returns An issue object, or `undefined` when the pattern is clean
 */
const validatePattern = (pattern, seenPatterns) => {
    if (pattern.trim() === "") {
        return { message: "Empty pattern is not allowed", type: "empty_pattern" };
    }
    if (seenPatterns.has(pattern)) {
        return { message: `Duplicate pattern: "${pattern}"`, pattern, type: "duplicate" };
    }
    seenPatterns.add(pattern);
    // matchAll starts from the source regex's lastIndex, so reset the shared regex first.
    TOKEN_INSIDE_BRACES.lastIndex = 0;
    for (const [, name] of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
        if (KNOWN_TOKENS.has(name)) {
            continue;
        }
        return {
            message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
            suggestion: "Check spelling or use a known token",
            token: name,
            type: "unknown_token",
        };
    }
    for (const match of pattern.matchAll(buildBareTokenRegex())) {
        const [full, name] = match;
        const start = match.index;
        const end = start + full.length;
        const opened = pattern.slice(Math.max(0, start - 2), start) === "{{";
        const closed = pattern.slice(end, end + 2) === "}}";
        if (opened && closed) {
            continue;
        }
        return {
            message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
            suggestion: `{{${full}}}`,
            token: name,
            type: "missing_braces",
        };
    }
    return void 0;
};
4233
/**
 * Validates every pattern of one rule key, sharing a single "seen" set so
 * duplicates within the array are detected.
 *
 * @param patterns - Pattern strings to validate
 * @returns Array parallel to `patterns` (issue or `undefined` per entry), or
 *   `undefined` when no pattern has an issue
 */
const validatePatternArray = (patterns) => {
    const alreadySeen = /* @__PURE__ */ new Set();
    const issues = patterns.map((candidate) => validatePattern(candidate, alreadySeen));
    const hasAnyIssue = issues.some((issue) => Boolean(issue));
    if (!hasAnyIssue) {
        return void 0;
    }
    return issues;
};
4241
/**
 * Validates split rules for common pattern mistakes.
 *
 * For each rule, the pattern arrays under `lineStartsWith`,
 * `lineStartsAfter` and `lineEndsWith` and the single `template` string are
 * checked for:
 * - known token names written without `{{}}` (e.g. `raqms:num`)
 * - unknown token names inside `{{}}` (e.g. `{{nonexistent}}`)
 * - duplicate patterns within the same array
 * - empty patterns
 *
 * @param rules - Array of split rules to validate
 * @returns Array parallel to `rules`; each entry is an object keyed by the
 *   offending rule key, or `undefined` when the rule has no issues
 *
 * @example
 * const issues = validateRules([
 *   { lineStartsAfter: ['raqms:num'] },     // Missing braces
 *   { lineStartsWith: ['{{unknown}}'] },    // Unknown token
 * ]);
 * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
 * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
 */
const validateRules = (rules) => rules.map((rule) => {
    const result = {};
    const arrayKeys = ["lineStartsWith", "lineStartsAfter", "lineEndsWith"];
    for (const key of arrayKeys) {
        if (!(key in rule) || !rule[key]) {
            continue;
        }
        const issues = validatePatternArray(rule[key]);
        if (issues) {
            result[key] = issues;
        }
    }
    if ("template" in rule && rule.template !== void 0) {
        // A template is validated on its own, so it gets a fresh "seen" set.
        const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
        if (issue) {
            result.template = issue;
        }
    }
    // Keys are only ever assigned truthy issue values, so any key means an issue.
    return Object.keys(result).length > 0 ? result : void 0;
});
4283
/**
 * Turns the output of `validateRules()` into human-readable messages,
 * e.g. for display in a UI. Rules are numbered from 1.
 *
 * @param results - The result array from `validateRules()`
 * @returns Array of formatted error strings, in rule order
 *
 * @example
 * const issues = validateRules(rules);
 * const errors = formatValidationReport(issues);
 * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
 */
const formatValidationReport = (results) => {
    const describe = (loc, issue) => {
        switch (issue.type) {
            case "missing_braces":
                return `${loc}: Missing {{}} around token "${issue.token}"`;
            case "unknown_token":
                return `${loc}: Unknown token "{{${issue.token}}}"`;
            case "duplicate":
                return `${loc}: Duplicate pattern "${issue.pattern}"`;
            default:
                return `${loc}: ${issue.message || issue.type}`;
        }
    };
    const lines = [];
    results.forEach((result, ruleIndex) => {
        if (!result) {
            return;
        }
        for (const [type, issues] of Object.entries(result)) {
            // `template` holds a single issue; the other keys hold arrays.
            const list = Array.isArray(issues) ? issues : [issues];
            for (const issue of list) {
                if (issue) {
                    lines.push(describe(`Rule ${ruleIndex + 1}, ${type}`, issue));
                }
            }
        }
    });
    return lines;
};
4307
-
4308
- //#endregion
4309
- //#region src/validation/validation-constants.ts
4310
- /**
4311
- * Validation-specific constants
4312
- */
4313
- /**
4314
- * Limit for validation issue preview length (characters).
4315
- */
4316
- const PREVIEW_LIMIT = 140;
4317
- /**
4318
- * Threshold for short segment content (characters).
4319
- * Segments shorter than this will trigger a full-document search fallback
4320
- * if not found in the expected window.
4321
- */
4322
- const FULL_SEARCH_THRESHOLD = 500;
4323
-
4324
4153
  //#endregion
4325
4154
  //#region src/validation/validate-segments.ts
4326
4155
  /**
@@ -4329,8 +4158,8 @@ const FULL_SEARCH_THRESHOLD = 500;
4329
4158
  */
4330
4159
  const buildPreview = (text) => {
4331
4160
  const normalized = text.replace(/\s+/g, " ").trim();
4332
- if (normalized.length <= PREVIEW_LIMIT) return normalized;
4333
- return `${normalized.slice(0, PREVIEW_LIMIT)}...`;
4161
+ if (normalized.length <= 140) return normalized;
4162
+ return `${normalized.slice(0, 140)}...`;
4334
4163
  };
4335
4164
  /**
4336
4165
  * Creates a lightweight snapshot of a segment for inclusion in validation checks.
@@ -4358,19 +4187,18 @@ const normalizePages = (pages, options) => {
4358
4187
  */
4359
4188
  const buildJoinedContent = (pages, joiner) => {
4360
4189
  const boundaries = [];
4361
- const nonEmptyPages = pages.filter((p) => p.content);
4362
- const joined = nonEmptyPages.map((p) => p.content).join(joiner);
4190
+ const joined = pages.map((p) => p.content).join(joiner);
4363
4191
  let offset = 0;
4364
- for (let i = 0; i < nonEmptyPages.length; i++) {
4365
- const content = nonEmptyPages[i].content;
4192
+ for (let i = 0; i < pages.length; i++) {
4193
+ const content = pages[i].content;
4366
4194
  const start = offset;
4367
- const end = start + content.length - 1;
4195
+ const end = start + content.length;
4368
4196
  boundaries.push({
4369
4197
  end,
4370
- id: nonEmptyPages[i].id,
4198
+ id: pages[i].id,
4371
4199
  start
4372
4200
  });
4373
- offset = end + 1 + (i < nonEmptyPages.length - 1 ? joiner.length : 0);
4201
+ offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
4374
4202
  }
4375
4203
  return {
4376
4204
  boundaries,
@@ -4561,7 +4389,7 @@ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, search
4561
4389
  const bufferSize = 1e3;
4562
4390
  const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
4563
4391
  if (rawMatches.length === 0) {
4564
- const threshold = validationOptions?.fullSearchThreshold ?? FULL_SEARCH_THRESHOLD;
4392
+ const threshold = validationOptions?.fullSearchThreshold ?? 500;
4565
4393
  if (content.length < threshold) {
4566
4394
  const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
4567
4395
  const validMatch = fullMatches.find((m) => {
@@ -4715,7 +4543,7 @@ const validateSegments = (pages, options, segments, validationOptions) => {
4715
4543
  }
4716
4544
  };
4717
4545
  };
4718
-
4719
4546
  //#endregion
4720
- export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
4547
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
4548
+
4721
4549
  //# sourceMappingURL=index.mjs.map