flappa-doormal 2.17.1 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +16 -39
- package/README.md +91 -62
- package/dist/index.d.mts +196 -73
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +976 -1148
- package/dist/index.mjs.map +1 -1
- package/package.json +9 -9
package/dist/index.mjs
CHANGED
|
@@ -1,141 +1,25 @@
|
|
|
1
|
-
//#region src/
|
|
1
|
+
//#region src/segmentation/tokens.ts
|
|
2
2
|
/**
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
|
|
6
|
-
* for consistent pattern matching across platforms.
|
|
3
|
+
* Arabic base letters used by low-level dictionary-style regex helpers.
|
|
7
4
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
5
|
+
* This is intentionally broader than `{{harf}}`:
|
|
6
|
+
* - includes standalone hamza `ء`
|
|
7
|
+
* - stays as a raw regex fragment rather than a template token
|
|
10
8
|
*/
|
|
11
|
-
const
|
|
12
|
-
return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
|
|
13
|
-
};
|
|
9
|
+
const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
|
|
14
10
|
/**
|
|
15
|
-
*
|
|
16
|
-
* but preserves content inside `{{...}}` token delimiters.
|
|
17
|
-
*
|
|
18
|
-
* This allows users to write intuitive patterns like `({{harf}}):` instead of
|
|
19
|
-
* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
|
|
20
|
-
* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
|
|
21
|
-
*
|
|
22
|
-
* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
|
|
23
|
-
* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
|
|
24
|
-
*
|
|
25
|
-
* @example
|
|
26
|
-
* escapeTemplateBrackets('({{harf}}): ')
|
|
27
|
-
* // → '\\({{harf}}\\): '
|
|
28
|
-
*
|
|
29
|
-
* @example
|
|
30
|
-
* escapeTemplateBrackets('[{{raqm}}] ')
|
|
31
|
-
* // → '\\[{{raqm}}\\] '
|
|
32
|
-
*
|
|
33
|
-
* @example
|
|
34
|
-
* escapeTemplateBrackets('{{harf}}')
|
|
35
|
-
* // → '{{harf}}' (unchanged - no brackets outside tokens)
|
|
11
|
+
* Arabic combining marks / annotation signs used by low-level regex helpers.
|
|
36
12
|
*/
|
|
37
|
-
const
|
|
38
|
-
return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
|
|
39
|
-
};
|
|
13
|
+
const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
|
|
40
14
|
/**
|
|
41
|
-
*
|
|
42
|
-
*
|
|
43
|
-
* Includes the following diacritical marks:
|
|
44
|
-
* - U+064B: ً (fathatan - double fatha)
|
|
45
|
-
* - U+064C: ٌ (dammatan - double damma)
|
|
46
|
-
* - U+064D: ٍ (kasratan - double kasra)
|
|
47
|
-
* - U+064E: َ (fatha - short a)
|
|
48
|
-
* - U+064F: ُ (damma - short u)
|
|
49
|
-
* - U+0650: ِ (kasra - short i)
|
|
50
|
-
* - U+0651: ّ (shadda - gemination)
|
|
51
|
-
* - U+0652: ْ (sukun - no vowel)
|
|
52
|
-
*
|
|
53
|
-
* @internal
|
|
15
|
+
* A single Arabic base letter followed by zero or more combining marks.
|
|
54
16
|
*/
|
|
55
|
-
const
|
|
17
|
+
const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
|
|
56
18
|
/**
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
* Characters within the same group are considered equivalent for matching purposes.
|
|
60
|
-
* This handles common variations in Arabic text where different characters are
|
|
61
|
-
* used interchangeably or have the same underlying meaning.
|
|
62
|
-
*
|
|
63
|
-
* Equivalence groups:
|
|
64
|
-
* - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
|
|
65
|
-
* - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
|
|
66
|
-
* - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
|
|
67
|
-
*
|
|
68
|
-
* @internal
|
|
19
|
+
* One or more Arabic letters, where each letter may carry combining marks.
|
|
69
20
|
*/
|
|
70
|
-
const
|
|
71
|
-
|
|
72
|
-
"ا",
|
|
73
|
-
"آ",
|
|
74
|
-
"أ",
|
|
75
|
-
"إ"
|
|
76
|
-
],
|
|
77
|
-
["ة", "ه"],
|
|
78
|
-
["ى", "ي"]
|
|
79
|
-
];
|
|
80
|
-
/**
|
|
81
|
-
* Escapes a string for safe inclusion in a regular expression.
|
|
82
|
-
*
|
|
83
|
-
* Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
|
|
84
|
-
*
|
|
85
|
-
* @param s - Any string to escape
|
|
86
|
-
* @returns String with regex metacharacters escaped
|
|
87
|
-
*
|
|
88
|
-
* @example
|
|
89
|
-
* escapeRegex('hello.world') // → 'hello\\.world'
|
|
90
|
-
* escapeRegex('[test]') // → '\\[test\\]'
|
|
91
|
-
* escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
|
|
92
|
-
*/
|
|
93
|
-
const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
94
|
-
const getEquivClass = (ch) => {
|
|
95
|
-
const group = EQUIV_GROUPS.find((g) => g.includes(ch));
|
|
96
|
-
return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
|
|
97
|
-
};
|
|
98
|
-
const normalizeArabicLight = (str) => {
|
|
99
|
-
return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
|
|
100
|
-
};
|
|
101
|
-
const makeDiacriticInsensitive = (text) => {
|
|
102
|
-
const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
|
|
103
|
-
return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
|
|
104
|
-
};
|
|
105
|
-
const isCombiningMarkOrSelector = (char) => {
|
|
106
|
-
if (!char) return false;
|
|
107
|
-
return /\p{M}/u.test(char) || char === "︎" || char === "️";
|
|
108
|
-
};
|
|
109
|
-
const isJoiner = (char) => char === "" || char === "";
|
|
110
|
-
/**
|
|
111
|
-
* Ensures the position does not split a grapheme cluster (surrogate pairs,
|
|
112
|
-
* combining marks, or zero-width joiners / variation selectors).
|
|
113
|
-
*
|
|
114
|
-
* This is only used as a last-resort fallback when we are forced to split
|
|
115
|
-
* near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
|
|
116
|
-
*/
|
|
117
|
-
const adjustForUnicodeBoundary = (content, position) => {
|
|
118
|
-
let adjusted = position;
|
|
119
|
-
while (adjusted > 0) {
|
|
120
|
-
const high = content.charCodeAt(adjusted - 1);
|
|
121
|
-
const low = content.charCodeAt(adjusted);
|
|
122
|
-
if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
|
|
123
|
-
adjusted -= 1;
|
|
124
|
-
continue;
|
|
125
|
-
}
|
|
126
|
-
const nextChar = content[adjusted];
|
|
127
|
-
const prevChar = content[adjusted - 1];
|
|
128
|
-
if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
|
|
129
|
-
adjusted -= 1;
|
|
130
|
-
continue;
|
|
131
|
-
}
|
|
132
|
-
break;
|
|
133
|
-
}
|
|
134
|
-
return adjusted;
|
|
135
|
-
};
|
|
136
|
-
|
|
137
|
-
//#endregion
|
|
138
|
-
//#region src/segmentation/tokens.ts
|
|
21
|
+
const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
|
|
22
|
+
const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
|
|
139
23
|
const RUMUZ_ATOM = `(?:${[
|
|
140
24
|
"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
141
25
|
"خت",
|
|
@@ -166,15 +50,25 @@ const RUMUZ_ATOM = `(?:${[
|
|
|
166
50
|
].join("|")})`;
|
|
167
51
|
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
|
|
168
52
|
const BASE_TOKENS = {
|
|
53
|
+
/** Chapter marker (باب). */
|
|
169
54
|
bab: "باب",
|
|
55
|
+
/** Basmala (بسم الله). Also matches ﷽. */
|
|
170
56
|
basmalah: ["بسم الله", "﷽"].join("|"),
|
|
57
|
+
/** Bullet point variants: `•`, `*`, `°`. */
|
|
171
58
|
bullet: "[•*°]",
|
|
59
|
+
/** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
|
|
172
60
|
dash: "[-–—ـ]",
|
|
61
|
+
/** Section marker (فصل / مسألة). */
|
|
173
62
|
fasl: ["مسألة", "فصل"].join("|"),
|
|
63
|
+
/** Single Arabic letter (أ-ي). Does NOT include diacritics. */
|
|
174
64
|
harf: "[أ-ي]",
|
|
175
|
-
|
|
65
|
+
/** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
|
|
66
|
+
harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
|
|
67
|
+
/** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
|
|
176
68
|
hr: "[-–—ـ_=]{5,}",
|
|
69
|
+
/** Book marker (كتاب). */
|
|
177
70
|
kitab: "كتاب",
|
|
71
|
+
/** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
|
|
178
72
|
naql: [
|
|
179
73
|
"حدثني",
|
|
180
74
|
"وأخبرنا",
|
|
@@ -186,33 +80,58 @@ const BASE_TOKENS = {
|
|
|
186
80
|
"وحدثني",
|
|
187
81
|
"وحدثنيه"
|
|
188
82
|
].join("|"),
|
|
83
|
+
/** Newline character. Useful for breakpoints that split on line boundaries. */
|
|
189
84
|
newline: "\\n",
|
|
85
|
+
/** Single ASCII digit (0-9). */
|
|
190
86
|
num: "\\d",
|
|
87
|
+
/** One or more ASCII digits (0-9)+. */
|
|
191
88
|
nums: "\\d+",
|
|
89
|
+
/** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
|
|
192
90
|
raqm: "[\\u0660-\\u0669]",
|
|
91
|
+
/** One or more Arabic-Indic digits (٠-٩)+. */
|
|
193
92
|
raqms: "[\\u0660-\\u0669]+",
|
|
93
|
+
/** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
|
|
194
94
|
rumuz: RUMUZ_BLOCK,
|
|
95
|
+
/** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
|
|
195
96
|
tarqim: "[.!?؟؛]"
|
|
196
97
|
};
|
|
197
98
|
/** Pre-defined token constants for use in patterns. */
|
|
198
99
|
const Token = {
|
|
100
|
+
/** Chapter marker - باب */
|
|
199
101
|
BAB: "{{bab}}",
|
|
102
|
+
/** Basmala - بسم الله */
|
|
200
103
|
BASMALAH: "{{basmalah}}",
|
|
104
|
+
/** Bullet point variants */
|
|
201
105
|
BULLET: "{{bullet}}",
|
|
106
|
+
/** Dash variants (hyphen, en-dash, em-dash, tatweel) */
|
|
202
107
|
DASH: "{{dash}}",
|
|
108
|
+
/** Section marker - فصل / مسألة */
|
|
203
109
|
FASL: "{{fasl}}",
|
|
110
|
+
/** Single Arabic letter */
|
|
204
111
|
HARF: "{{harf}}",
|
|
112
|
+
/** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
|
|
205
113
|
HARFS: "{{harfs}}",
|
|
114
|
+
/** Horizontal rule / separator (repeated dashes) */
|
|
206
115
|
HR: "{{hr}}",
|
|
116
|
+
/** Book marker - كتاب */
|
|
207
117
|
KITAB: "{{kitab}}",
|
|
118
|
+
/** Hadith transmission phrases */
|
|
208
119
|
NAQL: "{{naql}}",
|
|
120
|
+
/** Newline character (for breakpoints) */
|
|
209
121
|
NEWLINE: "{{newline}}",
|
|
122
|
+
/** Single ASCII digit */
|
|
210
123
|
NUM: "{{num}}",
|
|
124
|
+
/** Composite: {{raqms}} {{dash}} (space) */
|
|
211
125
|
NUMBERED: "{{numbered}}",
|
|
126
|
+
/** One or more ASCII digits */
|
|
212
127
|
NUMS: "{{nums}}",
|
|
128
|
+
/** Single Arabic-Indic digit */
|
|
213
129
|
RAQM: "{{raqm}}",
|
|
130
|
+
/** One or more Arabic-Indic digits */
|
|
214
131
|
RAQMS: "{{raqms}}",
|
|
132
|
+
/** Source abbreviations (rijāl/takhrīj) */
|
|
215
133
|
RUMUZ: "{{rumuz}}",
|
|
134
|
+
/** Punctuation marks */
|
|
216
135
|
TARQIM: "{{tarqim}}"
|
|
217
136
|
};
|
|
218
137
|
/** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
|
|
@@ -222,7 +141,9 @@ const withCapture = (token, name) => {
|
|
|
222
141
|
return `{{${match[1]}:${name}}}`;
|
|
223
142
|
};
|
|
224
143
|
/** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
|
|
225
|
-
const COMPOSITE_TOKENS = {
|
|
144
|
+
const COMPOSITE_TOKENS = {
|
|
145
|
+
/** Common hadith numbering format: Arabic-Indic digits + dash + space. */
|
|
146
|
+
numbered: "{{raqms}} {{dash}} " };
|
|
226
147
|
/** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
|
|
227
148
|
const expandCompositeTokensInTemplate = (template) => {
|
|
228
149
|
let out = template;
|
|
@@ -473,11 +394,11 @@ const templateToRegex = (template) => {
|
|
|
473
394
|
* Useful for documentation, validation, or building user interfaces
|
|
474
395
|
* that show available tokens.
|
|
475
396
|
*
|
|
476
|
-
* @returns Array of token names (e.g., `['bab', '
|
|
397
|
+
* @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
|
|
477
398
|
*
|
|
478
399
|
* @example
|
|
479
400
|
* getAvailableTokens()
|
|
480
|
-
* // → ['bab', '
|
|
401
|
+
* // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
|
|
481
402
|
*/
|
|
482
403
|
const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
483
404
|
/**
|
|
@@ -486,13 +407,13 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
|
486
407
|
* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
|
|
487
408
|
* without any expansion or capture group wrapping.
|
|
488
409
|
*
|
|
489
|
-
* @param tokenName - The token name to look up (e.g., 'raqms'
|
|
490
|
-
* @returns The regex pattern string
|
|
410
|
+
* @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
|
|
411
|
+
* @returns The regex pattern string for that known token
|
|
491
412
|
*
|
|
492
413
|
* @example
|
|
493
414
|
* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
|
|
494
415
|
* getTokenPattern('dash') // → '[-–—ـ]'
|
|
495
|
-
* getTokenPattern('
|
|
416
|
+
* getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
|
|
496
417
|
*/
|
|
497
418
|
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
498
419
|
/**
|
|
@@ -571,7 +492,161 @@ const applyTokenMappings = (template, mappings) => {
|
|
|
571
492
|
const stripTokenMappings = (template) => {
|
|
572
493
|
return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
|
|
573
494
|
};
|
|
574
|
-
|
|
495
|
+
//#endregion
|
|
496
|
+
//#region src/utils/textUtils.ts
|
|
497
|
+
/**
|
|
498
|
+
* Normalizes line endings to Unix-style (`\n`).
|
|
499
|
+
*
|
|
500
|
+
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
|
|
501
|
+
* for consistent pattern matching across platforms.
|
|
502
|
+
*
|
|
503
|
+
* @param content - Raw content with potentially mixed line endings
|
|
504
|
+
* @returns Content with all line endings normalized to `\n`
|
|
505
|
+
*/
|
|
506
|
+
const normalizeLineEndings = (content) => {
|
|
507
|
+
return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
|
|
508
|
+
};
|
|
509
|
+
/**
|
|
510
|
+
* Escapes regex metacharacters (parentheses and brackets) in template patterns,
|
|
511
|
+
* but preserves content inside `{{...}}` token delimiters.
|
|
512
|
+
*
|
|
513
|
+
* This allows users to write intuitive patterns like `({{harf}}):` instead of
|
|
514
|
+
* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
|
|
515
|
+
* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
|
|
516
|
+
*
|
|
517
|
+
* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
|
|
518
|
+
* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
|
|
519
|
+
*
|
|
520
|
+
* @example
|
|
521
|
+
* escapeTemplateBrackets('({{harf}}): ')
|
|
522
|
+
* // → '\\({{harf}}\\): '
|
|
523
|
+
*
|
|
524
|
+
* @example
|
|
525
|
+
* escapeTemplateBrackets('[{{raqm}}] ')
|
|
526
|
+
* // → '\\[{{raqm}}\\] '
|
|
527
|
+
*
|
|
528
|
+
* @example
|
|
529
|
+
* escapeTemplateBrackets('{{harf}}')
|
|
530
|
+
* // → '{{harf}}' (unchanged - no brackets outside tokens)
|
|
531
|
+
*/
|
|
532
|
+
const escapeTemplateBrackets = (pattern) => {
|
|
533
|
+
return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
|
|
534
|
+
};
|
|
535
|
+
/**
|
|
536
|
+
* Character class matching all Arabic diacritics (Tashkeel/Harakat).
|
|
537
|
+
*
|
|
538
|
+
* Includes the following diacritical marks:
|
|
539
|
+
* - U+0640: ـ (tatweel / kashida)
|
|
540
|
+
* - U+064B: ً (fathatan - double fatha)
|
|
541
|
+
* - U+064C: ٌ (dammatan - double damma)
|
|
542
|
+
* - U+064D: ٍ (kasratan - double kasra)
|
|
543
|
+
* - U+064E: َ (fatha - short a)
|
|
544
|
+
* - U+064F: ُ (damma - short u)
|
|
545
|
+
* - U+0650: ِ (kasra - short i)
|
|
546
|
+
* - U+0651: ّ (shadda - gemination)
|
|
547
|
+
* - U+0652: ْ (sukun - no vowel)
|
|
548
|
+
*
|
|
549
|
+
* @internal
|
|
550
|
+
*/
|
|
551
|
+
const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
|
|
552
|
+
/**
|
|
553
|
+
* Groups of equivalent Arabic characters.
|
|
554
|
+
*
|
|
555
|
+
* Characters within the same group are considered equivalent for matching purposes.
|
|
556
|
+
* This handles common variations in Arabic text where different characters are
|
|
557
|
+
* used interchangeably or have the same underlying meaning.
|
|
558
|
+
*
|
|
559
|
+
* Equivalence groups:
|
|
560
|
+
* - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
|
|
561
|
+
* - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
|
|
562
|
+
* - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
|
|
563
|
+
*
|
|
564
|
+
* @internal
|
|
565
|
+
*/
|
|
566
|
+
const EQUIV_GROUPS = [
|
|
567
|
+
[
|
|
568
|
+
"ا",
|
|
569
|
+
"آ",
|
|
570
|
+
"أ",
|
|
571
|
+
"إ"
|
|
572
|
+
],
|
|
573
|
+
["ة", "ه"],
|
|
574
|
+
["ى", "ي"]
|
|
575
|
+
];
|
|
576
|
+
const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
|
|
577
|
+
/**
|
|
578
|
+
* Escapes a string for safe inclusion in a regular expression.
|
|
579
|
+
*
|
|
580
|
+
* Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
|
|
581
|
+
*
|
|
582
|
+
* @param s - Any string to escape
|
|
583
|
+
* @returns String with regex metacharacters escaped
|
|
584
|
+
*
|
|
585
|
+
* @example
|
|
586
|
+
* escapeRegex('hello.world') // → 'hello\\.world'
|
|
587
|
+
* escapeRegex('[test]') // → '\\[test\\]'
|
|
588
|
+
* escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
|
|
589
|
+
*/
|
|
590
|
+
const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
591
|
+
const getEquivClass = (ch) => {
|
|
592
|
+
const group = EQUIV_GROUPS.find((g) => g.includes(ch));
|
|
593
|
+
return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
|
|
594
|
+
};
|
|
595
|
+
const normalizeArabicLight = (str) => {
|
|
596
|
+
return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
|
|
597
|
+
};
|
|
598
|
+
/**
|
|
599
|
+
* Normalizes Arabic text for exact comparisons while tolerating common variants.
|
|
600
|
+
*
|
|
601
|
+
* This removes Arabic diacritics, collapses whitespace, removes joiners, and
|
|
602
|
+
* maps common equivalent letters to a shared canonical form:
|
|
603
|
+
* - ا/آ/أ/إ -> ا
|
|
604
|
+
* - ة/ه -> ه
|
|
605
|
+
* - ى/ي -> ي
|
|
606
|
+
*/
|
|
607
|
+
const normalizeArabicForComparison = (text) => {
|
|
608
|
+
return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
|
|
609
|
+
if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
|
|
610
|
+
if (ch === "ة") return "ه";
|
|
611
|
+
if (ch === "ى") return "ي";
|
|
612
|
+
return ch;
|
|
613
|
+
}).join("");
|
|
614
|
+
};
|
|
615
|
+
const makeDiacriticInsensitive = (text) => {
|
|
616
|
+
const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
|
|
617
|
+
return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
|
|
618
|
+
};
|
|
619
|
+
const isCombiningMarkOrSelector = (char) => {
|
|
620
|
+
if (!char) return false;
|
|
621
|
+
return /\p{M}/u.test(char) || char === "︎" || char === "️";
|
|
622
|
+
};
|
|
623
|
+
const isJoiner = (char) => char === "" || char === "";
|
|
624
|
+
/**
|
|
625
|
+
* Ensures the position does not split a grapheme cluster (surrogate pairs,
|
|
626
|
+
* combining marks, or zero-width joiners / variation selectors).
|
|
627
|
+
*
|
|
628
|
+
* This is only used as a last-resort fallback when we are forced to split
|
|
629
|
+
* near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
|
|
630
|
+
*/
|
|
631
|
+
const adjustForUnicodeBoundary = (content, position) => {
|
|
632
|
+
let adjusted = position;
|
|
633
|
+
while (adjusted > 0) {
|
|
634
|
+
const high = content.charCodeAt(adjusted - 1);
|
|
635
|
+
const low = content.charCodeAt(adjusted);
|
|
636
|
+
if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
|
|
637
|
+
adjusted -= 1;
|
|
638
|
+
continue;
|
|
639
|
+
}
|
|
640
|
+
const nextChar = content[adjusted];
|
|
641
|
+
const prevChar = content[adjusted - 1];
|
|
642
|
+
if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
|
|
643
|
+
adjusted -= 1;
|
|
644
|
+
continue;
|
|
645
|
+
}
|
|
646
|
+
break;
|
|
647
|
+
}
|
|
648
|
+
return adjusted;
|
|
649
|
+
};
|
|
575
650
|
//#endregion
|
|
576
651
|
//#region src/analysis/shared.ts
|
|
577
652
|
const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
|
|
@@ -632,7 +707,6 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
|
|
|
632
707
|
};
|
|
633
708
|
const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
|
|
634
709
|
const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
|
|
635
|
-
|
|
636
710
|
//#endregion
|
|
637
711
|
//#region src/analysis/line-starts.ts
|
|
638
712
|
const resolveOptions$1 = (options = {}) => ({
|
|
@@ -658,65 +732,141 @@ const compareBySpecificity = (a, b) => {
|
|
|
658
732
|
return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
|
|
659
733
|
};
|
|
660
734
|
const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
|
|
661
|
-
|
|
662
|
-
const trimTrailingWs = (out, mode) => {
|
|
663
|
-
const suffix = mode === "regex" ? "\\s*" : " ";
|
|
664
|
-
while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
|
|
665
|
-
return out;
|
|
666
|
-
};
|
|
667
|
-
/** Try to extract first word for fallback */
|
|
668
|
-
const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
|
|
669
|
-
/** Consume prefix matchers at current position */
|
|
670
|
-
const consumePrefixes = (s, pos, out, matchers, ws) => {
|
|
671
|
-
let matched = false;
|
|
735
|
+
const appendPrefix = (s, pos, out, matchers, ws) => {
|
|
672
736
|
for (const re of matchers) {
|
|
673
737
|
if (pos >= s.length) break;
|
|
674
738
|
const m = re.exec(s.slice(pos));
|
|
675
739
|
if (!m?.index && m?.[0]) {
|
|
676
740
|
out += escapeSignatureLiteral(m[0]);
|
|
677
741
|
pos += m[0].length;
|
|
678
|
-
matched = true;
|
|
679
742
|
const wsm = /^[ \t]+/u.exec(s.slice(pos));
|
|
680
743
|
if (wsm) {
|
|
681
744
|
pos += wsm[0].length;
|
|
682
745
|
out = appendWs(out, ws);
|
|
683
746
|
}
|
|
747
|
+
return {
|
|
748
|
+
matched: true,
|
|
749
|
+
out,
|
|
750
|
+
pos
|
|
751
|
+
};
|
|
684
752
|
}
|
|
685
753
|
}
|
|
686
754
|
return {
|
|
687
|
-
matched,
|
|
755
|
+
matched: false,
|
|
688
756
|
out,
|
|
689
757
|
pos
|
|
690
758
|
};
|
|
691
759
|
};
|
|
692
|
-
|
|
693
|
-
const tryMatchToken = (s, pos, out, compiled) => {
|
|
760
|
+
const appendToken = (s, pos, out, compiled) => {
|
|
694
761
|
const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
|
|
695
|
-
|
|
696
|
-
matched: false,
|
|
697
|
-
out,
|
|
698
|
-
pos
|
|
699
|
-
};
|
|
700
|
-
return {
|
|
762
|
+
return best ? {
|
|
701
763
|
matched: true,
|
|
702
764
|
out: `${out}{{${best.token}}}`,
|
|
703
765
|
pos: pos + best.text.length
|
|
766
|
+
} : {
|
|
767
|
+
matched: false,
|
|
768
|
+
out,
|
|
769
|
+
pos
|
|
704
770
|
};
|
|
705
771
|
};
|
|
706
|
-
|
|
707
|
-
const tryMatchDelimiter = (s, pos, out) => {
|
|
772
|
+
const appendDelimiter = (s, pos, out) => {
|
|
708
773
|
const ch = s[pos];
|
|
709
|
-
|
|
774
|
+
return ch && isCommonDelimiter(ch) ? {
|
|
775
|
+
matched: true,
|
|
776
|
+
out: `${out}${escapeSignatureLiteral(ch)}`,
|
|
777
|
+
pos: pos + 1
|
|
778
|
+
} : {
|
|
710
779
|
matched: false,
|
|
711
780
|
out,
|
|
712
|
-
pos
|
|
781
|
+
pos
|
|
782
|
+
};
|
|
783
|
+
};
|
|
784
|
+
const appendFallbackWord = (s, pos, out) => {
|
|
785
|
+
const word = extractFirstWord(s.slice(pos));
|
|
786
|
+
return word ? `${out}${escapeSignatureLiteral(word)}` : null;
|
|
787
|
+
};
|
|
788
|
+
const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
|
|
789
|
+
const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
|
|
790
|
+
if (ws.skipped) return {
|
|
791
|
+
done: false,
|
|
792
|
+
matchedAny,
|
|
793
|
+
matchedToken,
|
|
794
|
+
out: ws.out,
|
|
795
|
+
pos: ws.pos,
|
|
796
|
+
steps: 0
|
|
797
|
+
};
|
|
798
|
+
const tok = appendToken(s, pos, out, compiled);
|
|
799
|
+
if (tok.matched) return {
|
|
800
|
+
done: false,
|
|
801
|
+
matchedAny: true,
|
|
802
|
+
matchedToken: true,
|
|
803
|
+
out: tok.out,
|
|
804
|
+
pos: tok.pos,
|
|
805
|
+
steps: 1
|
|
806
|
+
};
|
|
807
|
+
if (matchedAny) {
|
|
808
|
+
const delim = appendDelimiter(s, pos, out);
|
|
809
|
+
if (delim.matched) return {
|
|
810
|
+
done: false,
|
|
811
|
+
matchedAny,
|
|
812
|
+
matchedToken,
|
|
813
|
+
out: delim.out,
|
|
814
|
+
pos: delim.pos,
|
|
815
|
+
steps: 0
|
|
816
|
+
};
|
|
817
|
+
if (opts.includeFirstWordFallback && !matchedToken) {
|
|
818
|
+
const fallback = appendFallbackWord(s, pos, out);
|
|
819
|
+
if (fallback) return {
|
|
820
|
+
done: true,
|
|
821
|
+
matchedAny,
|
|
822
|
+
matchedToken,
|
|
823
|
+
out: fallback,
|
|
824
|
+
pos,
|
|
825
|
+
steps: 1
|
|
826
|
+
};
|
|
827
|
+
}
|
|
828
|
+
return {
|
|
829
|
+
done: true,
|
|
830
|
+
matchedAny,
|
|
831
|
+
matchedToken,
|
|
832
|
+
out,
|
|
833
|
+
pos,
|
|
834
|
+
steps: 0
|
|
835
|
+
};
|
|
836
|
+
}
|
|
837
|
+
if (!opts.includeFirstWordFallback) return {
|
|
838
|
+
done: true,
|
|
839
|
+
matchedAny,
|
|
840
|
+
matchedToken,
|
|
841
|
+
out,
|
|
842
|
+
pos,
|
|
843
|
+
steps: 0
|
|
713
844
|
};
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
845
|
+
const fallback = appendFallbackWord(s, pos, out);
|
|
846
|
+
return fallback ? {
|
|
847
|
+
done: true,
|
|
848
|
+
matchedAny: true,
|
|
849
|
+
matchedToken,
|
|
850
|
+
out: fallback,
|
|
851
|
+
pos,
|
|
852
|
+
steps: 0
|
|
853
|
+
} : {
|
|
854
|
+
done: true,
|
|
855
|
+
matchedAny,
|
|
856
|
+
matchedToken,
|
|
857
|
+
out,
|
|
858
|
+
pos,
|
|
859
|
+
steps: 0
|
|
718
860
|
};
|
|
719
861
|
};
|
|
862
|
+
/** Remove trailing whitespace placeholders */
|
|
863
|
+
const trimTrailingWs = (out, mode) => {
|
|
864
|
+
const suffix = mode === "regex" ? "\\s*" : " ";
|
|
865
|
+
while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
|
|
866
|
+
return out;
|
|
867
|
+
};
|
|
868
|
+
/** Try to extract first word for fallback */
|
|
869
|
+
const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
|
|
720
870
|
/** Skip whitespace at position */
|
|
721
871
|
const skipWhitespace$1 = (s, pos, out, ws) => {
|
|
722
872
|
const m = /^[ \t]+/u.exec(s.slice(pos));
|
|
@@ -737,47 +887,25 @@ const tokenizeLineStart = (line, tokenNames, opts) => {
|
|
|
737
887
|
const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
|
|
738
888
|
const compiled = compileTokenRegexes(tokenNames);
|
|
739
889
|
let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
|
|
740
|
-
const prefix =
|
|
890
|
+
const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
|
|
741
891
|
pos = prefix.pos;
|
|
742
892
|
out = prefix.out;
|
|
743
893
|
matchedAny = prefix.matched;
|
|
744
894
|
while (steps < 6 && pos < s.length) {
|
|
745
|
-
const
|
|
746
|
-
if (
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
if (tok.matched) {
|
|
753
|
-
pos = tok.pos;
|
|
754
|
-
out = tok.out;
|
|
755
|
-
matchedAny = matchedToken = true;
|
|
756
|
-
steps++;
|
|
757
|
-
continue;
|
|
758
|
-
}
|
|
759
|
-
if (matchedAny) {
|
|
760
|
-
const delim = tryMatchDelimiter(s, pos, out);
|
|
761
|
-
if (delim.matched) {
|
|
762
|
-
pos = delim.pos;
|
|
763
|
-
out = delim.out;
|
|
764
|
-
continue;
|
|
765
|
-
}
|
|
766
|
-
}
|
|
767
|
-
if (matchedAny) {
|
|
768
|
-
if (opts.includeFirstWordFallback && !matchedToken) {
|
|
769
|
-
const word = extractFirstWord(s.slice(pos));
|
|
770
|
-
if (word) {
|
|
771
|
-
out += escapeSignatureLiteral(word);
|
|
772
|
-
steps++;
|
|
773
|
-
}
|
|
774
|
-
}
|
|
895
|
+
const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
|
|
896
|
+
if (next.done) {
|
|
897
|
+
if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
|
|
898
|
+
if (next.steps > 0) steps += next.steps;
|
|
899
|
+
matchedAny = next.matchedAny;
|
|
900
|
+
matchedToken = next.matchedToken;
|
|
901
|
+
out = next.out;
|
|
775
902
|
break;
|
|
776
903
|
}
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
904
|
+
pos = next.pos;
|
|
905
|
+
out = next.out;
|
|
906
|
+
matchedAny = next.matchedAny;
|
|
907
|
+
matchedToken = next.matchedToken;
|
|
908
|
+
steps += next.steps;
|
|
781
909
|
}
|
|
782
910
|
return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
|
|
783
911
|
};
|
|
@@ -821,7 +949,6 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
|
|
|
821
949
|
pattern
|
|
822
950
|
})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
|
|
823
951
|
};
|
|
824
|
-
|
|
825
952
|
//#endregion
|
|
826
953
|
//#region src/analysis/repeating-sequences.ts
|
|
827
954
|
const resolveOptions = (options) => {
|
|
@@ -843,6 +970,7 @@ const resolveOptions = (options) => {
|
|
|
843
970
|
const createRawCursor = (text, normalize) => {
|
|
844
971
|
let rawPos = 0;
|
|
845
972
|
return {
|
|
973
|
+
/** Advance cursor, returning the raw text chunk consumed */
|
|
846
974
|
advance(normalizedLen) {
|
|
847
975
|
if (!normalize) {
|
|
848
976
|
const chunk = text.slice(rawPos, rawPos + normalizedLen);
|
|
@@ -947,23 +1075,27 @@ const buildExample = (page, window, contextChars) => {
|
|
|
947
1075
|
text: page.content.slice(start, end)
|
|
948
1076
|
};
|
|
949
1077
|
};
|
|
1078
|
+
const recordPattern = (page, window, opts, stats) => {
|
|
1079
|
+
if (opts.requireToken && !hasTokenInWindow(window)) return;
|
|
1080
|
+
const pattern = buildPattern(window, opts.whitespace);
|
|
1081
|
+
let entry = stats.get(pattern);
|
|
1082
|
+
if (!entry) {
|
|
1083
|
+
if (stats.size >= opts.maxUniquePatterns) return;
|
|
1084
|
+
entry = {
|
|
1085
|
+
count: 0,
|
|
1086
|
+
examples: [],
|
|
1087
|
+
...computeWindowStats(window)
|
|
1088
|
+
};
|
|
1089
|
+
stats.set(pattern, entry);
|
|
1090
|
+
}
|
|
1091
|
+
entry.count++;
|
|
1092
|
+
if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
|
|
1093
|
+
};
|
|
950
1094
|
/** Extract N-grams from a single page */
|
|
951
1095
|
const extractPageNgrams = (page, items, opts, stats) => {
|
|
952
|
-
for (let i = 0; i <= items.length - opts.minElements; i++)
|
|
953
|
-
const
|
|
954
|
-
|
|
955
|
-
const pattern = buildPattern(window, opts.whitespace);
|
|
956
|
-
if (!stats.has(pattern)) {
|
|
957
|
-
if (stats.size >= opts.maxUniquePatterns) continue;
|
|
958
|
-
stats.set(pattern, {
|
|
959
|
-
count: 0,
|
|
960
|
-
examples: [],
|
|
961
|
-
...computeWindowStats(window)
|
|
962
|
-
});
|
|
963
|
-
}
|
|
964
|
-
const entry = stats.get(pattern);
|
|
965
|
-
entry.count++;
|
|
966
|
-
if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
|
|
1096
|
+
for (let i = 0; i <= items.length - opts.minElements; i++) {
|
|
1097
|
+
const maxWindowSize = Math.min(opts.maxElements, items.length - i);
|
|
1098
|
+
for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
|
|
967
1099
|
}
|
|
968
1100
|
};
|
|
969
1101
|
/**
|
|
@@ -985,7 +1117,6 @@ const analyzeRepeatingSequences = (pages, options) => {
|
|
|
985
1117
|
pattern
|
|
986
1118
|
}));
|
|
987
1119
|
};
|
|
988
|
-
|
|
989
1120
|
//#endregion
|
|
990
1121
|
//#region src/detection.ts
|
|
991
1122
|
/**
|
|
@@ -1147,7 +1278,6 @@ const analyzeTextForRule = (text) => {
|
|
|
1147
1278
|
...suggestPatternConfig(detected)
|
|
1148
1279
|
};
|
|
1149
1280
|
};
|
|
1150
|
-
|
|
1151
1281
|
//#endregion
|
|
1152
1282
|
//#region src/types/rules.ts
|
|
1153
1283
|
/**
|
|
@@ -1172,7 +1302,6 @@ const PATTERN_TYPE_KEYS = [
|
|
|
1172
1302
|
"template",
|
|
1173
1303
|
"regex"
|
|
1174
1304
|
];
|
|
1175
|
-
|
|
1176
1305
|
//#endregion
|
|
1177
1306
|
//#region src/optimization/optimize-rules.ts
|
|
1178
1307
|
const MERGEABLE_KEYS = new Set([
|
|
@@ -1231,7 +1360,6 @@ const optimizeRules = (rules) => {
|
|
|
1231
1360
|
rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
|
|
1232
1361
|
};
|
|
1233
1362
|
};
|
|
1234
|
-
|
|
1235
1363
|
//#endregion
|
|
1236
1364
|
//#region src/preprocessing/transforms.ts
|
|
1237
1365
|
/** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
|
|
@@ -1340,170 +1468,89 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
|
|
|
1340
1468
|
}
|
|
1341
1469
|
return result;
|
|
1342
1470
|
};
|
|
1343
|
-
|
|
1344
1471
|
//#endregion
|
|
1345
|
-
//#region src/segmentation/rule
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
*
|
|
1355
|
-
* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
|
|
1356
|
-
*/
|
|
1357
|
-
const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
|
|
1358
|
-
/**
|
|
1359
|
-
* Extracts named capture group names from a regex pattern.
|
|
1360
|
-
*
|
|
1361
|
-
* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
|
|
1362
|
-
*
|
|
1363
|
-
* @example
|
|
1364
|
-
* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
|
|
1365
|
-
* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
|
|
1366
|
-
* extractNamedCaptureNames('^\\d+') // []
|
|
1367
|
-
*/
|
|
1368
|
-
const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([^>]+)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
|
|
1369
|
-
/**
|
|
1370
|
-
* Safely compiles a regex pattern, throwing a helpful error if invalid.
|
|
1371
|
-
*/
|
|
1372
|
-
const compileRuleRegex = (pattern) => {
|
|
1373
|
-
try {
|
|
1374
|
-
return new RegExp(pattern, "gmu");
|
|
1375
|
-
} catch (error) {
|
|
1376
|
-
throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
|
|
1472
|
+
//#region src/segmentation/arabic-dictionary-rule.ts
|
|
1473
|
+
const uniqueNormalizedWords = (words) => {
|
|
1474
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1475
|
+
const result = [];
|
|
1476
|
+
for (const word of words) {
|
|
1477
|
+
const normalized = normalizeArabicForComparison(word);
|
|
1478
|
+
if (!normalized || seen.has(normalized)) continue;
|
|
1479
|
+
seen.add(normalized);
|
|
1480
|
+
result.push(normalized);
|
|
1377
1481
|
}
|
|
1482
|
+
return result;
|
|
1378
1483
|
};
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
*/
|
|
1384
|
-
const processPattern = (pattern, fuzzy, capturePrefix) => {
|
|
1385
|
-
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
|
|
1386
|
-
return {
|
|
1387
|
-
captureNames,
|
|
1388
|
-
pattern: expanded
|
|
1389
|
-
};
|
|
1390
|
-
};
|
|
1391
|
-
/**
|
|
1392
|
-
* Processes a breakpoint pattern by expanding tokens only.
|
|
1393
|
-
*
|
|
1394
|
-
* Unlike `processPattern`, this does NOT escape brackets because breakpoints
|
|
1395
|
-
* are treated as raw regex patterns (like the `regex` rule type).
|
|
1396
|
-
* Users have full control over regex syntax including `(?:...)` groups.
|
|
1397
|
-
*/
|
|
1398
|
-
const processBreakpointPattern = (pattern) => {
|
|
1399
|
-
const { pattern: expanded } = expandTokensWithCaptures(pattern);
|
|
1400
|
-
return expanded;
|
|
1401
|
-
};
|
|
1402
|
-
const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
1403
|
-
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
1404
|
-
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
1405
|
-
return {
|
|
1406
|
-
captureNames: processed.flatMap((p) => p.captureNames),
|
|
1407
|
-
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
|
|
1408
|
-
};
|
|
1409
|
-
};
|
|
1410
|
-
const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
1411
|
-
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
1412
|
-
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
1413
|
-
return {
|
|
1414
|
-
captureNames: processed.flatMap((p) => p.captureNames),
|
|
1415
|
-
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
|
|
1416
|
-
};
|
|
1484
|
+
const buildStopAlternation = (stopWords) => {
|
|
1485
|
+
const unique = uniqueNormalizedWords(stopWords);
|
|
1486
|
+
if (unique.length === 0) return "";
|
|
1487
|
+
return unique.map((word) => makeDiacriticInsensitive(word)).join("|");
|
|
1417
1488
|
};
|
|
1418
|
-
const
|
|
1419
|
-
|
|
1420
|
-
const
|
|
1421
|
-
return {
|
|
1422
|
-
captureNames: processed.flatMap((p) => p.captureNames),
|
|
1423
|
-
regex: `(?:${alternatives})$`
|
|
1424
|
-
};
|
|
1489
|
+
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
|
|
1490
|
+
if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
|
|
1491
|
+
const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
|
|
1492
|
+
return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
|
|
1425
1493
|
};
|
|
1426
|
-
const
|
|
1427
|
-
const
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
};
|
|
1494
|
+
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
|
|
1495
|
+
const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
1496
|
+
const withCapture = captureName ? `(?<${captureName}>${headwordBody})` : `(?:${headwordBody})`;
|
|
1497
|
+
if (!allowParenthesized) return `${withCapture}${colon}`;
|
|
1498
|
+
return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
|
|
1432
1499
|
};
|
|
1433
1500
|
/**
|
|
1434
|
-
*
|
|
1501
|
+
* Creates a reusable split rule for Arabic dictionary entries.
|
|
1435
1502
|
*
|
|
1436
|
-
*
|
|
1503
|
+
* The generated rule:
|
|
1504
|
+
* - keeps the lemma marker in `segment.content`
|
|
1505
|
+
* - stores the lemma in `segment.meta[captureName]`
|
|
1506
|
+
* - matches root entries at true line/page starts
|
|
1507
|
+
* - matches mid-line subentries conservatively when they begin with `و`
|
|
1508
|
+
* - can optionally support parenthesized headwords like `(عنبر) :`
|
|
1509
|
+
* - can optionally support comma-separated headword lists like `سبد، دبس:`
|
|
1510
|
+
*
|
|
1511
|
+
* @example
|
|
1512
|
+
* createArabicDictionaryEntryRule({
|
|
1513
|
+
* stopWords: ['وقيل', 'ويقال', 'قال'],
|
|
1514
|
+
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
1515
|
+
* })
|
|
1516
|
+
*
|
|
1517
|
+
* @example
|
|
1518
|
+
* createArabicDictionaryEntryRule({
|
|
1519
|
+
* allowParenthesized: true,
|
|
1520
|
+
* allowWhitespaceBeforeColon: true,
|
|
1521
|
+
* allowCommaSeparated: true,
|
|
1522
|
+
* stopWords: ['الليث', 'العجاج'],
|
|
1523
|
+
* })
|
|
1437
1524
|
*/
|
|
1438
|
-
const
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
}
|
|
1454
|
-
let finalRegex = regex;
|
|
1455
|
-
let allCaptureNames = [];
|
|
1456
|
-
if (lineStartsWith?.length) {
|
|
1457
|
-
const res = buildLineStartsWithRegexSource(lineStartsWith, fuzzy, capturePrefix);
|
|
1458
|
-
finalRegex = res.regex;
|
|
1459
|
-
allCaptureNames = res.captureNames;
|
|
1460
|
-
}
|
|
1461
|
-
if (lineEndsWith?.length) {
|
|
1462
|
-
const res = buildLineEndsWithRegexSource(lineEndsWith, fuzzy, capturePrefix);
|
|
1463
|
-
finalRegex = res.regex;
|
|
1464
|
-
allCaptureNames = res.captureNames;
|
|
1465
|
-
}
|
|
1466
|
-
if (template) {
|
|
1467
|
-
const res = buildTemplateRegexSource(template, capturePrefix);
|
|
1468
|
-
finalRegex = res.regex;
|
|
1469
|
-
allCaptureNames = [...allCaptureNames, ...res.captureNames];
|
|
1470
|
-
}
|
|
1471
|
-
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
|
|
1472
|
-
if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
|
|
1525
|
+
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
|
|
1526
|
+
if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
|
|
1527
|
+
if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
|
|
1528
|
+
if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
|
|
1529
|
+
const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
|
|
1530
|
+
const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
|
|
1531
|
+
const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
|
|
1532
|
+
const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
|
|
1533
|
+
const stopAlternation = buildStopAlternation(stopWords);
|
|
1534
|
+
const lemmaBody = buildHeadwordBody({
|
|
1535
|
+
allowCommaSeparated,
|
|
1536
|
+
colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
|
|
1537
|
+
stopAlternation,
|
|
1538
|
+
stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
|
|
1539
|
+
unit: lemmaUnit
|
|
1540
|
+
});
|
|
1473
1541
|
return {
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1542
|
+
meta,
|
|
1543
|
+
pageStartPrevWordStoplist,
|
|
1544
|
+
regex: `(?:${`(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`}|${allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`})` + buildBalancedMarker({
|
|
1545
|
+
allowParenthesized,
|
|
1546
|
+
allowWhitespaceBeforeColon,
|
|
1547
|
+
captureName,
|
|
1548
|
+
headwordBody: lemmaBody
|
|
1549
|
+
}),
|
|
1550
|
+
samePagePrevWordStoplist,
|
|
1551
|
+
split: "at"
|
|
1478
1552
|
};
|
|
1479
1553
|
};
|
|
1480
|
-
|
|
1481
|
-
//#endregion
|
|
1482
|
-
//#region src/segmentation/breakpoint-constants.ts
|
|
1483
|
-
/**
|
|
1484
|
-
* Shared constants for segmentation breakpoint processing.
|
|
1485
|
-
*/
|
|
1486
|
-
/**
|
|
1487
|
-
* Threshold for using offset-based fast path in boundary processing.
|
|
1488
|
-
*
|
|
1489
|
-
* Below this: accurate string-search (handles offset drift from structural rules).
|
|
1490
|
-
* At or above this: O(n) arithmetic (performance critical for large books).
|
|
1491
|
-
*
|
|
1492
|
-
* The value of 1000 is chosen based on typical Arabic book sizes:
|
|
1493
|
-
* - Sahih al-Bukhari: ~1000-3000 pages
|
|
1494
|
-
* - Standard hadith collections: 1000-7000 pages
|
|
1495
|
-
* - Large aggregated corpora: 10k-50k pages
|
|
1496
|
-
*
|
|
1497
|
-
* For segments ≥1000 pages, the performance gain from offset-based slicing
|
|
1498
|
-
* outweighs the minor accuracy loss from potential offset drift.
|
|
1499
|
-
*
|
|
1500
|
-
* @remarks
|
|
1501
|
-
* Fast path is skipped when:
|
|
1502
|
-
* - `maxContentLength` is set (requires character-accurate splitting)
|
|
1503
|
-
* - `debugMetaKey` is set (requires proper provenance tracking)
|
|
1504
|
-
* - Content was structurally modified by marker stripping (offsets may drift)
|
|
1505
|
-
*/
|
|
1506
|
-
const FAST_PATH_THRESHOLD = 1e3;
|
|
1507
1554
|
const WINDOW_PREFIX_LENGTHS = [
|
|
1508
1555
|
80,
|
|
1509
1556
|
60,
|
|
@@ -1530,23 +1577,6 @@ const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۞]/;
|
|
|
1530
1577
|
* Matches outside this range are rejected unless `ignoreDeviation` is active.
|
|
1531
1578
|
*/
|
|
1532
1579
|
const MAX_DEVIATION = 2e3;
|
|
1533
|
-
/**
|
|
1534
|
-
* Penalty score applied to non-newline anchor candidates.
|
|
1535
|
-
*
|
|
1536
|
-
* Designed to prioritize newline-aligned boundaries unless a whitespace match is
|
|
1537
|
-
* significantly closer (within 20 chars). Handles cases where marker stripping
|
|
1538
|
-
* shifts the boundary slightly.
|
|
1539
|
-
*/
|
|
1540
|
-
const NON_NEWLINE_PENALTY = 20;
|
|
1541
|
-
/**
|
|
1542
|
-
* Limit for inferring start offset from a relaxed search (characters).
|
|
1543
|
-
*
|
|
1544
|
-
* If the relaxed search finds a match more than this distance away from the
|
|
1545
|
-
* expected position, we assume it's a false positive (e.g. repeated content)
|
|
1546
|
-
* and do not use it to infer the start offset.
|
|
1547
|
-
*/
|
|
1548
|
-
const INFERENCE_PROXIMITY_LIMIT = 500;
|
|
1549
|
-
|
|
1550
1580
|
//#endregion
|
|
1551
1581
|
//#region src/segmentation/match-utils.ts
|
|
1552
1582
|
/**
|
|
@@ -1665,7 +1695,6 @@ const extractDebugIndex = (groups, prefix) => {
|
|
|
1665
1695
|
if (!Number.isNaN(idx)) return idx;
|
|
1666
1696
|
}
|
|
1667
1697
|
};
|
|
1668
|
-
|
|
1669
1698
|
//#endregion
|
|
1670
1699
|
//#region src/segmentation/breakpoint-utils.ts
|
|
1671
1700
|
/**
|
|
@@ -2067,8 +2096,8 @@ const findAnchorCandidates = (content, prefix, start, end) => {
|
|
|
2067
2096
|
/** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
|
|
2068
2097
|
const selectBestAnchor = (candidates, expectedBoundary) => {
|
|
2069
2098
|
return candidates.reduce((best, curr) => {
|
|
2070
|
-
const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 :
|
|
2071
|
-
return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 :
|
|
2099
|
+
const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
|
|
2100
|
+
return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
|
|
2072
2101
|
});
|
|
2073
2102
|
};
|
|
2074
2103
|
/**
|
|
@@ -2122,7 +2151,7 @@ const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetI
|
|
|
2122
2151
|
if (relaxedPos > 0) {
|
|
2123
2152
|
const inferredStartOffset = rawBoundary - relaxedPos;
|
|
2124
2153
|
const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
|
|
2125
|
-
if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) <
|
|
2154
|
+
if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
|
|
2126
2155
|
startOffsetInFromPage = inferredStartOffset;
|
|
2127
2156
|
expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
|
|
2128
2157
|
pos = relaxedPos;
|
|
@@ -2196,7 +2225,7 @@ const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCoun
|
|
|
2196
2225
|
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
|
|
2197
2226
|
const pageCount = toIdx - fromIdx + 1;
|
|
2198
2227
|
const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
|
|
2199
|
-
if (pageCount >=
|
|
2228
|
+
if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
|
|
2200
2229
|
return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
|
|
2201
2230
|
};
|
|
2202
2231
|
/**
|
|
@@ -2428,7 +2457,6 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
|
|
|
2428
2457
|
}
|
|
2429
2458
|
return -1;
|
|
2430
2459
|
};
|
|
2431
|
-
|
|
2432
2460
|
//#endregion
|
|
2433
2461
|
//#region src/segmentation/debug-meta.ts
|
|
2434
2462
|
const resolveDebugConfig = (debug) => {
|
|
@@ -2470,59 +2498,197 @@ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
|
|
|
2470
2498
|
...word !== void 0 ? { word } : {}
|
|
2471
2499
|
} };
|
|
2472
2500
|
};
|
|
2473
|
-
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
|
|
2474
|
-
index: breakpointIndex,
|
|
2475
|
-
kind: rule.pattern === "" ? "pageBoundary" : "pattern",
|
|
2476
|
-
pattern: rule.pattern ?? rule.regex,
|
|
2477
|
-
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2478
|
-
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
|
|
2479
|
-
} });
|
|
2501
|
+
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
|
|
2502
|
+
index: breakpointIndex,
|
|
2503
|
+
kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
|
|
2504
|
+
pattern: rule.pattern ?? rule.regex,
|
|
2505
|
+
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2506
|
+
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
|
|
2507
|
+
} });
|
|
2508
|
+
/**
|
|
2509
|
+
* Helper to format the debug info into a human-readable string.
|
|
2510
|
+
* @param meta - The segment metadata object
|
|
2511
|
+
* @param options - Formatting options
|
|
2512
|
+
*/
|
|
2513
|
+
const formatRuleReason = (rule, concise) => {
|
|
2514
|
+
const { index, patternType, wordIndex, word } = rule;
|
|
2515
|
+
if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
|
|
2516
|
+
const wordInfo = word ? ` (Matched: "${word}")` : "";
|
|
2517
|
+
return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
|
|
2518
|
+
};
|
|
2519
|
+
const formatBreakpointReason = (breakpoint, concise) => {
|
|
2520
|
+
const { index, kind, pattern, wordIndex, word } = breakpoint;
|
|
2521
|
+
if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
|
|
2522
|
+
if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
|
|
2523
|
+
if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
|
|
2524
|
+
return `Breakpoint #${index} (${kind}) - "${pattern}"`;
|
|
2525
|
+
};
|
|
2526
|
+
const formatContentLengthReason = (split, concise) => {
|
|
2527
|
+
const { maxContentLength, splitReason } = split;
|
|
2528
|
+
if (concise) return `> ${maxContentLength} (${splitReason})`;
|
|
2529
|
+
return `Safety Split (${splitReason}) > ${maxContentLength}`;
|
|
2530
|
+
};
|
|
2531
|
+
/**
|
|
2532
|
+
* Helper to format the debug info into a human-readable string.
|
|
2533
|
+
* @param meta - The segment metadata object
|
|
2534
|
+
* @param options - Formatting options
|
|
2535
|
+
*/
|
|
2536
|
+
const getDebugReason = (meta, options) => {
|
|
2537
|
+
const debug = meta?._flappa;
|
|
2538
|
+
if (!debug) return "-";
|
|
2539
|
+
const concise = options?.concise;
|
|
2540
|
+
if (debug.rule) return formatRuleReason(debug.rule, concise);
|
|
2541
|
+
if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
|
|
2542
|
+
if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
|
|
2543
|
+
return "Unknown";
|
|
2544
|
+
};
|
|
2545
|
+
/**
|
|
2546
|
+
* Convenience helper to get the formatted debug reason directly from a segment.
|
|
2547
|
+
* @param segment - The segment object
|
|
2548
|
+
* @param options - Formatting options
|
|
2549
|
+
*/
|
|
2550
|
+
const getSegmentDebugReason = (segment, options) => {
|
|
2551
|
+
return getDebugReason(segment.meta, options);
|
|
2552
|
+
};
|
|
2553
|
+
//#endregion
|
|
2554
|
+
//#region src/segmentation/pattern-validator.ts
|
|
2555
|
+
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
2556
|
+
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
2557
|
+
const buildBareTokenRegex = () => {
|
|
2558
|
+
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
|
|
2559
|
+
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
|
|
2560
|
+
};
|
|
2561
|
+
/**
|
|
2562
|
+
* Validates a single pattern for common issues.
|
|
2563
|
+
*/
|
|
2564
|
+
const validatePattern = (pattern, seenPatterns) => {
|
|
2565
|
+
if (!pattern.trim()) return {
|
|
2566
|
+
message: "Empty pattern is not allowed",
|
|
2567
|
+
type: "empty_pattern"
|
|
2568
|
+
};
|
|
2569
|
+
if (seenPatterns.has(pattern)) return {
|
|
2570
|
+
message: `Duplicate pattern: "${pattern}"`,
|
|
2571
|
+
pattern,
|
|
2572
|
+
type: "duplicate"
|
|
2573
|
+
};
|
|
2574
|
+
seenPatterns.add(pattern);
|
|
2575
|
+
TOKEN_INSIDE_BRACES.lastIndex = 0;
|
|
2576
|
+
for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
|
|
2577
|
+
const name = match[1];
|
|
2578
|
+
if (!KNOWN_TOKENS.has(name)) return {
|
|
2579
|
+
message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
|
|
2580
|
+
suggestion: "Check spelling or use a known token",
|
|
2581
|
+
token: name,
|
|
2582
|
+
type: "unknown_token"
|
|
2583
|
+
};
|
|
2584
|
+
}
|
|
2585
|
+
for (const match of pattern.matchAll(buildBareTokenRegex())) {
|
|
2586
|
+
const [full, name] = match;
|
|
2587
|
+
const idx = match.index;
|
|
2588
|
+
if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
|
|
2589
|
+
message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
|
|
2590
|
+
suggestion: `{{${full}}}`,
|
|
2591
|
+
token: name,
|
|
2592
|
+
type: "missing_braces"
|
|
2593
|
+
};
|
|
2594
|
+
}
|
|
2595
|
+
};
|
|
2480
2596
|
/**
|
|
2481
|
-
*
|
|
2482
|
-
* @param meta - The segment metadata object
|
|
2483
|
-
* @param options - Formatting options
|
|
2597
|
+
* Validates an array of patterns, returning parallel array of issues.
|
|
2484
2598
|
*/
|
|
2485
|
-
const
|
|
2486
|
-
const
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
|
|
2599
|
+
const validatePatternArray = (patterns) => {
|
|
2600
|
+
const seen = /* @__PURE__ */ new Set();
|
|
2601
|
+
const issues = patterns.map((p) => validatePattern(p, seen));
|
|
2602
|
+
return issues.some(Boolean) ? issues : void 0;
|
|
2490
2603
|
};
|
|
2491
|
-
const
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
if (
|
|
2495
|
-
|
|
2496
|
-
return
|
|
2604
|
+
const applyRulePatternValidation = (result, key, patterns) => {
|
|
2605
|
+
if (!patterns) return false;
|
|
2606
|
+
const issues = validatePatternArray(patterns);
|
|
2607
|
+
if (!issues) return false;
|
|
2608
|
+
result[key] = issues;
|
|
2609
|
+
return true;
|
|
2497
2610
|
};
|
|
2498
|
-
const
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2611
|
+
const validateTemplateRule = (rule, result) => {
|
|
2612
|
+
if (rule.template === void 0) return false;
|
|
2613
|
+
const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
|
|
2614
|
+
if (!issue) return false;
|
|
2615
|
+
result.template = issue;
|
|
2616
|
+
return true;
|
|
2617
|
+
};
|
|
2618
|
+
const validateRegexRule = (rule, result) => {
|
|
2619
|
+
if (rule.regex === void 0) return false;
|
|
2620
|
+
if (!rule.regex.trim()) {
|
|
2621
|
+
result.regex = {
|
|
2622
|
+
message: "Empty pattern is not allowed",
|
|
2623
|
+
type: "empty_pattern"
|
|
2624
|
+
};
|
|
2625
|
+
return true;
|
|
2626
|
+
}
|
|
2627
|
+
try {
|
|
2628
|
+
new RegExp(rule.regex, "u");
|
|
2629
|
+
return false;
|
|
2630
|
+
} catch (error) {
|
|
2631
|
+
result.regex = {
|
|
2632
|
+
message: error instanceof Error ? error.message : String(error),
|
|
2633
|
+
pattern: rule.regex,
|
|
2634
|
+
type: "invalid_regex"
|
|
2635
|
+
};
|
|
2636
|
+
return true;
|
|
2637
|
+
}
|
|
2638
|
+
};
|
|
2639
|
+
const formatValidationIssue = (_type, issue, loc) => {
|
|
2640
|
+
if (!issue) return null;
|
|
2641
|
+
if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
|
|
2642
|
+
if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
|
|
2643
|
+
if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
|
|
2644
|
+
if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
|
|
2645
|
+
return `${loc}: ${issue.message || issue.type}`;
|
|
2502
2646
|
};
|
|
2503
2647
|
/**
|
|
2504
|
-
*
|
|
2505
|
-
*
|
|
2506
|
-
*
|
|
2648
|
+
* Validates split rules for common pattern issues.
|
|
2649
|
+
*
|
|
2650
|
+
* Checks for:
|
|
2651
|
+
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
|
|
2652
|
+
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
|
|
2653
|
+
* - Duplicate patterns within the same rule
|
|
2654
|
+
*
|
|
2655
|
+
* @param rules - Array of split rules to validate
|
|
2656
|
+
* @returns Array parallel to input with validation results (undefined if no issues)
|
|
2657
|
+
*
|
|
2658
|
+
* @example
|
|
2659
|
+
* const issues = validateRules([
|
|
2660
|
+
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
|
|
2661
|
+
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
|
|
2662
|
+
* ]);
|
|
2663
|
+
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
|
|
2664
|
+
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
|
|
2507
2665
|
*/
|
|
2508
|
-
const
|
|
2509
|
-
const
|
|
2510
|
-
|
|
2511
|
-
const
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
return
|
|
2516
|
-
};
|
|
2666
|
+
const validateRules = (rules) => rules.map((rule) => {
|
|
2667
|
+
const result = {};
|
|
2668
|
+
const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
|
|
2669
|
+
const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
|
|
2670
|
+
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
|
|
2671
|
+
const templateIssues = validateTemplateRule(rule, result);
|
|
2672
|
+
const regexIssues = validateRegexRule(rule, result);
|
|
2673
|
+
return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues ? result : void 0;
|
|
2674
|
+
});
|
|
2517
2675
|
/**
|
|
2518
|
-
*
|
|
2519
|
-
*
|
|
2520
|
-
*
|
|
2676
|
+
* Formats a validation result array into a list of human-readable error messages.
|
|
2677
|
+
*
|
|
2678
|
+
* Useful for displaying validation errors in UIs.
|
|
2679
|
+
*
|
|
2680
|
+
* @param results - The result array from `validateRules()`
|
|
2681
|
+
* @returns Array of formatted error strings
|
|
2682
|
+
*
|
|
2683
|
+
* @example
|
|
2684
|
+
* const issues = validateRules(rules);
|
|
2685
|
+
* const errors = formatValidationReport(issues);
|
|
2686
|
+
* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
|
|
2521
2687
|
*/
|
|
2522
|
-
const
|
|
2523
|
-
|
|
2524
|
-
};
|
|
2525
|
-
|
|
2688
|
+
const formatValidationReport = (results) => results.flatMap((result, i) => {
|
|
2689
|
+
if (!result) return [];
|
|
2690
|
+
return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${i + 1}, ${type}`)).filter((msg) => msg !== null));
|
|
2691
|
+
});
|
|
2526
2692
|
//#endregion
|
|
2527
2693
|
//#region src/segmentation/breakpoint-processor.ts
|
|
2528
2694
|
const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
|
|
@@ -2650,7 +2816,7 @@ const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx,
|
|
|
2650
2816
|
const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
|
|
2651
2817
|
const driftTolerance = Math.max(100, fullContent.length * .01);
|
|
2652
2818
|
const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
|
|
2653
|
-
if (!isAligned && pageCount >=
|
|
2819
|
+
if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
|
|
2654
2820
|
actualLength: fullContent.length,
|
|
2655
2821
|
drift: Math.abs(expectedLength - fullContent.length),
|
|
2656
2822
|
expectedLength,
|
|
@@ -2791,8 +2957,7 @@ const computeWindowEndPositionForIteration = (remainingContent, cursorPos, curre
|
|
|
2791
2957
|
if (maxPages === 0) {
|
|
2792
2958
|
const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
|
|
2793
2959
|
const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
|
|
2794
|
-
|
|
2795
|
-
return Math.min(capped, remainingContent.length);
|
|
2960
|
+
return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
|
|
2796
2961
|
}
|
|
2797
2962
|
const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
|
|
2798
2963
|
return Math.min(pos, remainingContent.length);
|
|
@@ -2847,7 +3012,7 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
|
|
|
2847
3012
|
const pageCount = toIdx - fromIdx + 1;
|
|
2848
3013
|
const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
|
|
2849
3014
|
const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
|
|
2850
|
-
if (pageCount <
|
|
3015
|
+
if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
|
|
2851
3016
|
if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
|
|
2852
3017
|
return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
|
|
2853
3018
|
};
|
|
@@ -3030,7 +3195,178 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
3030
3195
|
logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
|
|
3031
3196
|
return result;
|
|
3032
3197
|
};
|
|
3033
|
-
|
|
3198
|
+
//#endregion
|
|
3199
|
+
//#region src/segmentation/rule-regex.ts
|
|
3200
|
+
/**
|
|
3201
|
+
* Checks if a regex pattern contains standard (anonymous) capturing groups.
|
|
3202
|
+
*
|
|
3203
|
+
* Detects standard capturing groups `(...)` while excluding:
|
|
3204
|
+
* - Non-capturing groups `(?:...)`
|
|
3205
|
+
* - Lookahead assertions `(?=...)` and `(?!...)`
|
|
3206
|
+
* - Lookbehind assertions `(?<=...)` and `(?<!...)`
|
|
3207
|
+
* - Named groups `(?<name>...)` (start with `(?` so excluded here)
|
|
3208
|
+
*
|
|
3209
|
+
* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
|
|
3210
|
+
*/
|
|
3211
|
+
const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
|
|
3212
|
+
/**
|
|
3213
|
+
* Extracts named capture group names from a regex pattern.
|
|
3214
|
+
*
|
|
3215
|
+
* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
|
|
3216
|
+
*
|
|
3217
|
+
* @example
|
|
3218
|
+
* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
|
|
3219
|
+
* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
|
|
3220
|
+
* extractNamedCaptureNames('^\\d+') // []
|
|
3221
|
+
*/
|
|
3222
|
+
const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
|
|
3223
|
+
/**
|
|
3224
|
+
* Safely compiles a regex pattern, throwing a helpful error if invalid.
|
|
3225
|
+
*/
|
|
3226
|
+
const compileRuleRegex = (pattern) => {
|
|
3227
|
+
try {
|
|
3228
|
+
return new RegExp(pattern, "gmu");
|
|
3229
|
+
} catch (error) {
|
|
3230
|
+
throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
|
|
3231
|
+
}
|
|
3232
|
+
};
|
|
3233
|
+
/**
|
|
3234
|
+
* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
|
|
3235
|
+
*
|
|
3236
|
+
* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
|
|
3237
|
+
*/
|
|
3238
|
+
const processPattern = (pattern, fuzzy, capturePrefix) => {
|
|
3239
|
+
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
|
|
3240
|
+
return {
|
|
3241
|
+
captureNames,
|
|
3242
|
+
pattern: expanded
|
|
3243
|
+
};
|
|
3244
|
+
};
|
|
3245
|
+
/**
|
|
3246
|
+
* Processes a breakpoint pattern by expanding tokens only.
|
|
3247
|
+
*
|
|
3248
|
+
* Unlike `processPattern`, this does NOT escape brackets because breakpoints
|
|
3249
|
+
* are treated as raw regex patterns (like the `regex` rule type).
|
|
3250
|
+
* Users have full control over regex syntax including `(?:...)` groups.
|
|
3251
|
+
*/
|
|
3252
|
+
const processBreakpointPattern = (pattern) => {
|
|
3253
|
+
const { pattern: expanded } = expandTokensWithCaptures(pattern);
|
|
3254
|
+
return expanded;
|
|
3255
|
+
};
|
|
3256
|
+
/**
|
|
3257
|
+
* Builds the raw regex source for a `lineStartsAfter` rule.
|
|
3258
|
+
*
|
|
3259
|
+
* Expands each pattern through `processPattern()`, combines them into an
|
|
3260
|
+
* alternation at the start of a line, and appends a trailing content capture.
|
|
3261
|
+
*
|
|
3262
|
+
* @param patterns - Template-like line-start markers to match
|
|
3263
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3264
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3265
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3266
|
+
*/
|
|
3267
|
+
const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3268
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3269
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3270
|
+
return {
|
|
3271
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3272
|
+
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
|
|
3273
|
+
};
|
|
3274
|
+
};
|
|
3275
|
+
/**
|
|
3276
|
+
* Builds the raw regex source for a `lineStartsWith` rule.
|
|
3277
|
+
*
|
|
3278
|
+
* Expands each pattern through `processPattern()` and combines them into an
|
|
3279
|
+
* alternation anchored at the start of a line.
|
|
3280
|
+
*
|
|
3281
|
+
* @param patterns - Template-like line-start markers to match
|
|
3282
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3283
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3284
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3285
|
+
*/
|
|
3286
|
+
const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3287
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3288
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3289
|
+
return {
|
|
3290
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3291
|
+
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
|
|
3292
|
+
};
|
|
3293
|
+
};
|
|
3294
|
+
/**
|
|
3295
|
+
* Builds the raw regex source for a `lineEndsWith` rule.
|
|
3296
|
+
*
|
|
3297
|
+
* Expands each pattern through `processPattern()` and combines them into an
|
|
3298
|
+
* end-anchored alternation.
|
|
3299
|
+
*
|
|
3300
|
+
* @param patterns - Template-like line-end markers to match
|
|
3301
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3302
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3303
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3304
|
+
*/
|
|
3305
|
+
const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3306
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3307
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3308
|
+
return {
|
|
3309
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3310
|
+
regex: `(?:${alternatives})$`
|
|
3311
|
+
};
|
|
3312
|
+
};
|
|
3313
|
+
/**
|
|
3314
|
+
* Builds the raw regex source for a `template` rule.
|
|
3315
|
+
*
|
|
3316
|
+
* Expands tokens and named captures via `expandTokensWithCaptures()` after
|
|
3317
|
+
* applying `escapeTemplateBrackets()` to non-token brackets.
|
|
3318
|
+
*
|
|
3319
|
+
* @param template - Template string containing optional `{{token}}` markers
|
|
3320
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3321
|
+
* @returns Regex source plus the named captures extracted from the template
|
|
3322
|
+
*/
|
|
3323
|
+
const buildTemplateRegexSource = (template, capturePrefix) => {
|
|
3324
|
+
const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
|
|
3325
|
+
return {
|
|
3326
|
+
captureNames,
|
|
3327
|
+
regex: pattern
|
|
3328
|
+
};
|
|
3329
|
+
};
|
|
3330
|
+
const getFuzzyCandidatePatterns = (rule) => [
|
|
3331
|
+
..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
|
|
3332
|
+
..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
|
|
3333
|
+
..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
|
|
3334
|
+
];
|
|
3335
|
+
const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
|
|
3336
|
+
if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
|
|
3337
|
+
if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
|
|
3338
|
+
if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
|
|
3339
|
+
return null;
|
|
3340
|
+
};
|
|
3341
|
+
/**
|
|
3342
|
+
* Builds a compiled regex and metadata from a split rule.
|
|
3343
|
+
*
|
|
3344
|
+
* Behavior mirrors the previous implementation in `segmenter.ts`.
|
|
3345
|
+
*/
|
|
3346
|
+
const buildRuleRegex = (rule, capturePrefix) => {
|
|
3347
|
+
const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
|
|
3348
|
+
if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
|
|
3349
|
+
const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
|
|
3350
|
+
return {
|
|
3351
|
+
captureNames,
|
|
3352
|
+
regex: compileRuleRegex(lsaRegex),
|
|
3353
|
+
usesCapture: true,
|
|
3354
|
+
usesLineStartsAfter: true
|
|
3355
|
+
};
|
|
3356
|
+
}
|
|
3357
|
+
const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
|
|
3358
|
+
let finalRegex = ruleRegexSource?.regex;
|
|
3359
|
+
let allCaptureNames = ruleRegexSource?.captureNames ?? [];
|
|
3360
|
+
if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
|
|
3361
|
+
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
|
|
3362
|
+
if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
|
|
3363
|
+
return {
|
|
3364
|
+
captureNames: allCaptureNames,
|
|
3365
|
+
regex: compileRuleRegex(finalRegex),
|
|
3366
|
+
usesCapture: hasCapturingGroup(finalRegex),
|
|
3367
|
+
usesLineStartsAfter: false
|
|
3368
|
+
};
|
|
3369
|
+
};
|
|
3034
3370
|
//#endregion
|
|
3035
3371
|
//#region src/segmentation/fast-fuzzy-prefix.ts
|
|
3036
3372
|
/**
|
|
@@ -3078,9 +3414,8 @@ const compileFastFuzzyTokenRule = (tokenTemplate) => {
|
|
|
3078
3414
|
const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
|
|
3079
3415
|
if (!m) return null;
|
|
3080
3416
|
const token = m[1];
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
const compiled = compileLiteralAlternation(tokenPattern);
|
|
3417
|
+
if (!(token in TOKEN_PATTERNS)) return null;
|
|
3418
|
+
const compiled = compileLiteralAlternation(getTokenPattern(token));
|
|
3084
3419
|
return compiled ? {
|
|
3085
3420
|
alternatives: compiled.alternatives,
|
|
3086
3421
|
token
|
|
@@ -3093,11 +3428,11 @@ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
|
|
|
3093
3428
|
}
|
|
3094
3429
|
return null;
|
|
3095
3430
|
};
|
|
3096
|
-
|
|
3097
3431
|
//#endregion
|
|
3098
3432
|
//#region src/segmentation/segmenter-rule-utils.ts
|
|
3099
3433
|
const tryCompileFastFuzzyRule = (rule) => {
|
|
3100
|
-
|
|
3434
|
+
const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
|
|
3435
|
+
if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
|
|
3101
3436
|
if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
|
|
3102
3437
|
const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
|
|
3103
3438
|
if (compiled) return {
|
|
@@ -3139,7 +3474,10 @@ const partitionRulesForMatching = (rules) => {
|
|
|
3139
3474
|
prefix: `r${index}_`,
|
|
3140
3475
|
rule
|
|
3141
3476
|
});
|
|
3142
|
-
else standaloneRules.push(
|
|
3477
|
+
else standaloneRules.push({
|
|
3478
|
+
index,
|
|
3479
|
+
rule
|
|
3480
|
+
});
|
|
3143
3481
|
}
|
|
3144
3482
|
return {
|
|
3145
3483
|
combinableRules,
|
|
@@ -3147,9 +3485,37 @@ const partitionRulesForMatching = (rules) => {
|
|
|
3147
3485
|
standaloneRules
|
|
3148
3486
|
};
|
|
3149
3487
|
};
|
|
3488
|
+
const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
|
|
3489
|
+
const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
|
|
3490
|
+
const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
|
|
3491
|
+
const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
|
|
3492
|
+
const trimTrailingPageWrapNoise = (text) => {
|
|
3493
|
+
let trimmed = text.trimEnd();
|
|
3494
|
+
while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
|
|
3495
|
+
return trimmed;
|
|
3496
|
+
};
|
|
3497
|
+
const endsWithStrongSentenceTerminator = (pageContent) => {
|
|
3498
|
+
return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
|
|
3499
|
+
};
|
|
3500
|
+
const extractLastArabicWord = (pageContent) => {
|
|
3501
|
+
return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
|
|
3502
|
+
};
|
|
3503
|
+
const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
|
|
3504
|
+
if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
|
|
3505
|
+
const lastWord = extractLastArabicWord(previousPageContent);
|
|
3506
|
+
return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
|
|
3507
|
+
};
|
|
3508
|
+
const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
|
|
3509
|
+
if (!stoplist) return true;
|
|
3510
|
+
const lastWord = extractLastArabicWord(contentBeforeMatch);
|
|
3511
|
+
return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
|
|
3512
|
+
};
|
|
3150
3513
|
const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
3151
3514
|
const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
|
|
3152
3515
|
const compiledPageStartPrev = /* @__PURE__ */ new Map();
|
|
3516
|
+
const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
|
|
3517
|
+
const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
|
|
3518
|
+
const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
|
|
3153
3519
|
const getPageStartPrevRegex = (rule, ruleIndex) => {
|
|
3154
3520
|
if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
|
|
3155
3521
|
const pattern = rule.pageStartGuard;
|
|
@@ -3161,6 +3527,33 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
|
3161
3527
|
compiledPageStartPrev.set(ruleIndex, re);
|
|
3162
3528
|
return re;
|
|
3163
3529
|
};
|
|
3530
|
+
const getPrevWordStoplist = (rule, ruleIndex) => {
|
|
3531
|
+
if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
|
|
3532
|
+
const stoplist = rule.pageStartPrevWordStoplist;
|
|
3533
|
+
if (!stoplist?.length) {
|
|
3534
|
+
compiledPrevWordStoplists.set(ruleIndex, null);
|
|
3535
|
+
return null;
|
|
3536
|
+
}
|
|
3537
|
+
const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
|
|
3538
|
+
compiledPrevWordStoplists.set(ruleIndex, normalized);
|
|
3539
|
+
return normalized;
|
|
3540
|
+
};
|
|
3541
|
+
const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
|
|
3542
|
+
if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
|
|
3543
|
+
const stoplist = rule.samePagePrevWordStoplist;
|
|
3544
|
+
if (!stoplist?.length) {
|
|
3545
|
+
compiledSamePagePrevWordStoplists.set(ruleIndex, null);
|
|
3546
|
+
return null;
|
|
3547
|
+
}
|
|
3548
|
+
const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
|
|
3549
|
+
compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
|
|
3550
|
+
return normalized;
|
|
3551
|
+
};
|
|
3552
|
+
const getPreviousPageContent = (boundaryIndex) => {
|
|
3553
|
+
if (boundaryIndex <= 0) return "";
|
|
3554
|
+
const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
|
|
3555
|
+
return matchContent.slice(prevBoundary.start, prevBoundary.end);
|
|
3556
|
+
};
|
|
3164
3557
|
const getPrevPageLastNonWsChar = (boundaryIndex) => {
|
|
3165
3558
|
if (boundaryIndex <= 0) return "";
|
|
3166
3559
|
const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
|
|
@@ -3170,13 +3563,24 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
|
3170
3563
|
}
|
|
3171
3564
|
return "";
|
|
3172
3565
|
};
|
|
3566
|
+
const getCurrentPageContentBeforeMatch = (matchStart) => {
|
|
3567
|
+
const pageId = pageMap.getId(matchStart);
|
|
3568
|
+
const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
|
|
3569
|
+
if (boundaryIndex === void 0) return "";
|
|
3570
|
+
const boundary = pageMap.boundaries[boundaryIndex];
|
|
3571
|
+
return matchContent.slice(boundary.start, matchStart);
|
|
3572
|
+
};
|
|
3173
3573
|
return (rule, ruleIndex, matchStart) => {
|
|
3174
3574
|
const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
|
|
3175
|
-
if (boundaryIndex
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3575
|
+
if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
|
|
3576
|
+
const prevReq = getPageStartPrevRegex(rule, ruleIndex);
|
|
3577
|
+
if (prevReq) {
|
|
3578
|
+
const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
|
|
3579
|
+
if (!lastChar || !prevReq.test(lastChar)) return false;
|
|
3580
|
+
}
|
|
3581
|
+
return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
|
|
3582
|
+
}
|
|
3583
|
+
return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
|
|
3180
3584
|
};
|
|
3181
3585
|
};
|
|
3182
3586
|
/**
|
|
@@ -3212,10 +3616,10 @@ const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule,
|
|
|
3212
3616
|
/**
|
|
3213
3617
|
* Processes matches for all fast-fuzzy rules at a specific line start.
|
|
3214
3618
|
*/
|
|
3215
|
-
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard,
|
|
3619
|
+
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
|
|
3216
3620
|
for (const ffRule of fastFuzzyRules) {
|
|
3217
3621
|
if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
|
|
3218
|
-
if (
|
|
3622
|
+
if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
|
|
3219
3623
|
attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
|
|
3220
3624
|
}
|
|
3221
3625
|
};
|
|
@@ -3230,19 +3634,17 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
|
|
|
3230
3634
|
currentBoundary = pageMap.boundaries[boundaryIdx];
|
|
3231
3635
|
}
|
|
3232
3636
|
};
|
|
3233
|
-
const isPageStart = (offset) => offset === currentBoundary?.start;
|
|
3234
3637
|
for (let lineStart = 0; lineStart <= matchContent.length;) {
|
|
3235
3638
|
advanceBoundaryTo(lineStart);
|
|
3236
3639
|
const pageId = currentBoundary?.id ?? 0;
|
|
3237
3640
|
if (lineStart >= matchContent.length) break;
|
|
3238
|
-
processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard,
|
|
3641
|
+
processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
|
|
3239
3642
|
const nextNl = matchContent.indexOf("\n", lineStart);
|
|
3240
3643
|
if (nextNl === -1) break;
|
|
3241
3644
|
lineStart = nextNl + 1;
|
|
3242
3645
|
}
|
|
3243
3646
|
return splitPointsByRule;
|
|
3244
3647
|
};
|
|
3245
|
-
|
|
3246
3648
|
//#endregion
|
|
3247
3649
|
//#region src/segmentation/split-point-helpers.ts
|
|
3248
3650
|
const MAX_REGEX_ITERATIONS = 1e5;
|
|
@@ -3256,7 +3658,7 @@ const buildContentOffsets = (match, ruleInfo) => {
|
|
|
3256
3658
|
if (!ruleInfo.usesLineStartsAfter) return {};
|
|
3257
3659
|
const captured = match.groups?.[`${ruleInfo.prefix}__content`];
|
|
3258
3660
|
if (captured === void 0) return {};
|
|
3259
|
-
return { contentStartOffset: (match.groups?.[ruleInfo.prefix]
|
|
3661
|
+
return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
|
|
3260
3662
|
};
|
|
3261
3663
|
const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
|
|
3262
3664
|
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
@@ -3271,7 +3673,32 @@ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
|
3271
3673
|
wordIndex
|
|
3272
3674
|
};
|
|
3273
3675
|
};
|
|
3676
|
+
const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
|
|
3677
|
+
const arr = splitPointsByRule.get(originalIndex);
|
|
3678
|
+
if (!arr) {
|
|
3679
|
+
splitPointsByRule.set(originalIndex, [point]);
|
|
3680
|
+
return;
|
|
3681
|
+
}
|
|
3682
|
+
arr.push(point);
|
|
3683
|
+
};
|
|
3684
|
+
/**
|
|
3685
|
+
* Executes a combined regex over the content for combinable rules and records
|
|
3686
|
+
* any resulting split points into `splitPointsByRule`.
|
|
3687
|
+
*
|
|
3688
|
+
* This function mutates `splitPointsByRule` in place and throws if the regex
|
|
3689
|
+
* iteration guard is exceeded.
|
|
3690
|
+
*
|
|
3691
|
+
* @param matchContent - Concatenated content being segmented
|
|
3692
|
+
* @param combinableRules - Rules that can be combined into a single alternation
|
|
3693
|
+
* @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
|
|
3694
|
+
* @param pageMap - Page boundary mapping utilities for the content
|
|
3695
|
+
* @param passesPageStartGuard - Callback that decides whether a match is allowed
|
|
3696
|
+
* @param splitPointsByRule - Mutable map collecting split points by rule index
|
|
3697
|
+
* @param logger - Optional logger for iteration diagnostics
|
|
3698
|
+
* @returns Nothing; results are written into `splitPointsByRule`
|
|
3699
|
+
*/
|
|
3274
3700
|
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
|
|
3701
|
+
assertCombinedRuleAlignment(combinableRules, ruleRegexes);
|
|
3275
3702
|
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
3276
3703
|
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
3277
3704
|
logger?.debug?.("[segmenter] combined regex built", {
|
|
@@ -3286,19 +3713,29 @@ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, page
|
|
|
3286
3713
|
iterations,
|
|
3287
3714
|
position: m.index
|
|
3288
3715
|
});
|
|
3289
|
-
|
|
3290
|
-
if (matchedIndex !== -1) {
|
|
3291
|
-
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
3292
|
-
if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
|
|
3293
|
-
const arr = splitPointsByRule.get(originalIndex);
|
|
3294
|
-
if (!arr) splitPointsByRule.set(originalIndex, [createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex])]);
|
|
3295
|
-
else arr.push(createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex]));
|
|
3296
|
-
}
|
|
3297
|
-
}
|
|
3716
|
+
processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
|
|
3298
3717
|
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
3299
3718
|
m = combinedRegex.exec(matchContent);
|
|
3300
3719
|
}
|
|
3301
3720
|
};
|
|
3721
|
+
const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
|
|
3722
|
+
if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
|
|
3723
|
+
for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
|
|
3724
|
+
};
|
|
3725
|
+
const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
|
|
3726
|
+
const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
|
|
3727
|
+
if (matchedIndex === -1) return;
|
|
3728
|
+
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
3729
|
+
if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
|
|
3730
|
+
addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
|
|
3731
|
+
};
|
|
3732
|
+
/**
|
|
3733
|
+
* Builds compiled regex metadata for each combinable rule while preserving the
|
|
3734
|
+
* prefix used to identify the matching branch inside a combined alternation.
|
|
3735
|
+
*
|
|
3736
|
+
* @param combinableRules - Rules eligible for combined-regex processing
|
|
3737
|
+
* @returns Rule regex metadata aligned with the input order
|
|
3738
|
+
*/
|
|
3302
3739
|
const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
|
|
3303
3740
|
const built = buildRuleRegex(rule, prefix);
|
|
3304
3741
|
return {
|
|
@@ -3307,6 +3744,18 @@ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefi
|
|
|
3307
3744
|
source: `(?<${prefix}>${built.regex.source})`
|
|
3308
3745
|
};
|
|
3309
3746
|
});
|
|
3747
|
+
/**
|
|
3748
|
+
* Processes a standalone rule by matching it independently and appending its
|
|
3749
|
+
* resulting split points into `splitPointsByRule`.
|
|
3750
|
+
*
|
|
3751
|
+
* @param rule - The standalone split rule to evaluate
|
|
3752
|
+
* @param ruleIndex - Original rule index in the caller's rules array
|
|
3753
|
+
* @param matchContent - Concatenated content being segmented
|
|
3754
|
+
* @param pageMap - Page boundary mapping utilities for the content
|
|
3755
|
+
* @param passesPageStartGuard - Callback that decides whether a match is allowed
|
|
3756
|
+
* @param splitPointsByRule - Mutable map collecting split points by rule index
|
|
3757
|
+
* @returns Nothing; results are written into `splitPointsByRule`
|
|
3758
|
+
*/
|
|
3310
3759
|
const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
|
|
3311
3760
|
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
3312
3761
|
const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
|
|
@@ -3341,6 +3790,15 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
|
|
|
3341
3790
|
}
|
|
3342
3791
|
return matches;
|
|
3343
3792
|
};
|
|
3793
|
+
/**
|
|
3794
|
+
* Applies per-rule occurrence filtering and optional debug metadata patches to
|
|
3795
|
+
* the collected split points.
|
|
3796
|
+
*
|
|
3797
|
+
* @param rules - Full rule list in original order
|
|
3798
|
+
* @param splitPointsByRule - Split points grouped by originating rule index
|
|
3799
|
+
* @param debugMetaKey - Optional metadata key used for debug provenance patches
|
|
3800
|
+
* @returns Flattened split points after occurrence filtering and debug merging
|
|
3801
|
+
*/
|
|
3344
3802
|
const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
|
|
3345
3803
|
const result = [];
|
|
3346
3804
|
rules.forEach((rule, index) => {
|
|
@@ -3358,7 +3816,6 @@ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
|
|
|
3358
3816
|
});
|
|
3359
3817
|
return result;
|
|
3360
3818
|
};
|
|
3361
|
-
|
|
3362
3819
|
//#endregion
|
|
3363
3820
|
//#region src/segmentation/segmenter.ts
|
|
3364
3821
|
/**
|
|
@@ -3432,10 +3889,30 @@ const dedupeSplitPoints = (splitPoints) => {
|
|
|
3432
3889
|
const byIndex = /* @__PURE__ */ new Map();
|
|
3433
3890
|
for (const p of splitPoints) {
|
|
3434
3891
|
const existing = byIndex.get(p.index);
|
|
3435
|
-
if (!existing
|
|
3892
|
+
if (!existing) {
|
|
3893
|
+
byIndex.set(p.index, p);
|
|
3894
|
+
continue;
|
|
3895
|
+
}
|
|
3896
|
+
byIndex.set(p.index, mergeSplitPoints(existing, p));
|
|
3436
3897
|
}
|
|
3437
3898
|
return [...byIndex.values()].sort((a, b) => a.index - b.index);
|
|
3438
3899
|
};
|
|
3900
|
+
const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
|
|
3901
|
+
const mergeRecord = (existing, incoming) => existing || incoming ? {
|
|
3902
|
+
...existing ?? {},
|
|
3903
|
+
...incoming ?? {}
|
|
3904
|
+
} : void 0;
|
|
3905
|
+
const mergeSplitPoints = (existing, incoming) => {
|
|
3906
|
+
const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
|
|
3907
|
+
const fallback = preferred === incoming ? existing : incoming;
|
|
3908
|
+
return {
|
|
3909
|
+
...fallback,
|
|
3910
|
+
...preferred,
|
|
3911
|
+
contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
|
|
3912
|
+
meta: mergeRecord(existing.meta, incoming.meta),
|
|
3913
|
+
namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
|
|
3914
|
+
};
|
|
3915
|
+
};
|
|
3439
3916
|
/**
|
|
3440
3917
|
* If no structural rules produced segments, create a single segment spanning all pages.
|
|
3441
3918
|
* This allows breakpoint processing to still run.
|
|
@@ -3468,7 +3945,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey,
|
|
|
3468
3945
|
});
|
|
3469
3946
|
const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
|
|
3470
3947
|
if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
|
|
3471
|
-
for (const rule of standaloneRules) processStandaloneRule(rule,
|
|
3948
|
+
for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
|
|
3472
3949
|
return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
|
|
3473
3950
|
};
|
|
3474
3951
|
/**
|
|
@@ -3508,7 +3985,7 @@ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
|
|
|
3508
3985
|
* @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
|
|
3509
3986
|
*/
|
|
3510
3987
|
const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
|
|
3511
|
-
if (!content
|
|
3988
|
+
if (!content?.includes("\n")) return content;
|
|
3512
3989
|
if (pageJoiner === "newline") return content;
|
|
3513
3990
|
const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
|
|
3514
3991
|
if (breaksInRange.length === 0) return content;
|
|
@@ -3616,16 +4093,23 @@ const segmentPages = (pages, options) => {
|
|
|
3616
4093
|
* @returns Array of segment objects
|
|
3617
4094
|
*/
|
|
3618
4095
|
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
4096
|
+
const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
|
|
4097
|
+
const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
|
|
4098
|
+
const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
|
|
4099
|
+
const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
|
|
4100
|
+
...meta,
|
|
4101
|
+
...namedCaptures
|
|
4102
|
+
} : void 0;
|
|
3619
4103
|
/**
|
|
3620
4104
|
* Creates a single segment from a content range.
|
|
3621
4105
|
*/
|
|
3622
4106
|
const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
|
|
3623
|
-
const actualStart = start
|
|
4107
|
+
const actualStart = getActualStart(start, contentStartOffset);
|
|
3624
4108
|
const sliced = content.slice(actualStart, end);
|
|
3625
|
-
let text =
|
|
4109
|
+
let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
|
|
3626
4110
|
if (!text) return null;
|
|
3627
4111
|
if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
|
|
3628
|
-
const adjustedStart = actualStart
|
|
4112
|
+
const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
|
|
3629
4113
|
const from = pageMap.getId(adjustedStart);
|
|
3630
4114
|
const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
|
|
3631
4115
|
const seg = {
|
|
@@ -3633,10 +4117,8 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
|
3633
4117
|
from
|
|
3634
4118
|
};
|
|
3635
4119
|
if (to !== from) seg.to = to;
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
...namedCaptures
|
|
3639
|
-
};
|
|
4120
|
+
const mergedMeta = applyMeta(meta, namedCaptures);
|
|
4121
|
+
if (mergedMeta) seg.meta = mergedMeta;
|
|
3640
4122
|
return seg;
|
|
3641
4123
|
};
|
|
3642
4124
|
/**
|
|
@@ -3668,659 +4150,6 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
|
3668
4150
|
}
|
|
3669
4151
|
return [...segments, ...createSegmentsFromSplitPoints()];
|
|
3670
4152
|
};
|
|
3671
|
-
|
|
3672
|
-
//#endregion
|
|
3673
|
-
//#region src/recovery.ts
|
|
3674
|
-
const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
|
|
3675
|
-
const normalizeForCompare = (s, mode) => {
|
|
3676
|
-
if (mode === "none") return s;
|
|
3677
|
-
let out = s;
|
|
3678
|
-
if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
|
|
3679
|
-
out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
|
|
3680
|
-
return out;
|
|
3681
|
-
};
|
|
3682
|
-
const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
|
|
3683
|
-
const buildFixedOptions = (options, selectedRuleIndices) => {
|
|
3684
|
-
const fixedRules = (options.rules ?? []).map((r, idx) => {
|
|
3685
|
-
if (!selectedRuleIndices.has(idx)) return r;
|
|
3686
|
-
if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
|
|
3687
|
-
const { lineStartsAfter, ...rest } = r;
|
|
3688
|
-
return {
|
|
3689
|
-
...rest,
|
|
3690
|
-
lineStartsWith: lineStartsAfter
|
|
3691
|
-
};
|
|
3692
|
-
});
|
|
3693
|
-
return {
|
|
3694
|
-
...options,
|
|
3695
|
-
rules: fixedRules
|
|
3696
|
-
};
|
|
3697
|
-
};
|
|
3698
|
-
const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
|
|
3699
|
-
const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
|
|
3700
|
-
const parts = [];
|
|
3701
|
-
for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
|
|
3702
|
-
const matchContent = parts.join("\n");
|
|
3703
|
-
if (pageJoiner === "newline") return {
|
|
3704
|
-
matchContent,
|
|
3705
|
-
outputContent: matchContent
|
|
3706
|
-
};
|
|
3707
|
-
return {
|
|
3708
|
-
matchContent,
|
|
3709
|
-
outputContent: parts.join(" ")
|
|
3710
|
-
};
|
|
3711
|
-
};
|
|
3712
|
-
const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
|
|
3713
|
-
const rules = options.rules ?? [];
|
|
3714
|
-
const compiled = [];
|
|
3715
|
-
for (const idx of selectedRuleIndices) {
|
|
3716
|
-
const r = rules[idx];
|
|
3717
|
-
if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
|
|
3718
|
-
const { lineStartsAfter, ...rest } = r;
|
|
3719
|
-
const built = buildRuleRegex({
|
|
3720
|
-
...rest,
|
|
3721
|
-
lineStartsWith: lineStartsAfter
|
|
3722
|
-
});
|
|
3723
|
-
compiled.push({
|
|
3724
|
-
ruleIndex: idx,
|
|
3725
|
-
startsWithRegex: new RegExp(built.regex.source, "mu")
|
|
3726
|
-
});
|
|
3727
|
-
}
|
|
3728
|
-
return compiled;
|
|
3729
|
-
};
|
|
3730
|
-
const findUniqueAnchorPos = (outputContent, segmentContent) => {
|
|
3731
|
-
for (const len of [
|
|
3732
|
-
80,
|
|
3733
|
-
60,
|
|
3734
|
-
40,
|
|
3735
|
-
30,
|
|
3736
|
-
20,
|
|
3737
|
-
15
|
|
3738
|
-
]) {
|
|
3739
|
-
const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
|
|
3740
|
-
if (!needle.trim()) continue;
|
|
3741
|
-
const first = outputContent.indexOf(needle);
|
|
3742
|
-
if (first === -1) continue;
|
|
3743
|
-
if (outputContent.indexOf(needle, first + 1) === -1) return first;
|
|
3744
|
-
}
|
|
3745
|
-
return null;
|
|
3746
|
-
};
|
|
3747
|
-
const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
|
|
3748
|
-
const line = matchContent.slice(lineStart);
|
|
3749
|
-
for (const mr of compiledMistaken) {
|
|
3750
|
-
mr.startsWithRegex.lastIndex = 0;
|
|
3751
|
-
const m = mr.startsWithRegex.exec(line);
|
|
3752
|
-
if (!m || m.index !== 0) continue;
|
|
3753
|
-
const markerMatch = m[0];
|
|
3754
|
-
const markerEnd = lineStart + markerMatch.length;
|
|
3755
|
-
if (anchorPos < markerEnd) continue;
|
|
3756
|
-
const gap = matchContent.slice(markerEnd, anchorPos);
|
|
3757
|
-
const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
|
|
3758
|
-
if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
|
|
3759
|
-
return { prefix: recoveredPrefix };
|
|
3760
|
-
}
|
|
3761
|
-
return { reason: "no selected marker pattern matched at anchored line start" };
|
|
3762
|
-
};
|
|
3763
|
-
const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
|
|
3764
|
-
const fromIdx = pageIdToIndex.get(segment.from);
|
|
3765
|
-
const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
|
|
3766
|
-
if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
|
|
3767
|
-
kind: "unresolved",
|
|
3768
|
-
reason: "segment page range not found in pages"
|
|
3769
|
-
};
|
|
3770
|
-
const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
|
|
3771
|
-
if (!segment.content) return {
|
|
3772
|
-
kind: "unresolved",
|
|
3773
|
-
reason: "empty segment content"
|
|
3774
|
-
};
|
|
3775
|
-
const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
|
|
3776
|
-
if (anchorPos === null) return {
|
|
3777
|
-
kind: "unresolved",
|
|
3778
|
-
reason: "could not uniquely anchor segment content in page range"
|
|
3779
|
-
};
|
|
3780
|
-
const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
|
|
3781
|
-
const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
|
|
3782
|
-
if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
|
|
3783
|
-
kind: "unresolved",
|
|
3784
|
-
reason: found.reason
|
|
3785
|
-
};
|
|
3786
|
-
return {
|
|
3787
|
-
kind: "recovered",
|
|
3788
|
-
recoveredContent: `${found.prefix}${segment.content}`,
|
|
3789
|
-
recoveredPrefix: found.prefix
|
|
3790
|
-
};
|
|
3791
|
-
};
|
|
3792
|
-
const resolveRuleIndicesSelector = (rules, indicesIn) => {
|
|
3793
|
-
const errors = [];
|
|
3794
|
-
const indices = /* @__PURE__ */ new Set();
|
|
3795
|
-
for (const idx of indicesIn) {
|
|
3796
|
-
if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
|
|
3797
|
-
errors.push(`Selector index out of range: ${idx}`);
|
|
3798
|
-
continue;
|
|
3799
|
-
}
|
|
3800
|
-
const rule = rules[idx];
|
|
3801
|
-
if (!rule || !("lineStartsAfter" in rule)) {
|
|
3802
|
-
errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
|
|
3803
|
-
continue;
|
|
3804
|
-
}
|
|
3805
|
-
indices.add(idx);
|
|
3806
|
-
}
|
|
3807
|
-
return {
|
|
3808
|
-
errors,
|
|
3809
|
-
indices,
|
|
3810
|
-
warnings: []
|
|
3811
|
-
};
|
|
3812
|
-
};
|
|
3813
|
-
const resolvePredicateSelector = (rules, predicate) => {
|
|
3814
|
-
const errors = [];
|
|
3815
|
-
const warnings = [];
|
|
3816
|
-
const indices = /* @__PURE__ */ new Set();
|
|
3817
|
-
rules.forEach((r, i) => {
|
|
3818
|
-
try {
|
|
3819
|
-
if (!predicate(r, i)) return;
|
|
3820
|
-
if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
|
|
3821
|
-
indices.add(i);
|
|
3822
|
-
return;
|
|
3823
|
-
}
|
|
3824
|
-
warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
|
|
3825
|
-
} catch (e) {
|
|
3826
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
3827
|
-
errors.push(`Predicate threw at rule ${i}: ${msg}`);
|
|
3828
|
-
}
|
|
3829
|
-
});
|
|
3830
|
-
if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
|
|
3831
|
-
return {
|
|
3832
|
-
errors,
|
|
3833
|
-
indices,
|
|
3834
|
-
warnings
|
|
3835
|
-
};
|
|
3836
|
-
};
|
|
3837
|
-
const resolvePatternsSelector = (rules, patterns, matchMode) => {
|
|
3838
|
-
const errors = [];
|
|
3839
|
-
const warnings = [];
|
|
3840
|
-
const indices = /* @__PURE__ */ new Set();
|
|
3841
|
-
const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
|
|
3842
|
-
const targets = patterns.map(normalizePattern);
|
|
3843
|
-
for (let pi = 0; pi < patterns.length; pi++) {
|
|
3844
|
-
const rawPattern = patterns[pi];
|
|
3845
|
-
const pat = targets[pi];
|
|
3846
|
-
const matched = [];
|
|
3847
|
-
for (let i = 0; i < rules.length; i++) {
|
|
3848
|
-
const r = rules[i];
|
|
3849
|
-
if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
|
|
3850
|
-
if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
|
|
3851
|
-
}
|
|
3852
|
-
if (matched.length === 0) {
|
|
3853
|
-
errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
|
|
3854
|
-
continue;
|
|
3855
|
-
}
|
|
3856
|
-
if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
|
|
3857
|
-
matched.forEach((i) => {
|
|
3858
|
-
indices.add(i);
|
|
3859
|
-
});
|
|
3860
|
-
}
|
|
3861
|
-
return {
|
|
3862
|
-
errors,
|
|
3863
|
-
indices,
|
|
3864
|
-
warnings
|
|
3865
|
-
};
|
|
3866
|
-
};
|
|
3867
|
-
const resolveSelectorToRuleIndices = (options, selector) => {
|
|
3868
|
-
const rules = options.rules ?? [];
|
|
3869
|
-
if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
|
|
3870
|
-
if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
|
|
3871
|
-
return resolvePatternsSelector(rules, selector.patterns, selector.match);
|
|
3872
|
-
};
|
|
3873
|
-
const longestCommonSuffixLength = (a, b) => {
|
|
3874
|
-
const max = Math.min(a.length, b.length);
|
|
3875
|
-
let i = 0;
|
|
3876
|
-
while (i < max) {
|
|
3877
|
-
if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
|
|
3878
|
-
i++;
|
|
3879
|
-
}
|
|
3880
|
-
return i;
|
|
3881
|
-
};
|
|
3882
|
-
const AMBIGUITY_SCORE_GAP = 5;
|
|
3883
|
-
const scoreCandidate = (orig, fixed, normalizeMode) => {
|
|
3884
|
-
if (fixed.content === orig.content) return {
|
|
3885
|
-
fixedIndex: -1,
|
|
3886
|
-
kind: "exact",
|
|
3887
|
-
score: 100
|
|
3888
|
-
};
|
|
3889
|
-
if (fixed.content.endsWith(orig.content)) {
|
|
3890
|
-
const markerLen = fixed.content.length - orig.content.length;
|
|
3891
|
-
return {
|
|
3892
|
-
fixedIndex: -1,
|
|
3893
|
-
kind: "exact_suffix",
|
|
3894
|
-
score: 90 + Math.min(30, markerLen)
|
|
3895
|
-
};
|
|
3896
|
-
}
|
|
3897
|
-
if (normalizeMode !== "none") {
|
|
3898
|
-
const normFixed = normalizeForCompare(fixed.content, normalizeMode);
|
|
3899
|
-
const normOrig = normalizeForCompare(orig.content, normalizeMode);
|
|
3900
|
-
if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
|
|
3901
|
-
const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
|
|
3902
|
-
return {
|
|
3903
|
-
fixedIndex: -1,
|
|
3904
|
-
kind: "normalized_suffix",
|
|
3905
|
-
score: 70 + Math.floor(overlap * 20)
|
|
3906
|
-
};
|
|
3907
|
-
}
|
|
3908
|
-
}
|
|
3909
|
-
return null;
|
|
3910
|
-
};
|
|
3911
|
-
const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
|
|
3912
|
-
const warnings = [...reportBase.warnings];
|
|
3913
|
-
warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
|
|
3914
|
-
const details = segments.map((s, i) => {
|
|
3915
|
-
const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
|
|
3916
|
-
return {
|
|
3917
|
-
from: s.from,
|
|
3918
|
-
notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
|
|
3919
|
-
originalStartPreview: preview(s.content),
|
|
3920
|
-
segmentIndex: i,
|
|
3921
|
-
status,
|
|
3922
|
-
strategy: "none",
|
|
3923
|
-
to: s.to
|
|
3924
|
-
};
|
|
3925
|
-
});
|
|
3926
|
-
return {
|
|
3927
|
-
report: {
|
|
3928
|
-
...reportBase,
|
|
3929
|
-
details,
|
|
3930
|
-
summary: {
|
|
3931
|
-
mode,
|
|
3932
|
-
recovered: 0,
|
|
3933
|
-
totalSegments: segments.length,
|
|
3934
|
-
unchanged: segments.length,
|
|
3935
|
-
unresolved: selectorErrors.length ? segments.length : 0
|
|
3936
|
-
},
|
|
3937
|
-
warnings
|
|
3938
|
-
},
|
|
3939
|
-
segments
|
|
3940
|
-
};
|
|
3941
|
-
};
|
|
3942
|
-
const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
|
|
3943
|
-
const recoveredAtIndex = /* @__PURE__ */ new Map();
|
|
3944
|
-
const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
|
|
3945
|
-
if (mode !== "best_effort_then_rerun") return {
|
|
3946
|
-
recoveredAtIndex,
|
|
3947
|
-
recoveredDetailAtIndex
|
|
3948
|
-
};
|
|
3949
|
-
const pageIdToIndex = buildPageIdToIndex(pages);
|
|
3950
|
-
const pageJoiner = options.pageJoiner ?? "space";
|
|
3951
|
-
const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
|
|
3952
|
-
for (let i = 0; i < segments.length; i++) {
|
|
3953
|
-
const orig = segments[i];
|
|
3954
|
-
const r = tryBestEffortRecoverOneSegment(orig, pages, pageIdToIndex, compiledMistaken, pageJoiner);
|
|
3955
|
-
if (r.kind !== "recovered") continue;
|
|
3956
|
-
const seg = {
|
|
3957
|
-
...orig,
|
|
3958
|
-
content: r.recoveredContent
|
|
3959
|
-
};
|
|
3960
|
-
recoveredAtIndex.set(i, seg);
|
|
3961
|
-
recoveredDetailAtIndex.set(i, {
|
|
3962
|
-
from: orig.from,
|
|
3963
|
-
originalStartPreview: preview(orig.content),
|
|
3964
|
-
recoveredPrefixPreview: preview(r.recoveredPrefix),
|
|
3965
|
-
recoveredStartPreview: preview(seg.content),
|
|
3966
|
-
segmentIndex: i,
|
|
3967
|
-
status: "recovered",
|
|
3968
|
-
strategy: "stage1",
|
|
3969
|
-
to: orig.to
|
|
3970
|
-
});
|
|
3971
|
-
}
|
|
3972
|
-
return {
|
|
3973
|
-
recoveredAtIndex,
|
|
3974
|
-
recoveredDetailAtIndex
|
|
3975
|
-
};
|
|
3976
|
-
};
|
|
3977
|
-
const buildFixedBuckets = (fixedSegments) => {
|
|
3978
|
-
const buckets = /* @__PURE__ */ new Map();
|
|
3979
|
-
for (let i = 0; i < fixedSegments.length; i++) {
|
|
3980
|
-
const k = segmentRangeKey(fixedSegments[i]);
|
|
3981
|
-
const arr = buckets.get(k);
|
|
3982
|
-
if (!arr) buckets.set(k, [i]);
|
|
3983
|
-
else arr.push(i);
|
|
3984
|
-
}
|
|
3985
|
-
return buckets;
|
|
3986
|
-
};
|
|
3987
|
-
const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
|
|
3988
|
-
let best = null;
|
|
3989
|
-
let secondBestScore = -Infinity;
|
|
3990
|
-
for (const fixedIdx of candidates) {
|
|
3991
|
-
if (usedFixed.has(fixedIdx)) continue;
|
|
3992
|
-
const fixed = fixedSegments[fixedIdx];
|
|
3993
|
-
const scored = scoreCandidate(orig, fixed, normalizeCompare);
|
|
3994
|
-
if (!scored) continue;
|
|
3995
|
-
const candidateScore = scored.score;
|
|
3996
|
-
if (!best || candidateScore > best.score) {
|
|
3997
|
-
secondBestScore = best?.score ?? -Infinity;
|
|
3998
|
-
best = {
|
|
3999
|
-
fixedIdx,
|
|
4000
|
-
score: candidateScore
|
|
4001
|
-
};
|
|
4002
|
-
} else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
|
|
4003
|
-
}
|
|
4004
|
-
if (!best) return { kind: "none" };
|
|
4005
|
-
if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
|
|
4006
|
-
return {
|
|
4007
|
-
fixedIdx: best.fixedIdx,
|
|
4008
|
-
kind: "match"
|
|
4009
|
-
};
|
|
4010
|
-
};
|
|
4011
|
-
const detailUnresolved = (orig, segmentIndex, notes) => ({
|
|
4012
|
-
from: orig.from,
|
|
4013
|
-
notes,
|
|
4014
|
-
originalStartPreview: preview(orig.content),
|
|
4015
|
-
segmentIndex,
|
|
4016
|
-
status: "unresolved_alignment",
|
|
4017
|
-
strategy: "rerun",
|
|
4018
|
-
to: orig.to
|
|
4019
|
-
});
|
|
4020
|
-
const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
|
|
4021
|
-
from: orig.from,
|
|
4022
|
-
notes,
|
|
4023
|
-
originalStartPreview: preview(orig.content),
|
|
4024
|
-
segmentIndex,
|
|
4025
|
-
status: "skipped_idempotent",
|
|
4026
|
-
strategy: "rerun",
|
|
4027
|
-
to: orig.to
|
|
4028
|
-
});
|
|
4029
|
-
const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
|
|
4030
|
-
let recoveredPrefixPreview;
|
|
4031
|
-
if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
|
|
4032
|
-
return {
|
|
4033
|
-
from: orig.from,
|
|
4034
|
-
originalStartPreview: preview(orig.content),
|
|
4035
|
-
recoveredPrefixPreview,
|
|
4036
|
-
recoveredStartPreview: preview(fixed.content),
|
|
4037
|
-
segmentIndex,
|
|
4038
|
-
status: "recovered",
|
|
4039
|
-
strategy: "rerun",
|
|
4040
|
-
to: orig.to
|
|
4041
|
-
};
|
|
4042
|
-
};
|
|
4043
|
-
const mergeWithRerun = (params) => {
|
|
4044
|
-
const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
|
|
4045
|
-
const usedFixed = /* @__PURE__ */ new Set();
|
|
4046
|
-
const out = [];
|
|
4047
|
-
const details = [];
|
|
4048
|
-
let recovered = 0;
|
|
4049
|
-
let unresolved = 0;
|
|
4050
|
-
let unchanged = 0;
|
|
4051
|
-
for (let i = 0; i < originalSegments.length; i++) {
|
|
4052
|
-
const stage1Recovered = stage1RecoveredAtIndex.get(i);
|
|
4053
|
-
if (stage1Recovered) {
|
|
4054
|
-
out.push(stage1Recovered);
|
|
4055
|
-
recovered++;
|
|
4056
|
-
details.push(recoveredDetailAtIndex.get(i) ?? {
|
|
4057
|
-
from: stage1Recovered.from,
|
|
4058
|
-
originalStartPreview: preview(originalSegments[i].content),
|
|
4059
|
-
recoveredStartPreview: preview(stage1Recovered.content),
|
|
4060
|
-
segmentIndex: i,
|
|
4061
|
-
status: "recovered",
|
|
4062
|
-
strategy: "stage1",
|
|
4063
|
-
to: stage1Recovered.to
|
|
4064
|
-
});
|
|
4065
|
-
continue;
|
|
4066
|
-
}
|
|
4067
|
-
const orig = originalSegments[i];
|
|
4068
|
-
const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
|
|
4069
|
-
if (best.kind === "none") {
|
|
4070
|
-
out.push(orig);
|
|
4071
|
-
unresolved++;
|
|
4072
|
-
details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
|
|
4073
|
-
continue;
|
|
4074
|
-
}
|
|
4075
|
-
if (best.kind === "ambiguous") {
|
|
4076
|
-
out.push(orig);
|
|
4077
|
-
unresolved++;
|
|
4078
|
-
details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
|
|
4079
|
-
continue;
|
|
4080
|
-
}
|
|
4081
|
-
usedFixed.add(best.fixedIdx);
|
|
4082
|
-
const fixed = fixedSegments[best.fixedIdx];
|
|
4083
|
-
if (fixed.content === orig.content) {
|
|
4084
|
-
out.push(orig);
|
|
4085
|
-
unchanged++;
|
|
4086
|
-
details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
|
|
4087
|
-
continue;
|
|
4088
|
-
}
|
|
4089
|
-
out.push({
|
|
4090
|
-
...orig,
|
|
4091
|
-
content: fixed.content
|
|
4092
|
-
});
|
|
4093
|
-
recovered++;
|
|
4094
|
-
details.push(detailRecoveredRerun(orig, fixed, i));
|
|
4095
|
-
}
|
|
4096
|
-
return {
|
|
4097
|
-
details,
|
|
4098
|
-
segments: out,
|
|
4099
|
-
summary: {
|
|
4100
|
-
recovered,
|
|
4101
|
-
unchanged,
|
|
4102
|
-
unresolved
|
|
4103
|
-
}
|
|
4104
|
-
};
|
|
4105
|
-
};
|
|
4106
|
-
function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
|
|
4107
|
-
const mode = opts?.mode ?? "rerun_only";
|
|
4108
|
-
const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
|
|
4109
|
-
const resolved = resolveSelectorToRuleIndices(options, selector);
|
|
4110
|
-
const reportBase = {
|
|
4111
|
-
byRun: void 0,
|
|
4112
|
-
errors: resolved.errors,
|
|
4113
|
-
warnings: resolved.warnings
|
|
4114
|
-
};
|
|
4115
|
-
if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
|
|
4116
|
-
const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
|
|
4117
|
-
const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
|
|
4118
|
-
const merged = mergeWithRerun({
|
|
4119
|
-
fixedBuckets: buildFixedBuckets(fixedSegments),
|
|
4120
|
-
fixedSegments,
|
|
4121
|
-
normalizeCompare,
|
|
4122
|
-
originalSegments: segments,
|
|
4123
|
-
recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
|
|
4124
|
-
stage1RecoveredAtIndex: stage1.recoveredAtIndex
|
|
4125
|
-
});
|
|
4126
|
-
return {
|
|
4127
|
-
report: {
|
|
4128
|
-
...reportBase,
|
|
4129
|
-
details: merged.details,
|
|
4130
|
-
summary: {
|
|
4131
|
-
mode,
|
|
4132
|
-
recovered: merged.summary.recovered,
|
|
4133
|
-
totalSegments: segments.length,
|
|
4134
|
-
unchanged: merged.summary.unchanged,
|
|
4135
|
-
unresolved: merged.summary.unresolved
|
|
4136
|
-
}
|
|
4137
|
-
},
|
|
4138
|
-
segments: merged.segments
|
|
4139
|
-
};
|
|
4140
|
-
}
|
|
4141
|
-
function recoverMistakenMarkersForRuns(runs, opts) {
|
|
4142
|
-
const allSegments = [];
|
|
4143
|
-
const byRun = [];
|
|
4144
|
-
const details = [];
|
|
4145
|
-
const warnings = [];
|
|
4146
|
-
const errors = [];
|
|
4147
|
-
let recovered = 0;
|
|
4148
|
-
let unchanged = 0;
|
|
4149
|
-
let unresolved = 0;
|
|
4150
|
-
let offset = 0;
|
|
4151
|
-
for (let i = 0; i < runs.length; i++) {
|
|
4152
|
-
const run = runs[i];
|
|
4153
|
-
const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
|
|
4154
|
-
allSegments.push(...res.segments);
|
|
4155
|
-
for (const d of res.report.details) details.push({
|
|
4156
|
-
...d,
|
|
4157
|
-
segmentIndex: d.segmentIndex + offset
|
|
4158
|
-
});
|
|
4159
|
-
offset += run.segments.length;
|
|
4160
|
-
recovered += res.report.summary.recovered;
|
|
4161
|
-
unchanged += res.report.summary.unchanged;
|
|
4162
|
-
unresolved += res.report.summary.unresolved;
|
|
4163
|
-
warnings.push(...res.report.warnings);
|
|
4164
|
-
errors.push(...res.report.errors);
|
|
4165
|
-
byRun.push({
|
|
4166
|
-
recovered: res.report.summary.recovered,
|
|
4167
|
-
runIndex: i,
|
|
4168
|
-
totalSegments: run.segments.length,
|
|
4169
|
-
unresolved: res.report.summary.unresolved
|
|
4170
|
-
});
|
|
4171
|
-
}
|
|
4172
|
-
return {
|
|
4173
|
-
report: {
|
|
4174
|
-
byRun,
|
|
4175
|
-
details,
|
|
4176
|
-
errors,
|
|
4177
|
-
summary: {
|
|
4178
|
-
mode: opts?.mode ?? "rerun_only",
|
|
4179
|
-
recovered,
|
|
4180
|
-
totalSegments: offset,
|
|
4181
|
-
unchanged,
|
|
4182
|
-
unresolved
|
|
4183
|
-
},
|
|
4184
|
-
warnings
|
|
4185
|
-
},
|
|
4186
|
-
segments: allSegments
|
|
4187
|
-
};
|
|
4188
|
-
}
|
|
4189
|
-
|
|
4190
|
-
//#endregion
|
|
4191
|
-
//#region src/segmentation/pattern-validator.ts
|
|
4192
|
-
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
4193
|
-
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
4194
|
-
const buildBareTokenRegex = () => {
|
|
4195
|
-
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
|
|
4196
|
-
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
|
|
4197
|
-
};
|
|
4198
|
-
/**
|
|
4199
|
-
* Validates a single pattern for common issues.
|
|
4200
|
-
*/
|
|
4201
|
-
const validatePattern = (pattern, seenPatterns) => {
|
|
4202
|
-
if (!pattern.trim()) return {
|
|
4203
|
-
message: "Empty pattern is not allowed",
|
|
4204
|
-
type: "empty_pattern"
|
|
4205
|
-
};
|
|
4206
|
-
if (seenPatterns.has(pattern)) return {
|
|
4207
|
-
message: `Duplicate pattern: "${pattern}"`,
|
|
4208
|
-
pattern,
|
|
4209
|
-
type: "duplicate"
|
|
4210
|
-
};
|
|
4211
|
-
seenPatterns.add(pattern);
|
|
4212
|
-
TOKEN_INSIDE_BRACES.lastIndex = 0;
|
|
4213
|
-
for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
|
|
4214
|
-
const name = match[1];
|
|
4215
|
-
if (!KNOWN_TOKENS.has(name)) return {
|
|
4216
|
-
message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
|
|
4217
|
-
suggestion: "Check spelling or use a known token",
|
|
4218
|
-
token: name,
|
|
4219
|
-
type: "unknown_token"
|
|
4220
|
-
};
|
|
4221
|
-
}
|
|
4222
|
-
for (const match of pattern.matchAll(buildBareTokenRegex())) {
|
|
4223
|
-
const [full, name] = match;
|
|
4224
|
-
const idx = match.index;
|
|
4225
|
-
if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
|
|
4226
|
-
message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
|
|
4227
|
-
suggestion: `{{${full}}}`,
|
|
4228
|
-
token: name,
|
|
4229
|
-
type: "missing_braces"
|
|
4230
|
-
};
|
|
4231
|
-
}
|
|
4232
|
-
};
|
|
4233
|
-
/**
|
|
4234
|
-
* Validates an array of patterns, returning parallel array of issues.
|
|
4235
|
-
*/
|
|
4236
|
-
const validatePatternArray = (patterns) => {
|
|
4237
|
-
const seen = /* @__PURE__ */ new Set();
|
|
4238
|
-
const issues = patterns.map((p) => validatePattern(p, seen));
|
|
4239
|
-
return issues.some(Boolean) ? issues : void 0;
|
|
4240
|
-
};
|
|
4241
|
-
/**
|
|
4242
|
-
* Validates split rules for common pattern issues.
|
|
4243
|
-
*
|
|
4244
|
-
* Checks for:
|
|
4245
|
-
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
|
|
4246
|
-
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
|
|
4247
|
-
* - Duplicate patterns within the same rule
|
|
4248
|
-
*
|
|
4249
|
-
* @param rules - Array of split rules to validate
|
|
4250
|
-
* @returns Array parallel to input with validation results (undefined if no issues)
|
|
4251
|
-
*
|
|
4252
|
-
* @example
|
|
4253
|
-
* const issues = validateRules([
|
|
4254
|
-
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
|
|
4255
|
-
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
|
|
4256
|
-
* ]);
|
|
4257
|
-
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
|
|
4258
|
-
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
|
|
4259
|
-
*/
|
|
4260
|
-
const validateRules = (rules) => rules.map((rule) => {
|
|
4261
|
-
const result = {};
|
|
4262
|
-
let hasIssues = false;
|
|
4263
|
-
for (const key of [
|
|
4264
|
-
"lineStartsWith",
|
|
4265
|
-
"lineStartsAfter",
|
|
4266
|
-
"lineEndsWith"
|
|
4267
|
-
]) if (key in rule && rule[key]) {
|
|
4268
|
-
const issues = validatePatternArray(rule[key]);
|
|
4269
|
-
if (issues) {
|
|
4270
|
-
result[key] = issues;
|
|
4271
|
-
hasIssues = true;
|
|
4272
|
-
}
|
|
4273
|
-
}
|
|
4274
|
-
if ("template" in rule && rule.template !== void 0) {
|
|
4275
|
-
const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
|
|
4276
|
-
if (issue) {
|
|
4277
|
-
result.template = issue;
|
|
4278
|
-
hasIssues = true;
|
|
4279
|
-
}
|
|
4280
|
-
}
|
|
4281
|
-
return hasIssues ? result : void 0;
|
|
4282
|
-
});
|
|
4283
|
-
/**
|
|
4284
|
-
* Formats a validation result array into a list of human-readable error messages.
|
|
4285
|
-
*
|
|
4286
|
-
* Useful for displaying validation errors in UIs.
|
|
4287
|
-
*
|
|
4288
|
-
* @param results - The result array from `validateRules()`
|
|
4289
|
-
* @returns Array of formatted error strings
|
|
4290
|
-
*
|
|
4291
|
-
* @example
|
|
4292
|
-
* const issues = validateRules(rules);
|
|
4293
|
-
* const errors = formatValidationReport(issues);
|
|
4294
|
-
* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
|
|
4295
|
-
*/
|
|
4296
|
-
const formatValidationReport = (results) => results.flatMap((result, i) => {
|
|
4297
|
-
if (!result) return [];
|
|
4298
|
-
return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => {
|
|
4299
|
-
if (!issue) return null;
|
|
4300
|
-
const loc = `Rule ${i + 1}, ${type}`;
|
|
4301
|
-
if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
|
|
4302
|
-
if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
|
|
4303
|
-
if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
|
|
4304
|
-
return `${loc}: ${issue.message || issue.type}`;
|
|
4305
|
-
})).filter((msg) => msg !== null);
|
|
4306
|
-
});
|
|
4307
|
-
|
|
4308
|
-
//#endregion
|
|
4309
|
-
//#region src/validation/validation-constants.ts
|
|
4310
|
-
/**
|
|
4311
|
-
* Validation-specific constants
|
|
4312
|
-
*/
|
|
4313
|
-
/**
|
|
4314
|
-
* Limit for validation issue preview length (characters).
|
|
4315
|
-
*/
|
|
4316
|
-
const PREVIEW_LIMIT = 140;
|
|
4317
|
-
/**
|
|
4318
|
-
* Threshold for short segment content (characters).
|
|
4319
|
-
* Segments shorter than this will trigger a full-document search fallback
|
|
4320
|
-
* if not found in the expected window.
|
|
4321
|
-
*/
|
|
4322
|
-
const FULL_SEARCH_THRESHOLD = 500;
|
|
4323
|
-
|
|
4324
4153
|
//#endregion
|
|
4325
4154
|
//#region src/validation/validate-segments.ts
|
|
4326
4155
|
/**
|
|
@@ -4329,8 +4158,8 @@ const FULL_SEARCH_THRESHOLD = 500;
|
|
|
4329
4158
|
*/
|
|
4330
4159
|
const buildPreview = (text) => {
|
|
4331
4160
|
const normalized = text.replace(/\s+/g, " ").trim();
|
|
4332
|
-
if (normalized.length <=
|
|
4333
|
-
return `${normalized.slice(0,
|
|
4161
|
+
if (normalized.length <= 140) return normalized;
|
|
4162
|
+
return `${normalized.slice(0, 140)}...`;
|
|
4334
4163
|
};
|
|
4335
4164
|
/**
|
|
4336
4165
|
* Creates a lightweight snapshot of a segment for inclusion in validation checks.
|
|
@@ -4358,19 +4187,18 @@ const normalizePages = (pages, options) => {
|
|
|
4358
4187
|
*/
|
|
4359
4188
|
const buildJoinedContent = (pages, joiner) => {
|
|
4360
4189
|
const boundaries = [];
|
|
4361
|
-
const
|
|
4362
|
-
const joined = nonEmptyPages.map((p) => p.content).join(joiner);
|
|
4190
|
+
const joined = pages.map((p) => p.content).join(joiner);
|
|
4363
4191
|
let offset = 0;
|
|
4364
|
-
for (let i = 0; i <
|
|
4365
|
-
const content =
|
|
4192
|
+
for (let i = 0; i < pages.length; i++) {
|
|
4193
|
+
const content = pages[i].content;
|
|
4366
4194
|
const start = offset;
|
|
4367
|
-
const end = start + content.length
|
|
4195
|
+
const end = start + content.length;
|
|
4368
4196
|
boundaries.push({
|
|
4369
4197
|
end,
|
|
4370
|
-
id:
|
|
4198
|
+
id: pages[i].id,
|
|
4371
4199
|
start
|
|
4372
4200
|
});
|
|
4373
|
-
offset
|
|
4201
|
+
offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
|
|
4374
4202
|
}
|
|
4375
4203
|
return {
|
|
4376
4204
|
boundaries,
|
|
@@ -4561,7 +4389,7 @@ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, search
|
|
|
4561
4389
|
const bufferSize = 1e3;
|
|
4562
4390
|
const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
|
|
4563
4391
|
if (rawMatches.length === 0) {
|
|
4564
|
-
const threshold = validationOptions?.fullSearchThreshold ??
|
|
4392
|
+
const threshold = validationOptions?.fullSearchThreshold ?? 500;
|
|
4565
4393
|
if (content.length < threshold) {
|
|
4566
4394
|
const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
|
|
4567
4395
|
const validMatch = fullMatches.find((m) => {
|
|
@@ -4715,7 +4543,7 @@ const validateSegments = (pages, options, segments, validationOptions) => {
|
|
|
4715
4543
|
}
|
|
4716
4544
|
};
|
|
4717
4545
|
};
|
|
4718
|
-
|
|
4719
4546
|
//#endregion
|
|
4720
|
-
export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive,
|
|
4547
|
+
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
|
|
4548
|
+
|
|
4721
4549
|
//# sourceMappingURL=index.mjs.map
|