flappa-doormal 2.17.1 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +16 -39
- package/README.md +114 -63
- package/dist/index.d.mts +227 -76
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1034 -1147
- package/dist/index.mjs.map +1 -1
- package/package.json +9 -9
package/dist/index.mjs
CHANGED
@@ -1,141 +1,25 @@
-//#region src/utils/textUtils.ts
-/**
- * Normalizes line endings to Unix-style (`\n`).
- *
- * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
- * for consistent pattern matching across platforms.
- *
- * @param content - Raw content with potentially mixed line endings
- * @returns Content with all line endings normalized to `\n`
- */
-const normalizeLineEndings = (content) => {
-	return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
-};
-/**
- * Escapes regex metacharacters (parentheses and brackets) in template patterns,
- * but preserves content inside `{{...}}` token delimiters.
- *
- * This allows users to write intuitive patterns like `({{harf}}):` instead of
- * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
- * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
- *
- * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
- * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
- *
- * @example
- * escapeTemplateBrackets('({{harf}}): ')
- * // → '\\({{harf}}\\): '
- *
- * @example
- * escapeTemplateBrackets('[{{raqm}}] ')
- * // → '\\[{{raqm}}\\] '
- *
- * @example
- * escapeTemplateBrackets('{{harf}}')
- * // → '{{harf}}' (unchanged - no brackets outside tokens)
- */
-const escapeTemplateBrackets = (pattern) => {
-	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
-};
+//#region src/segmentation/tokens.ts
 /**
- *
+ * Arabic base letters used by low-level dictionary-style regex helpers.
  *
- *
- * -
- * -
- * - U+064D: ٍ (kasratan - double kasra)
- * - U+064E: َ (fatha - short a)
- * - U+064F: ُ (damma - short u)
- * - U+0650: ِ (kasra - short i)
- * - U+0651: ّ (shadda - gemination)
- * - U+0652: ْ (sukun - no vowel)
- *
- * @internal
+ * This is intentionally broader than `{{harf}}`:
+ * - includes standalone hamza `ء`
+ * - stays as a raw regex fragment rather than a template token
  */
-const
+const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
 /**
- * Groups of equivalent Arabic characters.
- *
- * Characters within the same group are considered equivalent for matching purposes.
- * This handles common variations in Arabic text where different characters are
- * used interchangeably or have the same underlying meaning.
- *
- * Equivalence groups:
- * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
- * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
- * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
- *
- * @internal
+ * Arabic combining marks / annotation signs used by low-level regex helpers.
  */
-const EQUIV_GROUPS = [
-	[
-		"ا",
-		"آ",
-		"أ",
-		"إ"
-	],
-	["ة", "ه"],
-	["ى", "ي"]
-];
+const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
 /**
- * Escapes a string for safe inclusion in a regular expression.
- *
- * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
- *
- * @param s - Any string to escape
- * @returns String with regex metacharacters escaped
- *
- * @example
- * escapeRegex('hello.world') // → 'hello\\.world'
- * escapeRegex('[test]') // → '\\[test\\]'
- * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
+ * A single Arabic base letter followed by zero or more combining marks.
  */
-const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-const getEquivClass = (ch) => {
-	const group = EQUIV_GROUPS.find((g) => g.includes(ch));
-	return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
-};
-const normalizeArabicLight = (str) => {
-	return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
-};
-const makeDiacriticInsensitive = (text) => {
-	const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
-	return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
-};
-const isCombiningMarkOrSelector = (char) => {
-	if (!char) return false;
-	return /\p{M}/u.test(char) || char === "︎" || char === "️";
-};
-const isJoiner = (char) => char === "" || char === "";
+const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
 /**
- * Ensures the position does not split a grapheme cluster (surrogate pairs,
- * combining marks, or zero-width joiners / variation selectors).
- *
- * This is only used as a last-resort fallback when we are forced to split
- * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
+ * One or more Arabic letters, where each letter may carry combining marks.
  */
-const adjustForUnicodeBoundary = (content, position) => {
-	let adjusted = position;
-	while (adjusted > 0) {
-		const high = content.charCodeAt(adjusted - 1);
-		const low = content.charCodeAt(adjusted);
-		if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
-			adjusted -= 1;
-			continue;
-		}
-		const nextChar = content[adjusted];
-		const prevChar = content[adjusted - 1];
-		if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
-			adjusted -= 1;
-			continue;
-		}
-		break;
-	}
-	return adjusted;
-};
-
-//#endregion
-//#region src/segmentation/tokens.ts
+const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
+const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
 const RUMUZ_ATOM = `(?:${[
 	"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
 	"خت",
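The letter and marks constants introduced above compose mechanically: one base letter plus optional combining marks forms a letter atom, and one or more atoms form a word. A minimal standalone TypeScript sketch of that composition (illustrative only, not the package's internal module):

const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
// One letter that may carry diacritics/marks:
const LETTER = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
// One or more such letters:
const WORD = `(?:${LETTER})+`;
const re = new RegExp(`^${WORD}$`, "u");
console.log(re.test("كِتَاب")); // true - marks ride along with their letters
console.log(re.test("كتاب")); // true - bare letters also match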
@@ -166,15 +50,25 @@ const RUMUZ_ATOM = `(?:${[
 ].join("|")})`;
 const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
 const BASE_TOKENS = {
+	/** Chapter marker (باب). */
 	bab: "باب",
+	/** Basmala (بسم الله). Also matches ﷽. */
 	basmalah: ["بسم الله", "﷽"].join("|"),
+	/** Bullet point variants: `•`, `*`, `°`. */
 	bullet: "[•*°]",
+	/** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
 	dash: "[-–—ـ]",
+	/** Section marker (فصل / مسألة). */
 	fasl: ["مسألة", "فصل"].join("|"),
+	/** Single Arabic letter (أ-ي). Does NOT include diacritics. */
 	harf: "[أ-ي]",
-
+	/** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
+	harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
+	/** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
 	hr: "[-–—ـ_=]{5,}",
+	/** Book marker (كتاب). */
 	kitab: "كتاب",
+	/** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
 	naql: [
 		"حدثني",
 		"وأخبرنا",
@@ -186,33 +80,58 @@ const BASE_TOKENS = {
 		"وحدثني",
 		"وحدثنيه"
 	].join("|"),
+	/** Newline character. Useful for breakpoints that split on line boundaries. */
 	newline: "\\n",
+	/** Single ASCII digit (0-9). */
 	num: "\\d",
+	/** One or more ASCII digits (0-9)+. */
 	nums: "\\d+",
+	/** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
 	raqm: "[\\u0660-\\u0669]",
+	/** One or more Arabic-Indic digits (٠-٩)+. */
 	raqms: "[\\u0660-\\u0669]+",
+	/** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
 	rumuz: RUMUZ_BLOCK,
+	/** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
 	tarqim: "[.!?؟؛]"
 };
 /** Pre-defined token constants for use in patterns. */
 const Token = {
+	/** Chapter marker - باب */
 	BAB: "{{bab}}",
+	/** Basmala - بسم الله */
 	BASMALAH: "{{basmalah}}",
+	/** Bullet point variants */
 	BULLET: "{{bullet}}",
+	/** Dash variants (hyphen, en-dash, em-dash, tatweel) */
 	DASH: "{{dash}}",
+	/** Section marker - فصل / مسألة */
 	FASL: "{{fasl}}",
+	/** Single Arabic letter */
 	HARF: "{{harf}}",
+	/** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
 	HARFS: "{{harfs}}",
+	/** Horizontal rule / separator (repeated dashes) */
 	HR: "{{hr}}",
+	/** Book marker - كتاب */
 	KITAB: "{{kitab}}",
+	/** Hadith transmission phrases */
 	NAQL: "{{naql}}",
+	/** Newline character (for breakpoints) */
 	NEWLINE: "{{newline}}",
+	/** Single ASCII digit */
 	NUM: "{{num}}",
+	/** Composite: {{raqms}} {{dash}} (space) */
 	NUMBERED: "{{numbered}}",
+	/** One or more ASCII digits */
 	NUMS: "{{nums}}",
+	/** Single Arabic-Indic digit */
 	RAQM: "{{raqm}}",
+	/** One or more Arabic-Indic digits */
 	RAQMS: "{{raqms}}",
+	/** Source abbreviations (rijāl/takhrīj) */
 	RUMUZ: "{{rumuz}}",
+	/** Punctuation marks */
 	TARQIM: "{{tarqim}}"
 };
 /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
@@ -222,7 +141,9 @@ const withCapture = (token, name) => {
 	return `{{${match[1]}:${name}}}`;
 };
 /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
-const COMPOSITE_TOKENS = {
+const COMPOSITE_TOKENS = {
+	/** Common hadith numbering format: Arabic-Indic digits + dash + space. */
+	numbered: "{{raqms}} {{dash}} " };
 /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
 const expandCompositeTokensInTemplate = (template) => {
 	let out = template;
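The `Token` constants, `withCapture`, and the `{{numbered}}` composite above are plain string builders. A sketch of how they would combine when authoring patterns, assuming `Token` and `withCapture` are exported from the package entry point (the dist bundle above defines them at top level):

import { Token, withCapture } from "flappa-doormal";

const numbered = Token.NUMBERED; // "{{numbered}}" -> expands to "{{raqms}} {{dash}} "
const captured = withCapture(Token.RAQMS, "num"); // "{{raqms:num}}"
// e.g. a pattern for lines like "١٢٣ - ..." that captures the hadith number:
const pattern = `${captured} ${Token.DASH} `;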
@@ -473,11 +394,11 @@ const templateToRegex = (template) => {
  * Useful for documentation, validation, or building user interfaces
  * that show available tokens.
  *
- * @returns Array of token names (e.g., `['bab', '
+ * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
  *
  * @example
  * getAvailableTokens()
- * // → ['bab', '
+ * // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
  */
 const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
 /**
@@ -486,13 +407,13 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
  * without any expansion or capture group wrapping.
  *
- * @param tokenName - The token name to look up (e.g., 'raqms'
- * @returns The regex pattern string
+ * @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
+ * @returns The regex pattern string for that known token
  *
  * @example
  * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
  * getTokenPattern('dash') // → '[-–—ـ]'
- * getTokenPattern('
+ * getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
  */
 const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 /**
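Together these two helpers make the token inventory introspectable. A short sketch, assuming both are part of the public export surface:

import { getAvailableTokens, getTokenPattern } from "flappa-doormal";

// Print every supported {{token}} with its raw regex fragment,
// e.g. to document or validate user-supplied templates.
for (const name of getAvailableTokens()) {
	console.log(`{{${name}}} -> ${getTokenPattern(name)}`);
}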
@@ -571,7 +492,161 @@ const applyTokenMappings = (template, mappings) => {
 const stripTokenMappings = (template) => {
 	return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
 };
-
+//#endregion
+//#region src/utils/textUtils.ts
+/**
+ * Normalizes line endings to Unix-style (`\n`).
+ *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
+ *
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
+ */
+const normalizeLineEndings = (content) => {
+	return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+};
+/**
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
+ * but preserves content inside `{{...}}` token delimiters.
+ *
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+ *
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
+ *
+ * @example
+ * escapeTemplateBrackets('({{harf}}): ')
+ * // → '\\({{harf}}\\): '
+ *
+ * @example
+ * escapeTemplateBrackets('[{{raqm}}] ')
+ * // → '\\[{{raqm}}\\] '
+ *
+ * @example
+ * escapeTemplateBrackets('{{harf}}')
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
+ */
+const escapeTemplateBrackets = (pattern) => {
+	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
+};
+/**
+ * Character class matching all Arabic diacritics (Tashkeel/Harakat).
+ *
+ * Includes the following diacritical marks:
+ * - U+0640: ـ (tatweel / kashida)
+ * - U+064B: ً (fathatan - double fatha)
+ * - U+064C: ٌ (dammatan - double damma)
+ * - U+064D: ٍ (kasratan - double kasra)
+ * - U+064E: َ (fatha - short a)
+ * - U+064F: ُ (damma - short u)
+ * - U+0650: ِ (kasra - short i)
+ * - U+0651: ّ (shadda - gemination)
+ * - U+0652: ْ (sukun - no vowel)
+ *
+ * @internal
+ */
+const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
+/**
+ * Groups of equivalent Arabic characters.
+ *
+ * Characters within the same group are considered equivalent for matching purposes.
+ * This handles common variations in Arabic text where different characters are
+ * used interchangeably or have the same underlying meaning.
+ *
+ * Equivalence groups:
+ * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
+ * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
+ * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
+ *
+ * @internal
+ */
+const EQUIV_GROUPS = [
+	[
+		"ا",
+		"آ",
+		"أ",
+		"إ"
+	],
+	["ة", "ه"],
+	["ى", "ي"]
+];
+const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
+/**
+ * Escapes a string for safe inclusion in a regular expression.
+ *
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('[test]') // → '\\[test\\]'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
+ */
+const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+const getEquivClass = (ch) => {
+	const group = EQUIV_GROUPS.find((g) => g.includes(ch));
+	return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
+};
+const normalizeArabicLight = (str) => {
+	return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
+};
+/**
+ * Normalizes Arabic text for exact comparisons while tolerating common variants.
+ *
+ * This removes Arabic diacritics, collapses whitespace, removes joiners, and
+ * maps common equivalent letters to a shared canonical form:
+ * - ا/آ/أ/إ -> ا
+ * - ة/ه -> ه
+ * - ى/ي -> ي
+ */
+const normalizeArabicForComparison = (text) => {
+	return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
+		if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
+		if (ch === "ة") return "ه";
+		if (ch === "ى") return "ي";
+		return ch;
+	}).join("");
+};
+const makeDiacriticInsensitive = (text) => {
+	const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
+	return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
+};
+const isCombiningMarkOrSelector = (char) => {
+	if (!char) return false;
+	return /\p{M}/u.test(char) || char === "︎" || char === "️";
+};
+const isJoiner = (char) => char === "" || char === "";
+/**
+ * Ensures the position does not split a grapheme cluster (surrogate pairs,
+ * combining marks, or zero-width joiners / variation selectors).
+ *
+ * This is only used as a last-resort fallback when we are forced to split
+ * near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
+ */
+const adjustForUnicodeBoundary = (content, position) => {
+	let adjusted = position;
+	while (adjusted > 0) {
+		const high = content.charCodeAt(adjusted - 1);
+		const low = content.charCodeAt(adjusted);
+		if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
+			adjusted -= 1;
+			continue;
+		}
+		const nextChar = content[adjusted];
+		const prevChar = content[adjusted - 1];
+		if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
+			adjusted -= 1;
+			continue;
+		}
+		break;
+	}
+	return adjusted;
+};
 //#endregion
 //#region src/analysis/shared.ts
 const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
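normalizeArabicForComparison above folds text to one canonical form (strip marks, unify alef / ta-marbuta / alef-maqsura variants), while makeDiacriticInsensitive keeps the text and loosens the pattern instead. A standalone TypeScript sketch of the folding strategy (re-implemented here for illustration; the bundle's own functions are internal):

const MARKS = /[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06ED]/g;
const foldArabic = (text: string): string =>
	text
		.normalize("NFC")
		.replace(/[\u200C\u200D]/g, "") // drop zero-width (non-)joiners
		.replace(/\s+/g, " ")
		.trim()
		.replace(MARKS, "") // drop diacritics, tatweel, annotation signs
		.replace(/[آأإ]/g, "ا") // alef variants -> bare alef
		.replace(/ة/g, "ه") // ta marbuta -> ha
		.replace(/ى/g, "ي"); // alef maqsura -> ya

console.log(foldArabic("أَلْكِتَابُ") === foldArabic("الكتاب")); // true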
@@ -632,7 +707,6 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
 };
 const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
 const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
-
 //#endregion
 //#region src/analysis/line-starts.ts
 const resolveOptions$1 = (options = {}) => ({
@@ -658,65 +732,141 @@ const compareBySpecificity = (a, b) => {
 	return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
 };
 const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
-
-const trimTrailingWs = (out, mode) => {
-	const suffix = mode === "regex" ? "\\s*" : " ";
-	while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
-	return out;
-};
-/** Try to extract first word for fallback */
-const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
-/** Consume prefix matchers at current position */
-const consumePrefixes = (s, pos, out, matchers, ws) => {
-	let matched = false;
+const appendPrefix = (s, pos, out, matchers, ws) => {
 	for (const re of matchers) {
 		if (pos >= s.length) break;
 		const m = re.exec(s.slice(pos));
 		if (!m?.index && m?.[0]) {
 			out += escapeSignatureLiteral(m[0]);
 			pos += m[0].length;
-			matched = true;
 			const wsm = /^[ \t]+/u.exec(s.slice(pos));
 			if (wsm) {
 				pos += wsm[0].length;
 				out = appendWs(out, ws);
 			}
+			return {
+				matched: true,
+				out,
+				pos
+			};
 		}
 	}
 	return {
-		matched,
+		matched: false,
 		out,
 		pos
 	};
 };
-
-const tryMatchToken = (s, pos, out, compiled) => {
+const appendToken = (s, pos, out, compiled) => {
 	const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
-	if (!best) return {
-		matched: false,
-		out,
-		pos
-	};
-	return {
+	return best ? {
 		matched: true,
 		out: `${out}{{${best.token}}}`,
 		pos: pos + best.text.length
+	} : {
+		matched: false,
+		out,
+		pos
 	};
 };
-
-const tryMatchDelimiter = (s, pos, out) => {
+const appendDelimiter = (s, pos, out) => {
 	const ch = s[pos];
-	if (!ch || !isCommonDelimiter(ch)) return {
+	return ch && isCommonDelimiter(ch) ? {
+		matched: true,
+		out: `${out}${escapeSignatureLiteral(ch)}`,
+		pos: pos + 1
+	} : {
 		matched: false,
 		out,
 		pos
 	};
-	return {
-		matched: true,
-		out: `${out}${escapeSignatureLiteral(ch)}`,
-		pos: pos + 1
+};
+const appendFallbackWord = (s, pos, out) => {
+	const word = extractFirstWord(s.slice(pos));
+	return word ? `${out}${escapeSignatureLiteral(word)}` : null;
+};
+const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
+	const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
+	if (ws.skipped) return {
+		done: false,
+		matchedAny,
+		matchedToken,
+		out: ws.out,
+		pos: ws.pos,
+		steps: 0
+	};
+	const tok = appendToken(s, pos, out, compiled);
+	if (tok.matched) return {
+		done: false,
+		matchedAny: true,
+		matchedToken: true,
+		out: tok.out,
+		pos: tok.pos,
+		steps: 1
+	};
+	if (matchedAny) {
+		const delim = appendDelimiter(s, pos, out);
+		if (delim.matched) return {
+			done: false,
+			matchedAny,
+			matchedToken,
+			out: delim.out,
+			pos: delim.pos,
+			steps: 0
+		};
+		if (opts.includeFirstWordFallback && !matchedToken) {
+			const fallback = appendFallbackWord(s, pos, out);
+			if (fallback) return {
+				done: true,
+				matchedAny,
+				matchedToken,
+				out: fallback,
+				pos,
+				steps: 1
+			};
+		}
+		return {
+			done: true,
+			matchedAny,
+			matchedToken,
+			out,
+			pos,
+			steps: 0
+		};
+	}
+	if (!opts.includeFirstWordFallback) return {
+		done: true,
+		matchedAny,
+		matchedToken,
+		out,
+		pos,
+		steps: 0
+	};
+	const fallback = appendFallbackWord(s, pos, out);
+	return fallback ? {
+		done: true,
+		matchedAny: true,
+		matchedToken,
+		out: fallback,
+		pos,
+		steps: 0
+	} : {
+		done: true,
+		matchedAny,
+		matchedToken,
+		out,
+		pos,
+		steps: 0
 	};
 };
+/** Remove trailing whitespace placeholders */
+const trimTrailingWs = (out, mode) => {
+	const suffix = mode === "regex" ? "\\s*" : " ";
+	while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+	return out;
+};
+/** Try to extract first word for fallback */
+const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
 /** Skip whitespace at position */
 const skipWhitespace$1 = (s, pos, out, ws) => {
 	const m = /^[ \t]+/u.exec(s.slice(pos));
@@ -737,47 +887,25 @@ const tokenizeLineStart = (line, tokenNames, opts) => {
 	const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
 	const compiled = compileTokenRegexes(tokenNames);
 	let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
-	const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+	const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
 	pos = prefix.pos;
 	out = prefix.out;
 	matchedAny = prefix.matched;
 	while (steps < 6 && pos < s.length) {
-		const
-		if (
-
-
-
-
-
-		if (tok.matched) {
-			pos = tok.pos;
-			out = tok.out;
-			matchedAny = matchedToken = true;
-			steps++;
-			continue;
-		}
-		if (matchedAny) {
-			const delim = tryMatchDelimiter(s, pos, out);
-			if (delim.matched) {
-				pos = delim.pos;
-				out = delim.out;
-				continue;
-			}
-		}
-		if (matchedAny) {
-			if (opts.includeFirstWordFallback && !matchedToken) {
-				const word = extractFirstWord(s.slice(pos));
-				if (word) {
-					out += escapeSignatureLiteral(word);
-					steps++;
-				}
-			}
+		const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
+		if (next.done) {
+			if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
+			if (next.steps > 0) steps += next.steps;
+			matchedAny = next.matchedAny;
+			matchedToken = next.matchedToken;
+			out = next.out;
 			break;
 		}
-
-
-
-
+		pos = next.pos;
+		out = next.out;
+		matchedAny = next.matchedAny;
+		matchedToken = next.matchedToken;
+		steps += next.steps;
 	}
 	return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
 };
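analyzeCommonLineStarts (whose tail appears in the next hunk) drives tokenizeLineStart over each page and ranks the resulting signatures. A plausible invocation, assuming the function is exported and pages carry at least a content field (buildExample above reads page.content):

import { analyzeCommonLineStarts } from "flappa-doormal";

const pages = [
	{ content: "١ - حدثنا يحيى..." },
	{ content: "٢ - حدثنا مالك..." },
];
// Surfaces recurring line-start shapes such as "{{raqms}} {{dash}} {{naql}}",
// ranked by count/specificity (see compareByCount / compareBySpecificity above).
const common = analyzeCommonLineStarts(pages, { minCount: 2, topK: 10 });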
@@ -821,7 +949,6 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
 		pattern
 	})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
 };
-
 //#endregion
 //#region src/analysis/repeating-sequences.ts
 const resolveOptions = (options) => {
@@ -843,6 +970,7 @@ const resolveOptions = (options) => {
 const createRawCursor = (text, normalize) => {
 	let rawPos = 0;
 	return {
+		/** Advance cursor, returning the raw text chunk consumed */
 		advance(normalizedLen) {
 			if (!normalize) {
 				const chunk = text.slice(rawPos, rawPos + normalizedLen);
@@ -947,23 +1075,27 @@ const buildExample = (page, window, contextChars) => {
 		text: page.content.slice(start, end)
 	};
 };
+const recordPattern = (page, window, opts, stats) => {
+	if (opts.requireToken && !hasTokenInWindow(window)) return;
+	const pattern = buildPattern(window, opts.whitespace);
+	let entry = stats.get(pattern);
+	if (!entry) {
+		if (stats.size >= opts.maxUniquePatterns) return;
+		entry = {
+			count: 0,
+			examples: [],
+			...computeWindowStats(window)
+		};
+		stats.set(pattern, entry);
+	}
+	entry.count++;
+	if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+};
 /** Extract N-grams from a single page */
 const extractPageNgrams = (page, items, opts, stats) => {
-	for (let i = 0; i <= items.length - opts.minElements; i++)
-		const
-
-		const pattern = buildPattern(window, opts.whitespace);
-		if (!stats.has(pattern)) {
-			if (stats.size >= opts.maxUniquePatterns) continue;
-			stats.set(pattern, {
-				count: 0,
-				examples: [],
-				...computeWindowStats(window)
-			});
-		}
-		const entry = stats.get(pattern);
-		entry.count++;
-		if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+	for (let i = 0; i <= items.length - opts.minElements; i++) {
+		const maxWindowSize = Math.min(opts.maxElements, items.length - i);
+		for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
 	}
 };
 /**
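recordPattern is now called once per window, and extractPageNgrams enumerates, at each start index i, every window of minElements to maxElements items. The enumeration itself, as a standalone sketch:

const enumerateWindows = <T>(items: T[], minElements: number, maxElements: number): T[][] => {
	const windows: T[][] = [];
	for (let i = 0; i <= items.length - minElements; i++) {
		const maxWindowSize = Math.min(maxElements, items.length - i);
		for (let n = minElements; n <= maxWindowSize; n++) windows.push(items.slice(i, i + n));
	}
	return windows;
};

console.log(enumerateWindows(["a", "b", "c"], 2, 3));
// [ ["a","b"], ["a","b","c"], ["b","c"] ]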
@@ -985,7 +1117,6 @@ const analyzeRepeatingSequences = (pages, options) => {
 		pattern
 	}));
 };
-
 //#endregion
 //#region src/detection.ts
 /**
@@ -1147,7 +1278,6 @@ const analyzeTextForRule = (text) => {
 		...suggestPatternConfig(detected)
 	};
 };
-
 //#endregion
 //#region src/types/rules.ts
 /**
@@ -1170,9 +1300,9 @@ const PATTERN_TYPE_KEYS = [
 	"lineStartsAfter",
 	"lineEndsWith",
 	"template",
-	"regex"
+	"regex",
+	"dictionaryEntry"
 ];
-
 //#endregion
 //#region src/optimization/optimize-rules.ts
 const MERGEABLE_KEYS = new Set([
@@ -1190,11 +1320,17 @@ const getPatternArray = (rule, key) => {
 };
 const getPatternString = (rule, key) => {
 	const value = rule[key];
-	return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : "";
+	return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
 };
 const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
+const getDictionaryEntrySpecificityScore = (rule) => {
+	if (!("dictionaryEntry" in rule)) return 0;
+	const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
+	return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
+};
 const getSpecificityScore = (rule) => {
 	const key = getPatternKey(rule);
+	if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
 	return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
 };
 const createMergeKey = (rule) => {
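getDictionaryEntrySpecificityScore rewards stricter dictionary-entry rules so optimizeRules sorts them ahead of looser ones. A worked example with the defaults (the allow* flags false, midLineSubentries true, minLetters 2, maxLetters 10) and three stop words:

const score =
	2 * 20 + // minLetters * 20
	10 + // maxLetters
	120 + // comma-separated headwords not allowed (stricter)
	60 + // parenthesized headwords not allowed (stricter)
	20 + // no whitespace before the colon (stricter)
	0 + // midLineSubentries: true earns no strictness bonus
	Math.min(3, 25); // capped stop-word contribution
console.log(score); // 253 - higher scores sort earlier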
@@ -1231,7 +1367,6 @@ const optimizeRules = (rules) => {
 		rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
 	};
 };
-
 //#endregion
 //#region src/preprocessing/transforms.ts
 /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
@@ -1340,170 +1475,115 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
 	}
 	return result;
 };
-
 //#endregion
-//#region src/segmentation/rule
-
-
-
-
-
-
-
-
- *
- * NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
- */
-const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
-/**
- * Extracts named capture group names from a regex pattern.
- *
- * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
- *
- * @example
- * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
- * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
- * extractNamedCaptureNames('^\\d+') // []
- */
-const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([^>]+)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
-/**
- * Safely compiles a regex pattern, throwing a helpful error if invalid.
- */
-const compileRuleRegex = (pattern) => {
-	try {
-		return new RegExp(pattern, "gmu");
-	} catch (error) {
-		throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
+//#region src/segmentation/arabic-dictionary-rule.ts
+const uniqueCanonicalWords = (words) => {
+	const seen = /* @__PURE__ */ new Set();
+	const result = [];
+	for (const word of words) {
+		const normalized = normalizeArabicForComparison(word);
+		if (!normalized || seen.has(normalized)) continue;
+		seen.add(normalized);
+		result.push(word);
 	}
+	return result;
 };
-
-
-
-
-
-const
-
+const buildStopAlternation = (stopWords) => {
+	const unique = uniqueCanonicalWords(stopWords);
+	if (unique.length === 0) return "";
+	return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
+};
+const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
+	if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
+	const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
+	return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
+};
+const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
+	const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
+	const withCapture = `(?<${captureName}>${headwordBody})`;
+	if (!allowParenthesized) return `${withCapture}${colon}`;
+	return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
+};
+const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
+	if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
+	if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
+	if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
+};
+const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
+	validateDictionaryEntryOptions({
+		captureName,
+		maxLetters,
+		minLetters
+	});
+	const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
+	const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
+	const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
+	const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
+	const stopAlternation = buildStopAlternation(stopWords);
+	const lemmaBody = buildHeadwordBody({
+		allowCommaSeparated,
+		colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
+		stopAlternation,
+		stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
+		unit: lemmaUnit
+	});
+	const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
+	const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
+	const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
+	const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
+		allowParenthesized,
+		allowWhitespaceBeforeColon,
+		captureName: prefixedCaptureName,
+		headwordBody: lemmaBody
+	});
 	return {
-		captureNames,
-
+		captureNames: [prefixedCaptureName],
+		regex
 	};
 };
 /**
- *
+ * Creates a reusable split rule for Arabic dictionary entries.
  *
- *
- *
- *
- */
-const processBreakpointPattern = (pattern) => {
-	const { pattern: expanded } = expandTokensWithCaptures(pattern);
-	return expanded;
-};
-const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
-	};
-};
-const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
-	};
-};
-const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `(?:${alternatives})$`
-	};
-};
-const buildTemplateRegexSource = (template, capturePrefix) => {
-	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
-	return {
-		captureNames,
-		regex: pattern
-	};
-};
-/**
- * Builds a compiled regex and metadata from a split rule.
+ * The returned rule preserves authoring intent as a serializable
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
+ * regex string.
  *
- *
+ * @example
+ * createArabicDictionaryEntryRule({
+ *   stopWords: ['وقيل', 'ويقال', 'قال'],
+ *   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
+ * })
+ *
+ * @example
+ * createArabicDictionaryEntryRule({
+ *   allowParenthesized: true,
+ *   allowWhitespaceBeforeColon: true,
+ *   allowCommaSeparated: true,
+ *   stopWords: ['الليث', 'العجاج'],
+ * })
  */
-const
-
-
-
-
-
-]);
-	if (lineStartsAfter?.length) {
-		const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(lineStartsAfter, fuzzy, capturePrefix);
-		return {
-			captureNames,
-			regex: compileRuleRegex(lsaRegex),
-			usesCapture: true,
-			usesLineStartsAfter: true
-		};
-	}
-	let finalRegex = regex;
-	let allCaptureNames = [];
-	if (lineStartsWith?.length) {
-		const res = buildLineStartsWithRegexSource(lineStartsWith, fuzzy, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = res.captureNames;
-	}
-	if (lineEndsWith?.length) {
-		const res = buildLineEndsWithRegexSource(lineEndsWith, fuzzy, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = res.captureNames;
-	}
-	if (template) {
-		const res = buildTemplateRegexSource(template, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = [...allCaptureNames, ...res.captureNames];
-	}
-	if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
-	if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
+const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
+	validateDictionaryEntryOptions({
+		captureName,
+		maxLetters,
+		minLetters
+	});
 	return {
-
-
-
-
+		dictionaryEntry: {
+			allowCommaSeparated,
+			allowParenthesized,
+			allowWhitespaceBeforeColon,
+			captureName,
+			maxLetters,
+			midLineSubentries,
+			minLetters,
+			stopWords: uniqueCanonicalWords(stopWords)
+		},
+		meta,
+		pageStartPrevWordStoplist,
+		samePagePrevWordStoplist
 	};
 };
-
-//#endregion
-//#region src/segmentation/breakpoint-constants.ts
-/**
- * Shared constants for segmentation breakpoint processing.
- */
-/**
- * Threshold for using offset-based fast path in boundary processing.
- *
- * Below this: accurate string-search (handles offset drift from structural rules).
- * At or above this: O(n) arithmetic (performance critical for large books).
- *
- * The value of 1000 is chosen based on typical Arabic book sizes:
- * - Sahih al-Bukhari: ~1000-3000 pages
- * - Standard hadith collections: 1000-7000 pages
- * - Large aggregated corpora: 10k-50k pages
- *
- * For segments ≥1000 pages, the performance gain from offset-based slicing
- * outweighs the minor accuracy loss from potential offset drift.
- *
- * @remarks
- * Fast path is skipped when:
- * - `maxContentLength` is set (requires character-accurate splitting)
- * - `debugMetaKey` is set (requires proper provenance tracking)
- * - Content was structurally modified by marker stripping (offsets may drift)
- */
-const FAST_PATH_THRESHOLD = 1e3;
 const WINDOW_PREFIX_LENGTHS = [
 	80,
 	60,
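createArabicDictionaryEntryRule is the new authoring entry point in this region: it returns a serializable { dictionaryEntry: ... } rule rather than a raw regex string. Usage mirroring the @example blocks above (import from the package root assumed):

import { createArabicDictionaryEntryRule } from "flappa-doormal";

const rule = createArabicDictionaryEntryRule({
	stopWords: ["وقيل", "ويقال", "قال"],
	pageStartPrevWordStoplist: ["قال", "وقيل", "ويقال"],
});
// rule.dictionaryEntry.captureName defaults to "lemma"; when compiled, the
// pattern captures the headword before the colon in entries like "البابُ: ...".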
@@ -1530,23 +1610,6 @@ const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۞]/;
  * Matches outside this range are rejected unless `ignoreDeviation` is active.
  */
 const MAX_DEVIATION = 2e3;
-/**
- * Penalty score applied to non-newline anchor candidates.
- *
- * Designed to prioritize newline-aligned boundaries unless a whitespace match is
- * significantly closer (within 20 chars). Handles cases where marker stripping
- * shifts the boundary slightly.
- */
-const NON_NEWLINE_PENALTY = 20;
-/**
- * Limit for inferring start offset from a relaxed search (characters).
- *
- * If the relaxed search finds a match more than this distance away from the
- * expected position, we assume it's a false positive (e.g. repeated content)
- * and do not use it to infer the start offset.
- */
-const INFERENCE_PROXIMITY_LIMIT = 500;
-
 //#endregion
 //#region src/segmentation/match-utils.ts
 /**
|
|
|
1665
1728
|
if (!Number.isNaN(idx)) return idx;
|
|
1666
1729
|
}
|
|
1667
1730
|
};
|
|
1668
|
-
|
|
1669
1731
|
//#endregion
|
|
1670
1732
|
//#region src/segmentation/breakpoint-utils.ts
|
|
1671
1733
|
/**
|
|
@@ -2067,8 +2129,8 @@ const findAnchorCandidates = (content, prefix, start, end) => {
|
|
|
2067
2129
|
/** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
|
|
2068
2130
|
const selectBestAnchor = (candidates, expectedBoundary) => {
|
|
2069
2131
|
return candidates.reduce((best, curr) => {
|
|
2070
|
-
const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 :
|
|
2071
|
-
return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 :
|
|
2132
|
+
const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
|
|
2133
|
+
return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
|
|
2072
2134
|
});
|
|
2073
2135
|
};
|
|
2074
2136
|
/**
|
|
@@ -2122,7 +2184,7 @@ const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetI
|
|
|
2122
2184
|
if (relaxedPos > 0) {
|
|
2123
2185
|
const inferredStartOffset = rawBoundary - relaxedPos;
|
|
2124
2186
|
const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
|
|
2125
|
-
if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) <
|
|
2187
|
+
if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
|
|
2126
2188
|
startOffsetInFromPage = inferredStartOffset;
|
|
2127
2189
|
expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
|
|
2128
2190
|
pos = relaxedPos;
|
|
@@ -2196,7 +2258,7 @@ const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCoun
 const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
 	const pageCount = toIdx - fromIdx + 1;
 	const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
-	if (pageCount >= FAST_PATH_THRESHOLD && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
+	if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
 	return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
 };
 /**
@@ -2428,7 +2490,6 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
 	}
 	return -1;
 };
-
 //#endregion
 //#region src/segmentation/debug-meta.ts
 const resolveDebugConfig = (debug) => {
@@ -2470,59 +2531,222 @@ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
|
|
|
2470
2531
|
...word !== void 0 ? { word } : {}
|
|
2471
2532
|
} };
|
|
2472
2533
|
};
|
|
2473
|
-
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
|
|
2474
|
-
index: breakpointIndex,
|
|
2475
|
-
kind: rule.pattern === "" ? "pageBoundary" : "pattern",
|
|
2476
|
-
pattern: rule.pattern ?? rule.regex,
|
|
2477
|
-
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2478
|
-
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
|
|
2479
|
-
} });
|
|
2534
|
+
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
|
|
2535
|
+
index: breakpointIndex,
|
|
2536
|
+
kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
|
|
2537
|
+
pattern: rule.pattern ?? rule.regex,
|
|
2538
|
+
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2539
|
+
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
|
|
2540
|
+
} });
|
|
2541
|
+
/**
|
|
2542
|
+
* Helper to format the debug info into a human-readable string.
|
|
2543
|
+
* @param meta - The segment metadata object
|
|
2544
|
+
* @param options - Formatting options
|
|
2545
|
+
*/
|
|
2546
|
+
const formatRuleReason = (rule, concise) => {
|
|
2547
|
+
const { index, patternType, wordIndex, word } = rule;
|
|
2548
|
+
if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
|
|
2549
|
+
const wordInfo = word ? ` (Matched: "${word}")` : "";
|
|
2550
|
+
return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
|
|
2551
|
+
};
|
|
2552
|
+
const formatBreakpointReason = (breakpoint, concise) => {
|
|
2553
|
+
const { index, kind, pattern, wordIndex, word } = breakpoint;
|
|
2554
|
+
if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
|
|
2555
|
+
if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
|
|
2556
|
+
if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
|
|
2557
|
+
return `Breakpoint #${index} (${kind}) - "${pattern}"`;
|
|
2558
|
+
};
|
|
2559
|
+
const formatContentLengthReason = (split, concise) => {
|
|
2560
|
+
const { maxContentLength, splitReason } = split;
|
|
2561
|
+
if (concise) return `> ${maxContentLength} (${splitReason})`;
|
|
2562
|
+
return `Safety Split (${splitReason}) > ${maxContentLength}`;
|
|
2563
|
+
};
|
|
2564
|
+
/**
|
|
2565
|
+
* Helper to format the debug info into a human-readable string.
|
|
2566
|
+
* @param meta - The segment metadata object
|
|
2567
|
+
* @param options - Formatting options
|
|
2568
|
+
*/
|
|
2569
|
+
const getDebugReason = (meta, options) => {
|
|
2570
|
+
const debug = meta?._flappa;
|
|
2571
|
+
if (!debug) return "-";
|
|
2572
|
+
const concise = options?.concise;
|
|
2573
|
+
if (debug.rule) return formatRuleReason(debug.rule, concise);
|
|
2574
|
+
if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
|
|
2575
|
+
if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
|
|
2576
|
+
return "Unknown";
|
|
2577
|
+
};
|
|
2578
|
+
/**
|
|
2579
|
+
* Convenience helper to get the formatted debug reason directly from a segment.
|
|
2580
|
+
* @param segment - The segment object
|
|
2581
|
+
* @param options - Formatting options
|
|
2582
|
+
*/
|
|
2583
|
+
const getSegmentDebugReason = (segment, options) => {
|
|
2584
|
+
return getDebugReason(segment.meta, options);
|
|
2585
|
+
};
|
|
2586
|
+
//#endregion
|
|
+//#region src/segmentation/pattern-validator.ts
+const KNOWN_TOKENS = new Set(getAvailableTokens());
+const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
+const buildBareTokenRegex = () => {
+const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
+return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
+};
+/**
+* Validates a single pattern for common issues.
+*/
+const validatePattern = (pattern, seenPatterns) => {
+if (!pattern.trim()) return {
+message: "Empty pattern is not allowed",
+type: "empty_pattern"
+};
+if (seenPatterns.has(pattern)) return {
+message: `Duplicate pattern: "${pattern}"`,
+pattern,
+type: "duplicate"
+};
+seenPatterns.add(pattern);
+TOKEN_INSIDE_BRACES.lastIndex = 0;
+for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
+const name = match[1];
+if (!KNOWN_TOKENS.has(name)) return {
+message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
+suggestion: "Check spelling or use a known token",
+token: name,
+type: "unknown_token"
+};
+}
+for (const match of pattern.matchAll(buildBareTokenRegex())) {
+const [full, name] = match;
+const idx = match.index;
+if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
+message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
+suggestion: `{{${full}}}`,
+token: name,
+type: "missing_braces"
+};
+}
+};
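
The bare-token check above can be exercised standalone; a sketch with an illustrative token list (the real list comes from `getAvailableTokens()`).

```js
// Longest-first alternation so "raqms" wins over "raqm"; the lookarounds
// reject occurrences that are already wrapped in {{...}}.
const KNOWN = ["raqm", "raqms", "harf"];
const bare = new RegExp(
    `(?<!\\{\\{)(${[...KNOWN].sort((a, b) => b.length - a.length).join("|")})(?::\\w+)?(?!\\}\\})`,
    "g",
);
console.log([..."raqms:num ".matchAll(bare)].map((m) => m[0])); // ["raqms:num"], flagged missing_braces
console.log([..."{{raqms:num}}".matchAll(bare)].length);        // 0, already braced
```
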
 /**
-*
-* @param meta - The segment metadata object
-* @param options - Formatting options
+* Validates an array of patterns, returning parallel array of issues.
 */
-const
-const
-
-
-return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
+const validatePatternArray = (patterns) => {
+const seen = /* @__PURE__ */ new Set();
+const issues = patterns.map((p) => validatePattern(p, seen));
+return issues.some(Boolean) ? issues : void 0;
 };
-const
-
-
-if (
-
-return
+const applyRulePatternValidation = (result, key, patterns) => {
+if (!patterns) return false;
+const issues = validatePatternArray(patterns);
+if (!issues) return false;
+result[key] = issues;
+return true;
 };
-const
-
-
-
+const validateTemplateRule = (rule, result) => {
+if (rule.template === void 0) return false;
+const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
+if (!issue) return false;
+result.template = issue;
+return true;
+};
+const validateRegexRule = (rule, result) => {
+if (rule.regex === void 0) return false;
+if (!rule.regex.trim()) {
+result.regex = {
+message: "Empty pattern is not allowed",
+type: "empty_pattern"
+};
+return true;
+}
+try {
+new RegExp(rule.regex, "u");
+return false;
+} catch (error) {
+result.regex = {
+message: error instanceof Error ? error.message : String(error),
+pattern: rule.regex,
+type: "invalid_regex"
+};
+return true;
+}
+};
+const invalidDictionaryEntryIssue = (message) => ({
+message,
+type: "invalid_option"
+});
+const validateDictionaryEntryRule = (rule, result) => {
+if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
+const issues = {};
+const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
+if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
+if (allowCommaSeparated !== void 0 && typeof allowCommaSeparated !== "boolean") issues.allowCommaSeparated = invalidDictionaryEntryIssue("allowCommaSeparated must be a boolean");
+if (allowParenthesized !== void 0 && typeof allowParenthesized !== "boolean") issues.allowParenthesized = invalidDictionaryEntryIssue("allowParenthesized must be a boolean");
+if (allowWhitespaceBeforeColon !== void 0 && typeof allowWhitespaceBeforeColon !== "boolean") issues.allowWhitespaceBeforeColon = invalidDictionaryEntryIssue("allowWhitespaceBeforeColon must be a boolean");
+if (midLineSubentries !== void 0 && typeof midLineSubentries !== "boolean") issues.midLineSubentries = invalidDictionaryEntryIssue("midLineSubentries must be a boolean");
+if (captureName !== void 0 && !captureName.match(/^[A-Za-z_]\w*$/)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
+if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
+if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < (minLetters ?? 2))) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${minLetters ?? 2}`);
+if (Object.keys(issues).length === 0) return false;
+result.dictionaryEntry = issues;
+return true;
+};
+const formatValidationIssue = (_type, issue, loc) => {
+if (!issue) return null;
+if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
+if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
+if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
+if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
+return `${loc}: ${issue.message || issue.type}`;
 };
 /**
-*
-*
-*
+* Validates split rules for common pattern issues.
+*
+* Checks for:
+* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
+* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
+* - Duplicate patterns within the same rule
+*
+* @param rules - Array of split rules to validate
+* @returns Array parallel to input with validation results (undefined if no issues)
+*
+* @example
+* const issues = validateRules([
+* { lineStartsAfter: ['raqms:num'] }, // Missing braces
+* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
+* ]);
+* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
+* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
 */
-const
-const
-
-const
-
-
-
-
-
+const validateRules = (rules) => rules.map((rule) => {
+const result = {};
+const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
+const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
+const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
+const templateIssues = validateTemplateRule(rule, result);
+const regexIssues = validateRegexRule(rule, result);
+const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
+return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
+});
 /**
-*
-*
-*
+* Formats a validation result array into a list of human-readable error messages.
+*
+* Useful for displaying validation errors in UIs.
+*
+* @param results - The result array from `validateRules()`
+* @returns Array of formatted error strings
+*
+* @example
+* const issues = validateRules(rules);
+* const errors = formatValidationReport(issues);
+* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
 */
-const
-
+const formatValidationReport = (results) => results.flatMap((result, i) => {
+if (!result) return [];
+return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
+});
+const formatValidationIssues = (type, issues, ruleNumber) => {
+if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
+return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
 };
-
 //#endregion
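
A hedged usage sketch for the two new entry points above, assuming they are re-exported on the package's public surface (this diff alone does not confirm that).

```js
// issues is an array parallel to the rules; entries are undefined when clean.
const issues = validateRules([{ lineStartsAfter: ["raqms:num"] }]);
if (issues.some(Boolean)) {
    for (const line of formatValidationReport(issues)) console.error(line);
    // e.g. 'Rule 1, lineStartsAfter: Missing {{}} around token "raqms"'
}
```
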
 //#region src/segmentation/breakpoint-processor.ts
 const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
@@ -2650,7 +2874,7 @@ const checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx,
 const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
 const driftTolerance = Math.max(100, fullContent.length * .01);
 const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
-if (!isAligned && pageCount >=
+if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
 actualLength: fullContent.length,
 drift: Math.abs(expectedLength - fullContent.length),
 expectedLength,
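
The new `1e3`-page threshold pairs with a drift tolerance that scales with content size; a standalone illustration:

```js
// max(100 chars, 1% of the concatenated content length)
const driftTolerance = (len) => Math.max(100, len * 0.01);
console.log(driftTolerance(5000));  // 100 (the floor applies)
console.log(driftTolerance(50000)); // 500 (the 1% term applies)
```
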
@@ -2791,8 +3015,7 @@ const computeWindowEndPositionForIteration(remainingContent, cursorPos, curre
 if (maxPages === 0) {
 const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
 const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
-
-return Math.min(capped, remainingContent.length);
+return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
 }
 const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
 return Math.min(pos, remainingContent.length);
@@ -2847,7 +3070,7 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
 const pageCount = toIdx - fromIdx + 1;
 const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
 const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
-if (pageCount <
+if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
 if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
 return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
 };
@@ -3030,7 +3253,179 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
 return result;
 };
-
+//#endregion
+//#region src/segmentation/rule-regex.ts
+/**
+* Checks if a regex pattern contains standard (anonymous) capturing groups.
+*
+* Detects standard capturing groups `(...)` while excluding:
+* - Non-capturing groups `(?:...)`
+* - Lookahead assertions `(?=...)` and `(?!...)`
+* - Lookbehind assertions `(?<=...)` and `(?<!...)`
+* - Named groups `(?<name>...)` (start with `(?` so excluded here)
+*
+* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
+*/
+const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
+/**
+* Extracts named capture group names from a regex pattern.
+*
+* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
+*
+* @example
+* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
+* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
+* extractNamedCaptureNames('^\\d+') // []
+*/
+const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
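
A standalone check of the two capture helpers above, using the same regexes with illustrative inputs:

```js
const hasCapturingGroup = (p) => /\((?!\?)/.test(p);
const extractNamedCaptureNames = (p) => [...p.matchAll(/\(\?<([A-Za-z_]\w*)>/g)]
    .map((m) => m[1])
    .filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
console.log(hasCapturingGroup("^(\\d+)"));                 // true  (anonymous group)
console.log(hasCapturingGroup("^(?:\\d+)"));               // false (non-capturing)
console.log(extractNamedCaptureNames("^(?<num>\\d+)"));    // ["num"]
console.log(extractNamedCaptureNames("(?<_r0>x)(?<n>y)")); // ["n"] (internal _r* names filtered)
```
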
3281
|
+
/**
|
|
3282
|
+
* Safely compiles a regex pattern, throwing a helpful error if invalid.
|
|
3283
|
+
*/
|
|
3284
|
+
const compileRuleRegex = (pattern) => {
|
|
3285
|
+
try {
|
|
3286
|
+
return new RegExp(pattern, "gmu");
|
|
3287
|
+
} catch (error) {
|
|
3288
|
+
throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${error instanceof Error ? error.message : String(error)}`);
|
|
3289
|
+
}
|
|
3290
|
+
};
|
|
3291
|
+
/**
|
|
3292
|
+
* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
|
|
3293
|
+
*
|
|
3294
|
+
* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
|
|
3295
|
+
*/
|
|
3296
|
+
const processPattern = (pattern, fuzzy, capturePrefix) => {
|
|
3297
|
+
const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
|
|
3298
|
+
return {
|
|
3299
|
+
captureNames,
|
|
3300
|
+
pattern: expanded
|
|
3301
|
+
};
|
|
3302
|
+
};
|
|
3303
|
+
/**
|
|
3304
|
+
* Processes a breakpoint pattern by expanding tokens only.
|
|
3305
|
+
*
|
|
3306
|
+
* Unlike `processPattern`, this does NOT escape brackets because breakpoints
|
|
3307
|
+
* are treated as raw regex patterns (like the `regex` rule type).
|
|
3308
|
+
* Users have full control over regex syntax including `(?:...)` groups.
|
|
3309
|
+
*/
|
|
3310
|
+
const processBreakpointPattern = (pattern) => {
|
|
3311
|
+
const { pattern: expanded } = expandTokensWithCaptures(pattern);
|
|
3312
|
+
return expanded;
|
|
3313
|
+
};
|
|
3314
|
+
/**
|
|
3315
|
+
* Builds the raw regex source for a `lineStartsAfter` rule.
|
|
3316
|
+
*
|
|
3317
|
+
* Expands each pattern through `processPattern()`, combines them into an
|
|
3318
|
+
* alternation at the start of a line, and appends a trailing content capture.
|
|
3319
|
+
*
|
|
3320
|
+
* @param patterns - Template-like line-start markers to match
|
|
3321
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3322
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3323
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3324
|
+
*/
|
|
3325
|
+
const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3326
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3327
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3328
|
+
return {
|
|
3329
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3330
|
+
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
|
|
3331
|
+
};
|
|
3332
|
+
};
|
|
3333
|
+
/**
|
|
3334
|
+
* Builds the raw regex source for a `lineStartsWith` rule.
|
|
3335
|
+
*
|
|
3336
|
+
* Expands each pattern through `processPattern()` and combines them into an
|
|
3337
|
+
* alternation anchored at the start of a line.
|
|
3338
|
+
*
|
|
3339
|
+
* @param patterns - Template-like line-start markers to match
|
|
3340
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3341
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3342
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3343
|
+
*/
|
|
3344
|
+
const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3345
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3346
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3347
|
+
return {
|
|
3348
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3349
|
+
regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
|
|
3350
|
+
};
|
|
3351
|
+
};
|
|
3352
|
+
/**
|
|
3353
|
+
* Builds the raw regex source for a `lineEndsWith` rule.
|
|
3354
|
+
*
|
|
3355
|
+
* Expands each pattern through `processPattern()` and combines them into an
|
|
3356
|
+
* end-anchored alternation.
|
|
3357
|
+
*
|
|
3358
|
+
* @param patterns - Template-like line-end markers to match
|
|
3359
|
+
* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
|
|
3360
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3361
|
+
* @returns Regex source plus the named captures extracted from the patterns
|
|
3362
|
+
*/
|
|
3363
|
+
const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
|
|
3364
|
+
const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
|
|
3365
|
+
const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
|
|
3366
|
+
return {
|
|
3367
|
+
captureNames: processed.flatMap((p) => p.captureNames),
|
|
3368
|
+
regex: `(?:${alternatives})$`
|
|
3369
|
+
};
|
|
3370
|
+
};
|
|
3371
|
+
/**
|
|
3372
|
+
* Builds the raw regex source for a `template` rule.
|
|
3373
|
+
*
|
|
3374
|
+
* Expands tokens and named captures via `expandTokensWithCaptures()` after
|
|
3375
|
+
* applying `escapeTemplateBrackets()` to non-token brackets.
|
|
3376
|
+
*
|
|
3377
|
+
* @param template - Template string containing optional `{{token}}` markers
|
|
3378
|
+
* @param capturePrefix - Optional prefix used for internal named captures
|
|
3379
|
+
* @returns Regex source plus the named captures extracted from the template
|
|
3380
|
+
*/
|
|
3381
|
+
const buildTemplateRegexSource = (template, capturePrefix) => {
|
|
3382
|
+
const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
|
|
3383
|
+
return {
|
|
3384
|
+
captureNames,
|
|
3385
|
+
regex: pattern
|
|
3386
|
+
};
|
|
3387
|
+
};
|
|
3388
|
+
const getFuzzyCandidatePatterns = (rule) => [
|
|
3389
|
+
..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
|
|
3390
|
+
..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
|
|
3391
|
+
..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
|
|
3392
|
+
];
|
|
3393
|
+
const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
|
|
3394
|
+
if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
|
|
3395
|
+
if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
|
|
3396
|
+
if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
|
|
3397
|
+
if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
|
|
3398
|
+
return null;
|
|
3399
|
+
};
|
|
3400
|
+
/**
|
|
3401
|
+
* Builds a compiled regex and metadata from a split rule.
|
|
3402
|
+
*
|
|
3403
|
+
* Behavior mirrors the previous implementation in `segmenter.ts`.
|
|
3404
|
+
*/
|
|
3405
|
+
const buildRuleRegex = (rule, capturePrefix) => {
|
|
3406
|
+
const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
|
|
3407
|
+
if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
|
|
3408
|
+
const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
|
|
3409
|
+
return {
|
|
3410
|
+
captureNames,
|
|
3411
|
+
regex: compileRuleRegex(lsaRegex),
|
|
3412
|
+
usesCapture: true,
|
|
3413
|
+
usesLineStartsAfter: true
|
|
3414
|
+
};
|
|
3415
|
+
}
|
|
3416
|
+
const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
|
|
3417
|
+
let finalRegex = ruleRegexSource?.regex;
|
|
3418
|
+
let allCaptureNames = ruleRegexSource?.captureNames ?? [];
|
|
3419
|
+
if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
|
|
3420
|
+
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
|
|
3421
|
+
if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
|
|
3422
|
+
return {
|
|
3423
|
+
captureNames: allCaptureNames,
|
|
3424
|
+
regex: compileRuleRegex(finalRegex),
|
|
3425
|
+
usesCapture: hasCapturingGroup(finalRegex),
|
|
3426
|
+
usesLineStartsAfter: false
|
|
3427
|
+
};
|
|
3428
|
+
};
|
|
3034
3429
|
//#endregion
|
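
A sketch of the rough shape `buildRuleRegex` returns for a simple rule; the exact source varies once fuzzy expansion and `{{token}}` captures apply, so treat the regex below as approximate.

```js
// A lineStartsWith rule compiles to one named branch per pattern, anchored at
// line start with tolerance for bidi/zero-width control characters.
const built = buildRuleRegex({ fuzzy: false, lineStartsWith: ["CHAPTER "] });
// built.regex.source ≈ '^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:(?<_r0>CHAPTER ))'
// built.usesLineStartsAfter === false; built.captureNames ≈ []
```
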
 //#region src/segmentation/fast-fuzzy-prefix.ts
 /**
@@ -3078,9 +3473,8 @@ const compileFastFuzzyTokenRule = (tokenTemplate) => {
 const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
 if (!m) return null;
 const token = m[1];
-
-
-const compiled = compileLiteralAlternation(tokenPattern);
+if (!(token in TOKEN_PATTERNS)) return null;
+const compiled = compileLiteralAlternation(getTokenPattern(token));
 return compiled ? {
 alternatives: compiled.alternatives,
 token
@@ -3093,11 +3487,11 @@ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
 }
 return null;
 };
-
 //#endregion
 //#region src/segmentation/segmenter-rule-utils.ts
 const tryCompileFastFuzzyRule = (rule) => {
-
+const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
+if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
 if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
 const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
 if (compiled) return {
@@ -3139,7 +3533,10 @@ const partitionRulesForMatching = (rules) => {
 prefix: `r${index}_`,
 rule
 });
-else standaloneRules.push(
+else standaloneRules.push({
+index,
+rule
+});
 }
 return {
 combinableRules,
@@ -3147,9 +3544,37 @@ const partitionRulesForMatching = (rules) => {
 standaloneRules
 };
 };
+const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
+const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
+const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
+const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
+const trimTrailingPageWrapNoise = (text) => {
+let trimmed = text.trimEnd();
+while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
+return trimmed;
+};
+const endsWithStrongSentenceTerminator = (pageContent) => {
+return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
+};
+const extractLastArabicWord = (pageContent) => {
+return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
+};
+const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
+if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
+const lastWord = extractLastArabicWord(previousPageContent);
+return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
+};
+const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
+if (!stoplist) return true;
+const lastWord = extractLastArabicWord(contentBeforeMatch);
+return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
+};
|
|
3150
3572
|
const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
3151
3573
|
const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
|
|
3152
3574
|
const compiledPageStartPrev = /* @__PURE__ */ new Map();
|
|
3575
|
+
const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
|
|
3576
|
+
const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
|
|
3577
|
+
const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
|
|
3153
3578
|
const getPageStartPrevRegex = (rule, ruleIndex) => {
|
|
3154
3579
|
if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
|
|
3155
3580
|
const pattern = rule.pageStartGuard;
|
|
@@ -3161,6 +3586,33 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
|
3161
3586
|
compiledPageStartPrev.set(ruleIndex, re);
|
|
3162
3587
|
return re;
|
|
3163
3588
|
};
|
|
3589
|
+
const getPrevWordStoplist = (rule, ruleIndex) => {
|
|
3590
|
+
if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
|
|
3591
|
+
const stoplist = rule.pageStartPrevWordStoplist;
|
|
3592
|
+
if (!stoplist?.length) {
|
|
3593
|
+
compiledPrevWordStoplists.set(ruleIndex, null);
|
|
3594
|
+
return null;
|
|
3595
|
+
}
|
|
3596
|
+
const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
|
|
3597
|
+
compiledPrevWordStoplists.set(ruleIndex, normalized);
|
|
3598
|
+
return normalized;
|
|
3599
|
+
};
|
|
3600
|
+
const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
|
|
3601
|
+
if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
|
|
3602
|
+
const stoplist = rule.samePagePrevWordStoplist;
|
|
3603
|
+
if (!stoplist?.length) {
|
|
3604
|
+
compiledSamePagePrevWordStoplists.set(ruleIndex, null);
|
|
3605
|
+
return null;
|
|
3606
|
+
}
|
|
3607
|
+
const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
|
|
3608
|
+
compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
|
|
3609
|
+
return normalized;
|
|
3610
|
+
};
|
|
3611
|
+
const getPreviousPageContent = (boundaryIndex) => {
|
|
3612
|
+
if (boundaryIndex <= 0) return "";
|
|
3613
|
+
const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
|
|
3614
|
+
return matchContent.slice(prevBoundary.start, prevBoundary.end);
|
|
3615
|
+
};
|
|
3164
3616
|
const getPrevPageLastNonWsChar = (boundaryIndex) => {
|
|
3165
3617
|
if (boundaryIndex <= 0) return "";
|
|
3166
3618
|
const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
|
|
@@ -3170,13 +3622,24 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
|
|
|
3170
3622
|
}
|
|
3171
3623
|
return "";
|
|
3172
3624
|
};
|
|
3625
|
+
const getCurrentPageContentBeforeMatch = (matchStart) => {
|
|
3626
|
+
const pageId = pageMap.getId(matchStart);
|
|
3627
|
+
const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
|
|
3628
|
+
if (boundaryIndex === void 0) return "";
|
|
3629
|
+
const boundary = pageMap.boundaries[boundaryIndex];
|
|
3630
|
+
return matchContent.slice(boundary.start, matchStart);
|
|
3631
|
+
};
|
|
3173
3632
|
return (rule, ruleIndex, matchStart) => {
|
|
3174
3633
|
const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
|
|
3175
|
-
if (boundaryIndex
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3634
|
+
if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
|
|
3635
|
+
const prevReq = getPageStartPrevRegex(rule, ruleIndex);
|
|
3636
|
+
if (prevReq) {
|
|
3637
|
+
const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
|
|
3638
|
+
if (!lastChar || !prevReq.test(lastChar)) return false;
|
|
3639
|
+
}
|
|
3640
|
+
return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
|
|
3641
|
+
}
|
|
3642
|
+
return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
|
|
3180
3643
|
};
|
|
3181
3644
|
};
|
|
3182
3645
|
/**
|
|
@@ -3212,10 +3675,10 @@ const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule,
|
|
|
3212
3675
|
/**
|
|
3213
3676
|
* Processes matches for all fast-fuzzy rules at a specific line start.
|
|
3214
3677
|
*/
|
|
3215
|
-
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard,
|
|
3678
|
+
const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
|
|
3216
3679
|
for (const ffRule of fastFuzzyRules) {
|
|
3217
3680
|
if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
|
|
3218
|
-
if (
|
|
3681
|
+
if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
|
|
3219
3682
|
attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
|
|
3220
3683
|
}
|
|
3221
3684
|
};
|
|
@@ -3230,19 +3693,17 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
|
|
|
3230
3693
|
currentBoundary = pageMap.boundaries[boundaryIdx];
|
|
3231
3694
|
}
|
|
3232
3695
|
};
|
|
3233
|
-
const isPageStart = (offset) => offset === currentBoundary?.start;
|
|
3234
3696
|
for (let lineStart = 0; lineStart <= matchContent.length;) {
|
|
3235
3697
|
advanceBoundaryTo(lineStart);
|
|
3236
3698
|
const pageId = currentBoundary?.id ?? 0;
|
|
3237
3699
|
if (lineStart >= matchContent.length) break;
|
|
3238
|
-
processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard,
|
|
3700
|
+
processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
|
|
3239
3701
|
const nextNl = matchContent.indexOf("\n", lineStart);
|
|
3240
3702
|
if (nextNl === -1) break;
|
|
3241
3703
|
lineStart = nextNl + 1;
|
|
3242
3704
|
}
|
|
3243
3705
|
return splitPointsByRule;
|
|
3244
3706
|
};
|
|
3245
|
-
|
|
3246
3707
|
//#endregion
|
|
3247
3708
|
//#region src/segmentation/split-point-helpers.ts
|
|
3248
3709
|
const MAX_REGEX_ITERATIONS = 1e5;
|
|
@@ -3256,7 +3717,7 @@ const buildContentOffsets = (match, ruleInfo) => {
|
|
|
3256
3717
|
if (!ruleInfo.usesLineStartsAfter) return {};
|
|
3257
3718
|
const captured = match.groups?.[`${ruleInfo.prefix}__content`];
|
|
3258
3719
|
if (captured === void 0) return {};
|
|
3259
|
-
return { contentStartOffset: (match.groups?.[ruleInfo.prefix]
|
|
3720
|
+
return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
|
|
3260
3721
|
};
|
|
3261
3722
|
const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
|
|
3262
3723
|
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
@@ -3271,7 +3732,32 @@ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
|
3271
3732
|
wordIndex
|
|
3272
3733
|
};
|
|
3273
3734
|
};
|
|
3735
|
+
const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
|
|
3736
|
+
const arr = splitPointsByRule.get(originalIndex);
|
|
3737
|
+
if (!arr) {
|
|
3738
|
+
splitPointsByRule.set(originalIndex, [point]);
|
|
3739
|
+
return;
|
|
3740
|
+
}
|
|
3741
|
+
arr.push(point);
|
|
3742
|
+
};
|
|
3743
|
+
/**
|
|
3744
|
+
* Executes a combined regex over the content for combinable rules and records
|
|
3745
|
+
* any resulting split points into `splitPointsByRule`.
|
|
3746
|
+
*
|
|
3747
|
+
* This function mutates `splitPointsByRule` in place and throws if the regex
|
|
3748
|
+
* iteration guard is exceeded.
|
|
3749
|
+
*
|
|
3750
|
+
* @param matchContent - Concatenated content being segmented
|
|
3751
|
+
* @param combinableRules - Rules that can be combined into a single alternation
|
|
3752
|
+
* @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
|
|
3753
|
+
* @param pageMap - Page boundary mapping utilities for the content
|
|
3754
|
+
* @param passesPageStartGuard - Callback that decides whether a match is allowed
|
|
3755
|
+
* @param splitPointsByRule - Mutable map collecting split points by rule index
|
|
3756
|
+
* @param logger - Optional logger for iteration diagnostics
|
|
3757
|
+
* @returns Nothing; results are written into `splitPointsByRule`
|
|
3758
|
+
*/
|
|
3274
3759
|
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
|
|
3760
|
+
assertCombinedRuleAlignment(combinableRules, ruleRegexes);
|
|
3275
3761
|
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
3276
3762
|
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
3277
3763
|
logger?.debug?.("[segmenter] combined regex built", {
|
|
@@ -3286,19 +3772,29 @@ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, page
|
|
|
3286
3772
|
iterations,
|
|
3287
3773
|
position: m.index
|
|
3288
3774
|
});
|
|
3289
|
-
|
|
3290
|
-
if (matchedIndex !== -1) {
|
|
3291
|
-
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
3292
|
-
if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
|
|
3293
|
-
const arr = splitPointsByRule.get(originalIndex);
|
|
3294
|
-
if (!arr) splitPointsByRule.set(originalIndex, [createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex])]);
|
|
3295
|
-
else arr.push(createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex]));
|
|
3296
|
-
}
|
|
3297
|
-
}
|
|
3775
|
+
processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
|
|
3298
3776
|
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
3299
3777
|
m = combinedRegex.exec(matchContent);
|
|
3300
3778
|
}
|
|
3301
3779
|
};
|
|
3780
|
+
const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
|
|
3781
|
+
if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
|
|
3782
|
+
for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
|
|
3783
|
+
};
|
|
3784
|
+
const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
|
|
3785
|
+
const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
|
|
3786
|
+
if (matchedIndex === -1) return;
|
|
3787
|
+
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
3788
|
+
if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
|
|
3789
|
+
addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
|
|
3790
|
+
};
|
|
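
The combined-regex dispatch that `assertCombinedRuleAlignment` now guards can be illustrated in isolation; the group names mirror the `r${index}_` prefixes assigned in `partitionRulesForMatching`.

```js
// One pass over the content; whichever named group is defined identifies the rule.
const combined = /(?<r0_>^باب .*)|(?<r1_>^فصل .*)/gmu;
for (const m of "باب الصلاة\nفصل في النية".matchAll(combined)) {
    const ruleIndex = m.groups.r0_ !== undefined ? 0 : 1;
    console.log(ruleIndex, m[0]); // 0 'باب الصلاة', then 1 'فصل في النية'
}
```
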
+/**
+* Builds compiled regex metadata for each combinable rule while preserving the
+* prefix used to identify the matching branch inside a combined alternation.
+*
+* @param combinableRules - Rules eligible for combined-regex processing
+* @returns Rule regex metadata aligned with the input order
+*/
 const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
 const built = buildRuleRegex(rule, prefix);
 return {
@@ -3307,6 +3803,18 @@ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefi
 source: `(?<${prefix}>${built.regex.source})`
 };
 });
+/**
+* Processes a standalone rule by matching it independently and appending its
+* resulting split points into `splitPointsByRule`.
+*
+* @param rule - The standalone split rule to evaluate
+* @param ruleIndex - Original rule index in the caller's rules array
+* @param matchContent - Concatenated content being segmented
+* @param pageMap - Page boundary mapping utilities for the content
+* @param passesPageStartGuard - Callback that decides whether a match is allowed
+* @param splitPointsByRule - Mutable map collecting split points by rule index
+* @returns Nothing; results are written into `splitPointsByRule`
+*/
 const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
 const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
 const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
@@ -3341,6 +3849,15 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
 }
 return matches;
 };
+/**
+* Applies per-rule occurrence filtering and optional debug metadata patches to
+* the collected split points.
+*
+* @param rules - Full rule list in original order
+* @param splitPointsByRule - Split points grouped by originating rule index
+* @param debugMetaKey - Optional metadata key used for debug provenance patches
+* @returns Flattened split points after occurrence filtering and debug merging
+*/
 const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
 const result = [];
 rules.forEach((rule, index) => {
@@ -3358,7 +3875,6 @@ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
 });
 return result;
 };
-
 //#endregion
 //#region src/segmentation/segmenter.ts
 /**
@@ -3432,10 +3948,30 @@ const dedupeSplitPoints = (splitPoints) => {
 const byIndex = /* @__PURE__ */ new Map();
 for (const p of splitPoints) {
 const existing = byIndex.get(p.index);
-if (!existing
+if (!existing) {
+byIndex.set(p.index, p);
+continue;
+}
+byIndex.set(p.index, mergeSplitPoints(existing, p));
 }
 return [...byIndex.values()].sort((a, b) => a.index - b.index);
 };
+const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
+const mergeRecord = (existing, incoming) => existing || incoming ? {
+...existing ?? {},
+...incoming ?? {}
+} : void 0;
+const mergeSplitPoints = (existing, incoming) => {
+const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
+const fallback = preferred === incoming ? existing : incoming;
+return {
+...fallback,
+...preferred,
+contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
+meta: mergeRecord(existing.meta, incoming.meta),
+namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
+};
+};
3975
|
/**
|
|
3440
3976
|
* If no structural rules produced segments, create a single segment spanning all pages.
|
|
3441
3977
|
* This allows breakpoint processing to still run.
|
|
@@ -3468,7 +4004,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey,
|
|
|
3468
4004
|
});
|
|
3469
4005
|
const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
|
|
3470
4006
|
if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
|
|
3471
|
-
for (const rule of standaloneRules) processStandaloneRule(rule,
|
|
4007
|
+
for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
|
|
3472
4008
|
return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
|
|
3473
4009
|
};
|
|
3474
4010
|
/**
|
|
@@ -3508,7 +4044,7 @@ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
|
|
|
3508
4044
|
* @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
|
|
3509
4045
|
*/
|
|
3510
4046
|
const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
|
|
3511
|
-
if (!content
|
|
4047
|
+
if (!content?.includes("\n")) return content;
|
|
3512
4048
|
if (pageJoiner === "newline") return content;
|
|
3513
4049
|
const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
|
|
3514
4050
|
if (breaksInRange.length === 0) return content;
|
|
@@ -3616,16 +4152,23 @@ const segmentPages = (pages, options) => {
|
|
|
3616
4152
|
* @returns Array of segment objects
|
|
3617
4153
|
*/
|
|
3618
4154
|
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
4155
|
+
const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
|
|
4156
|
+
const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
|
|
4157
|
+
const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
|
|
4158
|
+
const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
|
|
4159
|
+
...meta,
|
|
4160
|
+
...namedCaptures
|
|
4161
|
+
} : void 0;
|
|
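
The small helpers hoisted above keep `createSegment` terse; `applyMeta` in particular only allocates when there is something to merge, and named captures win key collisions because they are spread last:

```js
const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? { ...meta, ...namedCaptures } : void 0;
console.log(applyMeta({ type: "chapter" }, { num: "٣" })); // { type: "chapter", num: "٣" }
console.log(applyMeta(void 0, void 0));                    // undefined
```
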
 /**
 * Creates a single segment from a content range.
 */
 const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
-const actualStart = start
+const actualStart = getActualStart(start, contentStartOffset);
 const sliced = content.slice(actualStart, end);
-let text =
+let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
 if (!text) return null;
 if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
-const adjustedStart = actualStart
+const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
 const from = pageMap.getId(adjustedStart);
 const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
 const seg = {
@@ -3633,10 +4176,8 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
 from
 };
 if (to !== from) seg.to = to;
-
-
-...namedCaptures
-};
+const mergedMeta = applyMeta(meta, namedCaptures);
+if (mergedMeta) seg.meta = mergedMeta;
 return seg;
 };
 /**
@@ -3668,659 +4209,6 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
 }
 return [...segments, ...createSegmentsFromSplitPoints()];
 };
-
-//#endregion
-//#region src/recovery.ts
-const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
-const normalizeForCompare = (s, mode) => {
-if (mode === "none") return s;
-let out = s;
-if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
-out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
-return out;
-};
-const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
-const buildFixedOptions = (options, selectedRuleIndices) => {
-const fixedRules = (options.rules ?? []).map((r, idx) => {
-if (!selectedRuleIndices.has(idx)) return r;
-if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
-const { lineStartsAfter, ...rest } = r;
-return {
-...rest,
-lineStartsWith: lineStartsAfter
-};
-});
-return {
-...options,
-rules: fixedRules
-};
-};
-const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
-const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
-const parts = [];
-for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
-const matchContent = parts.join("\n");
-if (pageJoiner === "newline") return {
-matchContent,
-outputContent: matchContent
-};
-return {
-matchContent,
-outputContent: parts.join(" ")
-};
-};
-const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
-const rules = options.rules ?? [];
-const compiled = [];
-for (const idx of selectedRuleIndices) {
-const r = rules[idx];
-if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
-const { lineStartsAfter, ...rest } = r;
-const built = buildRuleRegex({
-...rest,
-lineStartsWith: lineStartsAfter
-});
-compiled.push({
-ruleIndex: idx,
-startsWithRegex: new RegExp(built.regex.source, "mu")
-});
-}
-return compiled;
-};
-const findUniqueAnchorPos = (outputContent, segmentContent) => {
-for (const len of [
-80,
-60,
-40,
-30,
-20,
-15
-]) {
-const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
-if (!needle.trim()) continue;
-const first = outputContent.indexOf(needle);
-if (first === -1) continue;
-if (outputContent.indexOf(needle, first + 1) === -1) return first;
-}
-return null;
-};
-const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
-const line = matchContent.slice(lineStart);
-for (const mr of compiledMistaken) {
-mr.startsWithRegex.lastIndex = 0;
-const m = mr.startsWithRegex.exec(line);
-if (!m || m.index !== 0) continue;
-const markerMatch = m[0];
-const markerEnd = lineStart + markerMatch.length;
-if (anchorPos < markerEnd) continue;
-const gap = matchContent.slice(markerEnd, anchorPos);
-const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
-if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
-return { prefix: recoveredPrefix };
-}
-return { reason: "no selected marker pattern matched at anchored line start" };
-};
-const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
-const fromIdx = pageIdToIndex.get(segment.from);
-const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
-if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
-kind: "unresolved",
-reason: "segment page range not found in pages"
-};
-const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
-if (!segment.content) return {
-kind: "unresolved",
-reason: "empty segment content"
-};
-const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
-if (anchorPos === null) return {
-kind: "unresolved",
-reason: "could not uniquely anchor segment content in page range"
-};
-const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
-const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
-if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
-kind: "unresolved",
-reason: found.reason
-};
-return {
-kind: "recovered",
-recoveredContent: `${found.prefix}${segment.content}`,
-recoveredPrefix: found.prefix
-};
-};
-const resolveRuleIndicesSelector = (rules, indicesIn) => {
-const errors = [];
-const indices = /* @__PURE__ */ new Set();
-for (const idx of indicesIn) {
-if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
-errors.push(`Selector index out of range: ${idx}`);
-continue;
-}
-const rule = rules[idx];
-if (!rule || !("lineStartsAfter" in rule)) {
-errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
-continue;
-}
-indices.add(idx);
-}
-return {
-errors,
-indices,
-warnings: []
-};
-};
-const resolvePredicateSelector = (rules, predicate) => {
-const errors = [];
-const warnings = [];
-const indices = /* @__PURE__ */ new Set();
-rules.forEach((r, i) => {
-try {
-if (!predicate(r, i)) return;
-if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
-indices.add(i);
-return;
-}
-warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
-} catch (e) {
-const msg = e instanceof Error ? e.message : String(e);
-errors.push(`Predicate threw at rule ${i}: ${msg}`);
-}
-});
-if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
-return {
-errors,
-indices,
-warnings
-};
-};
-const resolvePatternsSelector = (rules, patterns, matchMode) => {
-const errors = [];
-const warnings = [];
-const indices = /* @__PURE__ */ new Set();
-const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
-const targets = patterns.map(normalizePattern);
-for (let pi = 0; pi < patterns.length; pi++) {
-const rawPattern = patterns[pi];
-const pat = targets[pi];
-const matched = [];
-for (let i = 0; i < rules.length; i++) {
-const r = rules[i];
-if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
-if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
-}
-if (matched.length === 0) {
-errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
-continue;
-}
-if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
-matched.forEach((i) => {
-indices.add(i);
-});
-}
-return {
-errors,
-indices,
-warnings
-};
-};
-const resolveSelectorToRuleIndices = (options, selector) => {
-const rules = options.rules ?? [];
-if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
-if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
-return resolvePatternsSelector(rules, selector.patterns, selector.match);
-};
-const longestCommonSuffixLength = (a, b) => {
-const max = Math.min(a.length, b.length);
-let i = 0;
-while (i < max) {
-if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
-i++;
-}
-return i;
-};
-const AMBIGUITY_SCORE_GAP = 5;
-const scoreCandidate = (orig, fixed, normalizeMode) => {
-if (fixed.content === orig.content) return {
-fixedIndex: -1,
-kind: "exact",
-score: 100
-};
-if (fixed.content.endsWith(orig.content)) {
-const markerLen = fixed.content.length - orig.content.length;
-return {
-fixedIndex: -1,
-kind: "exact_suffix",
-score: 90 + Math.min(30, markerLen)
-};
-}
-if (normalizeMode !== "none") {
-const normFixed = normalizeForCompare(fixed.content, normalizeMode);
-const normOrig = normalizeForCompare(orig.content, normalizeMode);
-if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
-const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
-return {
-fixedIndex: -1,
-kind: "normalized_suffix",
-score: 70 + Math.floor(overlap * 20)
-};
-}
-}
-return null;
-};
-const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
-const warnings = [...reportBase.warnings];
-warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
-const details = segments.map((s, i) => {
-const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
-return {
-from: s.from,
-notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
-originalStartPreview: preview(s.content),
-segmentIndex: i,
-status,
-strategy: "none",
-to: s.to
-};
-});
-return {
-report: {
-...reportBase,
-details,
-summary: {
-mode,
-recovered: 0,
-totalSegments: segments.length,
-unchanged: segments.length,
-unresolved: selectorErrors.length ? segments.length : 0
-},
-warnings
-},
-segments
-};
-};
-const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
-const recoveredAtIndex = /* @__PURE__ */ new Map();
-const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
-if (mode !== "best_effort_then_rerun") return {
-recoveredAtIndex,
-recoveredDetailAtIndex
-};
-const pageIdToIndex = buildPageIdToIndex(pages);
-const pageJoiner = options.pageJoiner ?? "space";
-const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
-for (let i = 0; i < segments.length; i++) {
-const orig = segments[i];
-const r = tryBestEffortRecoverOneSegment(orig, pages, pageIdToIndex, compiledMistaken, pageJoiner);
-if (r.kind !== "recovered") continue;
-const seg = {
-...orig,
-content: r.recoveredContent
-};
-recoveredAtIndex.set(i, seg);
-recoveredDetailAtIndex.set(i, {
-from: orig.from,
-originalStartPreview: preview(orig.content),
-recoveredPrefixPreview: preview(r.recoveredPrefix),
-recoveredStartPreview: preview(seg.content),
-segmentIndex: i,
-status: "recovered",
-strategy: "stage1",
-to: orig.to
-});
-}
-return {
-recoveredAtIndex,
-recoveredDetailAtIndex
-};
-};
-const buildFixedBuckets = (fixedSegments) => {
-const buckets = /* @__PURE__ */ new Map();
-for (let i = 0; i < fixedSegments.length; i++) {
-const k = segmentRangeKey(fixedSegments[i]);
-const arr = buckets.get(k);
-if (!arr) buckets.set(k, [i]);
-else arr.push(i);
-}
-return buckets;
-};
-const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
-let best = null;
-let secondBestScore = -Infinity;
-for (const fixedIdx of candidates) {
-if (usedFixed.has(fixedIdx)) continue;
-const fixed = fixedSegments[fixedIdx];
-const scored = scoreCandidate(orig, fixed, normalizeCompare);
-if (!scored) continue;
-const candidateScore = scored.score;
-if (!best || candidateScore > best.score) {
-secondBestScore = best?.score ?? -Infinity;
|
|
3998
|
-
best = {
|
|
3999
|
-
fixedIdx,
|
|
4000
|
-
score: candidateScore
|
|
4001
|
-
};
|
|
4002
|
-
} else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
|
|
4003
|
-
}
|
|
4004
|
-
if (!best) return { kind: "none" };
|
|
4005
|
-
if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
|
|
4006
|
-
return {
|
|
4007
|
-
fixedIdx: best.fixedIdx,
|
|
4008
|
-
kind: "match"
|
|
4009
|
-
};
|
|
4010
|
-
};
|
|
4011
|
-
const detailUnresolved = (orig, segmentIndex, notes) => ({
|
|
4012
|
-
from: orig.from,
|
|
4013
|
-
notes,
|
|
4014
|
-
originalStartPreview: preview(orig.content),
|
|
4015
|
-
segmentIndex,
|
|
4016
|
-
status: "unresolved_alignment",
|
|
4017
|
-
strategy: "rerun",
|
|
4018
|
-
to: orig.to
|
|
4019
|
-
});
|
|
4020
|
-
const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
|
|
4021
|
-
from: orig.from,
|
|
4022
|
-
notes,
|
|
4023
|
-
originalStartPreview: preview(orig.content),
|
|
4024
|
-
segmentIndex,
|
|
4025
|
-
status: "skipped_idempotent",
|
|
4026
|
-
strategy: "rerun",
|
|
4027
|
-
to: orig.to
|
|
4028
|
-
});
|
|
4029
|
-
const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
|
|
4030
|
-
let recoveredPrefixPreview;
|
|
4031
|
-
if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
|
|
4032
|
-
return {
|
|
4033
|
-
from: orig.from,
|
|
4034
|
-
originalStartPreview: preview(orig.content),
|
|
4035
|
-
recoveredPrefixPreview,
|
|
4036
|
-
recoveredStartPreview: preview(fixed.content),
|
|
4037
|
-
segmentIndex,
|
|
4038
|
-
status: "recovered",
|
|
4039
|
-
strategy: "rerun",
|
|
4040
|
-
to: orig.to
|
|
4041
|
-
};
|
|
4042
|
-
};
|
|
4043
|
-
const mergeWithRerun = (params) => {
|
|
4044
|
-
const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
|
|
4045
|
-
const usedFixed = /* @__PURE__ */ new Set();
|
|
4046
|
-
const out = [];
|
|
4047
|
-
const details = [];
|
|
4048
|
-
let recovered = 0;
|
|
4049
|
-
let unresolved = 0;
|
|
4050
|
-
let unchanged = 0;
|
|
4051
|
-
for (let i = 0; i < originalSegments.length; i++) {
|
|
4052
|
-
const stage1Recovered = stage1RecoveredAtIndex.get(i);
|
|
4053
|
-
if (stage1Recovered) {
|
|
4054
|
-
out.push(stage1Recovered);
|
|
4055
|
-
recovered++;
|
|
4056
|
-
details.push(recoveredDetailAtIndex.get(i) ?? {
|
|
4057
|
-
from: stage1Recovered.from,
|
|
4058
|
-
originalStartPreview: preview(originalSegments[i].content),
|
|
4059
|
-
recoveredStartPreview: preview(stage1Recovered.content),
|
|
4060
|
-
segmentIndex: i,
|
|
4061
|
-
status: "recovered",
|
|
4062
|
-
strategy: "stage1",
|
|
4063
|
-
to: stage1Recovered.to
|
|
4064
|
-
});
|
|
4065
|
-
continue;
|
|
4066
|
-
}
|
|
4067
|
-
const orig = originalSegments[i];
|
|
4068
|
-
const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
|
|
4069
|
-
if (best.kind === "none") {
|
|
4070
|
-
out.push(orig);
|
|
4071
|
-
unresolved++;
|
|
4072
|
-
details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
|
|
4073
|
-
continue;
|
|
4074
|
-
}
|
|
4075
|
-
if (best.kind === "ambiguous") {
|
|
4076
|
-
out.push(orig);
|
|
4077
|
-
unresolved++;
|
|
4078
|
-
details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
|
|
4079
|
-
continue;
|
|
4080
|
-
}
|
|
4081
|
-
usedFixed.add(best.fixedIdx);
|
|
4082
|
-
const fixed = fixedSegments[best.fixedIdx];
|
|
4083
|
-
if (fixed.content === orig.content) {
|
|
4084
|
-
out.push(orig);
|
|
4085
|
-
unchanged++;
|
|
4086
|
-
details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
|
|
4087
|
-
continue;
|
|
4088
|
-
}
|
|
4089
|
-
out.push({
|
|
4090
|
-
...orig,
|
|
4091
|
-
content: fixed.content
|
|
4092
|
-
});
|
|
4093
|
-
recovered++;
|
|
4094
|
-
details.push(detailRecoveredRerun(orig, fixed, i));
|
|
4095
|
-
}
|
|
4096
|
-
return {
|
|
4097
|
-
details,
|
|
4098
|
-
segments: out,
|
|
4099
|
-
summary: {
|
|
4100
|
-
recovered,
|
|
4101
|
-
unchanged,
|
|
4102
|
-
unresolved
|
|
4103
|
-
}
|
|
4104
|
-
};
|
|
4105
|
-
};
|
|
4106
|
-
function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
|
|
4107
|
-
const mode = opts?.mode ?? "rerun_only";
|
|
4108
|
-
const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
|
|
4109
|
-
const resolved = resolveSelectorToRuleIndices(options, selector);
|
|
4110
|
-
const reportBase = {
|
|
4111
|
-
byRun: void 0,
|
|
4112
|
-
errors: resolved.errors,
|
|
4113
|
-
warnings: resolved.warnings
|
|
4114
|
-
};
|
|
4115
|
-
if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
|
|
4116
|
-
const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
|
|
4117
|
-
const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
|
|
4118
|
-
const merged = mergeWithRerun({
|
|
4119
|
-
fixedBuckets: buildFixedBuckets(fixedSegments),
|
|
4120
|
-
fixedSegments,
|
|
4121
|
-
normalizeCompare,
|
|
4122
|
-
originalSegments: segments,
|
|
4123
|
-
recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
|
|
4124
|
-
stage1RecoveredAtIndex: stage1.recoveredAtIndex
|
|
4125
|
-
});
|
|
4126
|
-
return {
|
|
4127
|
-
report: {
|
|
4128
|
-
...reportBase,
|
|
4129
|
-
details: merged.details,
|
|
4130
|
-
summary: {
|
|
4131
|
-
mode,
|
|
4132
|
-
recovered: merged.summary.recovered,
|
|
4133
|
-
totalSegments: segments.length,
|
|
4134
|
-
unchanged: merged.summary.unchanged,
|
|
4135
|
-
unresolved: merged.summary.unresolved
|
|
4136
|
-
}
|
|
4137
|
-
},
|
|
4138
|
-
segments: merged.segments
|
|
4139
|
-
};
|
|
4140
|
-
}
|
|
4141
|
-
function recoverMistakenMarkersForRuns(runs, opts) {
|
|
4142
|
-
const allSegments = [];
|
|
4143
|
-
const byRun = [];
|
|
4144
|
-
const details = [];
|
|
4145
|
-
const warnings = [];
|
|
4146
|
-
const errors = [];
|
|
4147
|
-
let recovered = 0;
|
|
4148
|
-
let unchanged = 0;
|
|
4149
|
-
let unresolved = 0;
|
|
4150
|
-
let offset = 0;
|
|
4151
|
-
for (let i = 0; i < runs.length; i++) {
|
|
4152
|
-
const run = runs[i];
|
|
4153
|
-
const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
|
|
4154
|
-
allSegments.push(...res.segments);
|
|
4155
|
-
for (const d of res.report.details) details.push({
|
|
4156
|
-
...d,
|
|
4157
|
-
segmentIndex: d.segmentIndex + offset
|
|
4158
|
-
});
|
|
4159
|
-
offset += run.segments.length;
|
|
4160
|
-
recovered += res.report.summary.recovered;
|
|
4161
|
-
unchanged += res.report.summary.unchanged;
|
|
4162
|
-
unresolved += res.report.summary.unresolved;
|
|
4163
|
-
warnings.push(...res.report.warnings);
|
|
4164
|
-
errors.push(...res.report.errors);
|
|
4165
|
-
byRun.push({
|
|
4166
|
-
recovered: res.report.summary.recovered,
|
|
4167
|
-
runIndex: i,
|
|
4168
|
-
totalSegments: run.segments.length,
|
|
4169
|
-
unresolved: res.report.summary.unresolved
|
|
4170
|
-
});
|
|
4171
|
-
}
|
|
4172
|
-
return {
|
|
4173
|
-
report: {
|
|
4174
|
-
byRun,
|
|
4175
|
-
details,
|
|
4176
|
-
errors,
|
|
4177
|
-
summary: {
|
|
4178
|
-
mode: opts?.mode ?? "rerun_only",
|
|
4179
|
-
recovered,
|
|
4180
|
-
totalSegments: offset,
|
|
4181
|
-
unchanged,
|
|
4182
|
-
unresolved
|
|
4183
|
-
},
|
|
4184
|
-
warnings
|
|
4185
|
-
},
|
|
4186
|
-
segments: allSegments
|
|
4187
|
-
};
|
|
4188
|
-
}
|
|
4189
|
-
|
|
4190
|
-
//#endregion
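
The removed recovery API above is selector-driven. Going by the destructuring in recoverMistakenMarkersForRuns and the branches in resolveSelectorToRuleIndices, a call would have looked roughly like the sketch below; pages, oldSegments, and splitOptions are hypothetical placeholders, and the function is no longer exported as of 2.19.0.

// Hypothetical usage reconstructed from the removed code above; not a documented API.
// Each run carries its own pages, previously produced segments, split options, and a
// selector of type "rule_indices", "predicate", or "patterns".
const { segments, report } = recoverMistakenMarkersForRuns(
  [{
    pages,                         // placeholder: the source pages of this run
    segments: oldSegments,         // placeholder: segments from an earlier segmentPages() call
    options: splitOptions,         // placeholder: the options that produced them
    selector: {
      type: "patterns",
      patterns: ["({{harf}}): "],  // compared against each rule's lineStartsAfter entries
      match: "normalized"          // optional; anything else is treated as "exact"
    }
  }],
  { mode: "best_effort_then_rerun", normalizeCompare: "whitespace" }
);
console.log(report.summary); // { mode, recovered, totalSegments, unchanged, unresolved }
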
-//#region src/segmentation/pattern-validator.ts
-const KNOWN_TOKENS = new Set(getAvailableTokens());
-const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
-const buildBareTokenRegex = () => {
-  const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
-  return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
-};
-/**
- * Validates a single pattern for common issues.
- */
-const validatePattern = (pattern, seenPatterns) => {
-  if (!pattern.trim()) return {
-    message: "Empty pattern is not allowed",
-    type: "empty_pattern"
-  };
-  if (seenPatterns.has(pattern)) return {
-    message: `Duplicate pattern: "${pattern}"`,
-    pattern,
-    type: "duplicate"
-  };
-  seenPatterns.add(pattern);
-  TOKEN_INSIDE_BRACES.lastIndex = 0;
-  for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
-    const name = match[1];
-    if (!KNOWN_TOKENS.has(name)) return {
-      message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
-      suggestion: "Check spelling or use a known token",
-      token: name,
-      type: "unknown_token"
-    };
-  }
-  for (const match of pattern.matchAll(buildBareTokenRegex())) {
-    const [full, name] = match;
-    const idx = match.index;
-    if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
-      message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
-      suggestion: `{{${full}}}`,
-      token: name,
-      type: "missing_braces"
-    };
-  }
-};
-/**
- * Validates an array of patterns, returning parallel array of issues.
- */
-const validatePatternArray = (patterns) => {
-  const seen = /* @__PURE__ */ new Set();
-  const issues = patterns.map((p) => validatePattern(p, seen));
-  return issues.some(Boolean) ? issues : void 0;
-};
-/**
- * Validates split rules for common pattern issues.
- *
- * Checks for:
- * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
- * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
- * - Duplicate patterns within the same rule
- *
- * @param rules - Array of split rules to validate
- * @returns Array parallel to input with validation results (undefined if no issues)
- *
- * @example
- * const issues = validateRules([
- *   { lineStartsAfter: ['raqms:num'] }, // Missing braces
- *   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
- * ]);
- * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
- * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
- */
-const validateRules = (rules) => rules.map((rule) => {
-  const result = {};
-  let hasIssues = false;
-  for (const key of [
-    "lineStartsWith",
-    "lineStartsAfter",
-    "lineEndsWith"
-  ]) if (key in rule && rule[key]) {
-    const issues = validatePatternArray(rule[key]);
-    if (issues) {
-      result[key] = issues;
-      hasIssues = true;
-    }
-  }
-  if ("template" in rule && rule.template !== void 0) {
-    const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
-    if (issue) {
-      result.template = issue;
-      hasIssues = true;
-    }
-  }
-  return hasIssues ? result : void 0;
-});
-/**
- * Formats a validation result array into a list of human-readable error messages.
- *
- * Useful for displaying validation errors in UIs.
- *
- * @param results - The result array from `validateRules()`
- * @returns Array of formatted error strings
- *
- * @example
- * const issues = validateRules(rules);
- * const errors = formatValidationReport(issues);
- * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
- */
-const formatValidationReport = (results) => results.flatMap((result, i) => {
-  if (!result) return [];
-  return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => {
-    if (!issue) return null;
-    const loc = `Rule ${i + 1}, ${type}`;
-    if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
-    if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
-    if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
-    return `${loc}: ${issue.message || issue.type}`;
-  })).filter((msg) => msg !== null);
-});
-
-//#endregion
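
The two entry points of this region survive the release (both reappear in the 2.19.0 export list at the bottom of this diff), so the JSDoc examples above still compose; a minimal sketch:

// Minimal sketch combining the JSDoc examples above; both functions are
// still exported by flappa-doormal 2.19.0.
import { formatValidationReport, validateRules } from "flappa-doormal";

const issues = validateRules([
  { lineStartsAfter: ["raqms:num"] },  // missing {{}} around a known token
  { lineStartsWith: ["{{unknown}}"] }  // unknown token name
]);
if (issues.some(Boolean)) {
  for (const message of formatValidationReport(issues)) console.error(message);
  // → Rule 1, lineStartsAfter: Missing {{}} around token "raqms"
  // → Rule 2, lineStartsWith: Unknown token "{{unknown}}"
}
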
-//#region src/validation/validation-constants.ts
-/**
- * Validation-specific constants
- */
-/**
- * Limit for validation issue preview length (characters).
- */
-const PREVIEW_LIMIT = 140;
-/**
- * Threshold for short segment content (characters).
- * Segments shorter than this will trigger a full-document search fallback
- * if not found in the expected window.
- */
-const FULL_SEARCH_THRESHOLD = 500;
-
 //#endregion
 //#region src/validation/validate-segments.ts
 /**
@@ -4329,8 +4217,8 @@ const FULL_SEARCH_THRESHOLD = 500;
  */
 const buildPreview = (text) => {
   const normalized = text.replace(/\s+/g, " ").trim();
-  if (normalized.length <= PREVIEW_LIMIT) return normalized;
-  return `${normalized.slice(0, PREVIEW_LIMIT)}...`;
+  if (normalized.length <= 140) return normalized;
+  return `${normalized.slice(0, 140)}...`;
 };
 /**
  * Creates a lightweight snapshot of a segment for inclusion in validation checks.
@@ -4358,19 +4246,18 @@ const normalizePages = (pages, options) => {
  */
 const buildJoinedContent = (pages, joiner) => {
   const boundaries = [];
-  const
-  const joined = nonEmptyPages.map((p) => p.content).join(joiner);
+  const joined = pages.map((p) => p.content).join(joiner);
   let offset = 0;
-  for (let i = 0; i < nonEmptyPages.length; i++) {
-    const content = nonEmptyPages[i].content;
+  for (let i = 0; i < pages.length; i++) {
+    const content = pages[i].content;
     const start = offset;
-    const end = start + content.length
+    const end = start + content.length;
     boundaries.push({
       end,
-      id: nonEmptyPages[i].id,
+      id: pages[i].id,
       start
     });
-    offset
+    offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
   }
   return {
     boundaries,
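
The rewritten loop advances offset past the joiner for every page except the last, so each boundary's [start, end) range indexes directly into the joined string. A worked example of that arithmetic (illustrative only; the boundary shape matches the push above):

// Two pages joined with a single space: joined = "abc defg" (length 8).
const pages = [{ id: 1, content: "abc" }, { id: 2, content: "defg" }];
const joiner = " ";
const boundaries = [];
let offset = 0;
for (let i = 0; i < pages.length; i++) {
  const start = offset;
  const end = start + pages[i].content.length;
  boundaries.push({ end, id: pages[i].id, start });
  offset += pages[i].content.length + (i < pages.length - 1 ? joiner.length : 0);
}
// boundaries → [{ end: 3, id: 1, start: 0 }, { end: 8, id: 2, start: 4 }]
// "abc defg".slice(4, 8) === "defg"
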
@@ -4561,7 +4448,7 @@ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, search
   const bufferSize = 1e3;
   const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
   if (rawMatches.length === 0) {
-    const threshold = validationOptions?.fullSearchThreshold ?? FULL_SEARCH_THRESHOLD;
+    const threshold = validationOptions?.fullSearchThreshold ?? 500;
     if (content.length < threshold) {
       const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
       const validMatch = fullMatches.find((m) => {
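
This hunk only inlines the constant: segments shorter than the threshold still get one whole-document retry after the windowed search misses, and callers can still override the cutoff through validationOptions. A hedged sketch using the validateSegments signature from the next hunk; pages, options, and segments are assumed to come from an earlier segmentPages() run:

// Assumption: pages/options/segments come from a prior segmentPages() call.
// Raising fullSearchThreshold lets segments up to 800 characters fall back
// to a full-document search instead of the default 500.
const result = validateSegments(pages, options, segments, { fullSearchThreshold: 800 });
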
@@ -4715,7 +4602,7 @@ const validateSegments = (pages, options, segments, validationOptions) => {
     }
   };
 };
-
 //#endregion
-export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive,
+export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+
 //# sourceMappingURL=index.mjs.map