flappa-doormal 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +407 -205
- package/dist/index.d.mts +722 -332
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1363 -396
- package/dist/index.mjs.map +1 -1
- package/package.json +11 -9
package/dist/index.mjs
CHANGED
@@ -1,517 +1,1484 @@
-
-
-
+//#region src/segmentation/fuzzy.ts
+/**
+ * Fuzzy matching utilities for Arabic text.
+ *
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
+ * This allows matching text regardless of:
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
+ *
+ * @module fuzzy
+ *
+ * @example
+ * // Make a pattern diacritic-insensitive
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
+ */
+/**
+ * Character class matching all Arabic diacritics (Tashkeel/Harakat).
+ *
+ * Includes the following diacritical marks:
+ * - U+064B: ً (fathatan - double fatha)
+ * - U+064C: ٌ (dammatan - double damma)
+ * - U+064D: ٍ (kasratan - double kasra)
+ * - U+064E: َ (fatha - short a)
+ * - U+064F: ُ (damma - short u)
+ * - U+0650: ِ (kasra - short i)
+ * - U+0651: ّ (shadda - gemination)
+ * - U+0652: ْ (sukun - no vowel)
+ *
+ * @internal
+ */
+const DIACRITICS_CLASS = "[ًٌٍَُِّْ]";
 /**
- *
+ * Groups of equivalent Arabic characters.
+ *
+ * Characters within the same group are considered equivalent for matching purposes.
+ * This handles common variations in Arabic text where different characters are
+ * used interchangeably or have the same underlying meaning.
+ *
+ * Equivalence groups:
+ * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
+ * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
+ * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
+ *
+ * @internal
 */
-const
+const EQUIV_GROUPS = [
+	[
+		"ا",
+		"آ",
+		"أ",
+		"إ"
+	],
+	["ة", "ه"],
+	["ى", "ي"]
+];
 /**
- *
+ * Escapes a string for safe inclusion in a regular expression.
+ *
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('[test]') // → '\\[test\\]'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
 */
-const
+const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 /**
- *
+ * Returns a regex character class for all equivalents of a given character.
+ *
+ * If the character belongs to one of the predefined equivalence groups
+ * (e.g., ا/آ/أ/إ), the returned class will match any member of that group.
+ * Otherwise, the original character is simply escaped for safe regex inclusion.
+ *
+ * @param ch - A single character to expand into its equivalence class
+ * @returns A RegExp-safe string representing the character and its equivalents
+ *
+ * @example
+ * getEquivClass('ا') // → '[اآأإ]' (matches any alef variant)
+ * getEquivClass('ب') // → 'ب' (no equivalents, just escaped)
+ * getEquivClass('.') // → '\\.' (regex metachar escaped)
+ *
+ * @internal
 */
-const
+const getEquivClass = (ch) => {
+	for (const group of EQUIV_GROUPS) if (group.includes(ch)) return `[${group.map((c) => escapeRegex(c)).join("")}]`;
+	return escapeRegex(ch);
+};
 /**
- *
+ * Performs light normalization on Arabic text for consistent matching.
+ *
+ * Normalization steps:
+ * 1. NFC normalization (canonical decomposition then composition)
+ * 2. Remove Zero-Width Joiner (U+200D) and Zero-Width Non-Joiner (U+200C)
+ * 3. Collapse multiple whitespace characters to single space
+ * 4. Trim leading and trailing whitespace
+ *
+ * This normalization preserves diacritics and letter forms while removing
+ * invisible characters that could interfere with matching.
+ *
+ * @param str - Arabic text to normalize
+ * @returns Normalized string
+ *
+ * @example
+ * normalizeArabicLight('حَدَّثَنَا') // → 'حَدَّثَنَا' (diacritics preserved)
+ * normalizeArabicLight('بسم الله') // → 'بسم الله' (spaces collapsed)
+ * normalizeArabicLight(' text ') // → 'text' (trimmed)
+ *
+ * @internal
 */
-const
-"
-"latin": "\\d+"
+const normalizeArabicLight = (str) => {
+	return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
 };
 /**
- *
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
+ *
+ * Transforms input text into a regex pattern that matches the text regardless
+ * of diacritical marks (harakat) and character variations. Each character in
+ * the input is:
+ * 1. Expanded to its equivalence class (if applicable)
+ * 2. Followed by an optional diacritics matcher
+ *
+ * This allows matching:
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
+ * - `الإيمان` with `الايمان` (alef variants)
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
+ *
+ * @param text - Input Arabic text to make diacritic-insensitive
+ * @returns Regex pattern string that matches the text with or without diacritics
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * // Each char gets equivalence class + optional diacritics
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('باب');
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
+ * new RegExp(pattern, 'u').test('باب') // → true
+ *
+ * @example
+ * // Using with split rules
+ * {
+ *   lineStartsWith: ['باب'],
+ *   split: 'at',
+ *   fuzzy: true // Applies makeDiacriticInsensitive internally
+ * }
 */
-const
-
-
-
-"none": "",
-"paren": "\\)"
+const makeDiacriticInsensitive = (text) => {
+	const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
+	const norm = normalizeArabicLight(text);
+	return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
+};

 //#endregion
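To show the new fuzzy API end to end — a minimal usage sketch, assuming `makeDiacriticInsensitive` is exported from the package root (the diff shows only the bundle, not the export list):

    import { makeDiacriticInsensitive } from 'flappa-doormal';

    // A pattern built from bare text matches vocalized and variant spellings alike.
    const pattern = makeDiacriticInsensitive('باب');
    new RegExp(pattern, 'u').test('بَابٌ'); // true — diacritics become optional
    new RegExp(pattern, 'u').test('باب');  // true — bare text still matches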
-//#region src/
+//#region src/segmentation/breakpoint-utils.ts
 /**
- *
- *
+ * Normalizes a breakpoint to the object form.
+ * Strings are converted to { pattern: str } with no constraints.
+ *
+ * @param bp - Breakpoint as string or object
+ * @returns Normalized BreakpointRule object
+ *
+ * @example
+ * normalizeBreakpoint('\\n\\n')
+ * // → { pattern: '\\n\\n' }
+ *
+ * normalizeBreakpoint({ pattern: '\\n', min: 10 })
+ * // → { pattern: '\\n', min: 10 }
 */
+const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
 /**
- *
- *
+ * Checks if a page ID is in an excluded list (single pages or ranges).
+ *
+ * @param pageId - Page ID to check
+ * @param excludeList - List of page IDs or [from, to] ranges to exclude
+ * @returns True if page is excluded
+ *
+ * @example
+ * isPageExcluded(5, [1, 5, 10])
+ * // → true
+ *
+ * isPageExcluded(5, [[3, 7]])
+ * // → true
+ *
+ * isPageExcluded(5, [[10, 20]])
+ * // → false
 */
-const
-
-"
-
-
-
-
-
-
+const isPageExcluded = (pageId, excludeList) => {
+	if (!excludeList || excludeList.length === 0) return false;
+	for (const item of excludeList) if (typeof item === "number") {
+		if (pageId === item) return true;
+	} else {
+		const [from, to] = item;
+		if (pageId >= from && pageId <= to) return true;
+	}
+	return false;
+};
 /**
- *
- *
+ * Checks if a page ID is within a breakpoint's min/max range and not excluded.
+ *
+ * @param pageId - Page ID to check
+ * @param rule - Breakpoint rule with optional min/max/exclude constraints
+ * @returns True if page is within valid range
+ *
+ * @example
+ * isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
+ * // → true
+ *
+ * isInBreakpointRange(5, { pattern: '\\n', min: 10 })
+ * // → false (below min)
 */
-const
-
-
-
-
-
-//#endregion
-//#region src/markers/tokens.ts
+const isInBreakpointRange = (pageId, rule) => {
+	if (rule.min !== void 0 && pageId < rule.min) return false;
+	if (rule.max !== void 0 && pageId > rule.max) return false;
+	return !isPageExcluded(pageId, rule.exclude);
+};
 /**
- *
- *
+ * Builds an exclude set from a PageRange array for O(1) lookups.
+ *
+ * @param excludeList - List of page IDs or [from, to] ranges
+ * @returns Set of all excluded page IDs
+ *
+ * @remarks
+ * This expands ranges into explicit page IDs for fast membership checks. For typical
+ * book-scale inputs (thousands of pages), this is small and keeps downstream logic
+ * simple and fast. If you expect extremely large ranges (e.g., millions of pages),
+ * consider avoiding broad excludes or introducing a range-based membership structure.
+ *
+ * @example
+ * buildExcludeSet([1, 5, [10, 12]])
+ * // → Set { 1, 5, 10, 11, 12 }
 */
+const buildExcludeSet = (excludeList) => {
+	const excludeSet = /* @__PURE__ */ new Set();
+	for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
+	else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
+	return excludeSet;
+};
 /**
- *
- *
+ * Creates a segment with optional to and meta fields.
+ * Returns null if content is empty after trimming.
+ *
+ * @param content - Segment content
+ * @param fromPageId - Starting page ID
+ * @param toPageId - Optional ending page ID (omitted if same as from)
+ * @param meta - Optional metadata to attach
+ * @returns Segment object or null if empty
+ *
+ * @example
+ * createSegment('Hello world', 1, 3, { chapter: 1 })
+ * // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
+ *
+ * createSegment(' ', 1, undefined, undefined)
+ * // → null (empty content)
 */
-const
-
-
-
-
-
-
-
-
-paren: "\\)",
-s: "\\s?",
-slash: "/",
-space: "\\s+"
+const createSegment = (content, fromPageId, toPageId, meta) => {
+	const trimmed = content.trim();
+	if (!trimmed) return null;
+	const seg = {
+		content: trimmed,
+		from: fromPageId
+	};
+	if (toPageId !== void 0 && toPageId !== fromPageId) seg.to = toPageId;
+	if (meta) seg.meta = meta;
+	return seg;
 };
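Taken together, the exclude helpers above accept mixed lists of single page IDs and `[from, to]` ranges. A small sketch of the semantics, restating the JSDoc examples (these helpers are internal to the bundle, shown only to illustrate the behavior):

    isPageExcluded(5, [1, 5, 10]);     // true  — listed directly
    isPageExcluded(5, [[3, 7]]);       // true  — falls inside the 3–7 range
    buildExcludeSet([1, 5, [10, 12]]); // Set { 1, 5, 10, 11, 12 }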
-
-
-
-
- *
- *
- *
- *
- *
- *
- *
- * @returns Regex pattern string with named groups
- *
- * @example
- * expandTemplate('{num} {dash}')
- * // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
- */
-function expandTemplate(template, options) {
-	const tokenMap = options?.tokens || TOKENS;
-	let expandedMarker = template;
-	for (const [token, pattern] of Object.entries(tokenMap)) {
-		const placeholder = `{${token}}`;
-		expandedMarker = expandedMarker.replaceAll(placeholder, pattern);
-	}
-	return String.raw`^(?<full>(?<marker>${expandedMarker})(?<content>[\s\S]*))`;
-}
-/**
- * Create a custom token map by extending the base tokens.
- *
- * @param customTokens - Custom token definitions
- * @returns Combined token map
- *
- * @example
- * const myTokens = createTokenMap({
- *   verse: '\\[[\\u0660-\\u0669]+\\]',
- *   tafsir: 'تفسير'
- * });
+/**
+ * Expands breakpoint patterns and pre-computes exclude sets.
+ *
+ * @param breakpoints - Array of breakpoint patterns or rules
+ * @param processPattern - Function to expand tokens in patterns
+ * @returns Array of expanded breakpoints with compiled regexes
+ *
+ * @remarks
+ * This function compiles regex patterns dynamically. This can be a ReDoS vector
+ * if patterns come from untrusted sources. In typical usage, breakpoint rules
+ * are application configuration, not user input.
 */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- * validateTemplate('{invalid}')
- * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
- */
-function validateTemplate(template, tokens = TOKENS) {
-	const unknownTokens = (template.match(/\{(\w+)\}/g) || []).map((t) => t.slice(1, -1)).filter((name) => !tokens[name]);
-	if (unknownTokens.length > 0) return {
-		valid: false,
-		errors: [`Unknown tokens: ${unknownTokens.map((t) => `{${t}}`).join(", ")}`, `Available tokens: ${Object.keys(tokens).map((t) => `{${t}}`).join(", ")}`]
+const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
+	const rule = normalizeBreakpoint(bp);
+	const excludeSet = buildExcludeSet(rule.exclude);
+	const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
+		const expandedSkip = processPattern$1(rule.skipWhen);
+		try {
+			return new RegExp(expandedSkip, "mu");
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error);
+			throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n  Cause: ${message}`);
+		}
+	})() : null;
+	if (rule.pattern === "") return {
+		excludeSet,
+		regex: null,
+		rule,
+		skipWhenRegex
 	};
-
-
+	const expanded = processPattern$1(rule.pattern);
+	try {
+		return {
+			excludeSet,
+			regex: new RegExp(expanded, "gmu"),
+			rule,
+			skipWhenRegex
+		};
+	} catch (error) {
+		const message = error instanceof Error ? error.message : String(error);
+		throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n  Cause: ${message}`);
+	}
+});
+/**
+ * Finds the actual ending page index by searching backwards for page content prefix.
+ * Used to determine which page a segment actually ends on based on content matching.
+ *
+ * @param pieceContent - Content of the segment piece
+ * @param currentFromIdx - Current starting index in pageIds
+ * @param toIdx - Maximum ending index to search
+ * @param pageIds - Array of page IDs
+ * @param normalizedPages - Map of page ID to normalized content
+ * @returns The actual ending page index
+ */
+const findActualEndPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
+	for (let pi = toIdx; pi > currentFromIdx; pi--) {
+		const pageData = normalizedPages.get(pageIds[pi]);
+		if (pageData) {
+			const checkPortion = pageData.content.slice(0, Math.min(30, pageData.length));
+			if (checkPortion.length > 0 && pieceContent.indexOf(checkPortion) > 0) return pi;
+		}
+	}
+	return currentFromIdx;
+};
+/**
+ * Finds the actual starting page index by searching forwards for page content prefix.
+ * Used to determine which page content actually starts from based on content matching.
+ *
+ * This is the counterpart to findActualEndPage - it searches forward to find which
+ * page the content starts on, rather than which page it ends on.
+ *
+ * @param pieceContent - Content of the segment piece
+ * @param currentFromIdx - Current starting index in pageIds
+ * @param toIdx - Maximum ending index to search
+ * @param pageIds - Array of page IDs
+ * @param normalizedPages - Map of page ID to normalized content
+ * @returns The actual starting page index
+ */
+const findActualStartPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
+	const trimmedPiece = pieceContent.trimStart();
+	if (!trimmedPiece) return currentFromIdx;
+	for (let pi = currentFromIdx; pi <= toIdx; pi++) {
+		const pageData = normalizedPages.get(pageIds[pi]);
+		if (pageData) {
+			const pagePrefix = pageData.content.slice(0, Math.min(30, pageData.length)).trim();
+			const piecePrefix = trimmedPiece.slice(0, Math.min(30, trimmedPiece.length));
+			if (pagePrefix.length > 0) {
+				if (trimmedPiece.startsWith(pagePrefix)) return pi;
+				if (pageData.content.trimStart().startsWith(piecePrefix)) return pi;
+			}
+		}
+	}
+	return currentFromIdx;
+};
+/**
+ * Checks if any page in a range is excluded by the given exclude set.
+ *
+ * @param excludeSet - Set of excluded page IDs
+ * @param pageIds - Array of page IDs
+ * @param fromIdx - Start index (inclusive)
+ * @param toIdx - End index (inclusive)
+ * @returns True if any page in range is excluded
+ */
+const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
+	if (excludeSet.size === 0) return false;
+	for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
+	return false;
+};
+/**
+ * Finds the position of the next page content within remaining content.
+ * Returns -1 if not found.
+ *
+ * @param remainingContent - Content to search in
+ * @param nextPageData - Normalized data for the next page
+ * @returns Position of next page content, or -1 if not found
+ */
+const findNextPagePosition = (remainingContent, nextPageData) => {
+	const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
+	if (searchPrefix.length === 0) return -1;
+	const pos = remainingContent.indexOf(searchPrefix);
+	return pos > 0 ? pos : -1;
+};
+/**
+ * Finds matches within a window and returns the selected position based on preference.
+ *
+ * @param windowContent - Content to search
+ * @param regex - Regex to match
+ * @param prefer - 'longer' for last match, 'shorter' for first match
+ * @returns Break position after the selected match, or -1 if no matches
+ */
+const findPatternBreakPosition = (windowContent, regex, prefer) => {
+	const matches = [];
+	for (const m of windowContent.matchAll(regex)) matches.push({
+		index: m.index,
+		length: m[0].length
+	});
+	if (matches.length === 0) return -1;
+	const selected = prefer === "longer" ? matches[matches.length - 1] : matches[0];
+	return selected.index + selected.length;
+};
+/**
+ * Tries to find a break position within the current window using breakpoint patterns.
+ * Returns the break position or -1 if no suitable break was found.
+ *
+ * @param remainingContent - Content remaining to be segmented
+ * @param currentFromIdx - Current starting page index
+ * @param toIdx - Ending page index
+ * @param windowEndIdx - Maximum window end index
+ * @param ctx - Breakpoint context with page data and patterns
+ * @returns Break position in the content, or -1 if no break found
+ */
+const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, ctx) => {
+	const { pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, prefer } = ctx;
+	for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
+		if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
+		if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
+		if (skipWhenRegex?.test(remainingContent)) continue;
+		if (regex === null) {
+			const nextPageIdx = windowEndIdx + 1;
+			if (nextPageIdx <= toIdx) {
+				const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
+				if (nextPageData) {
+					const pos = findNextPagePosition(remainingContent, nextPageData);
+					if (pos > 0) return pos;
+				}
+			}
+			return Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
+		}
+		const windowEndPosition = Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
+		const breakPos = findPatternBreakPosition(remainingContent.slice(0, windowEndPosition), regex, prefer);
+		if (breakPos > 0) return breakPos;
+	}
+	return -1;
+};

 //#endregion
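The shape `expandBreakpoints` consumes, as implied by `normalizeBreakpoint` and the empty-pattern branch above — a hedged sketch, not documented configuration:

    const breakpoints = [
      '\\n\\n',                                       // prefer paragraph breaks
      { pattern: '\\n', min: 10, exclude: [[3, 7]] }, // single newlines, from page 10, skipping pages 3–7
      ''                                              // empty pattern: fall back to the next page boundary
    ];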
-//#region src/
+//#region src/segmentation/match-utils.ts
 /**
- *
+ * Utility functions for regex matching and result processing.
 *
- *
- *
- *
+ * These functions were extracted from `segmenter.ts` to reduce complexity
+ * and enable independent testing. They handle match filtering, capture
+ * extraction, and occurrence-based selection.
 *
- * @
-
-
+ * @module match-utils
+ */
+/**
+ * Extracts named capture groups from a regex match.
+ *
+ * Only includes groups that are in the `captureNames` list and have
+ * defined values. This filters out positional captures and ensures
+ * only explicitly requested named captures are returned.
+ *
+ * @param groups - The `match.groups` object from `RegExp.exec()`
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+ * @returns Object with capture name → value pairs, or `undefined` if none found
 *
 * @example
- *
- *
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+ * extractNamedCaptures(match.groups, ['num'])
+ * // → { num: '٦٦٩٦' }
 *
 * @example
- * //
- *
+ * // No matching captures
+ * extractNamedCaptures({}, ['num'])
+ * // → undefined
 *
 * @example
- * //
- *
- *
- *   template: '{verse}',
- *   tokens: { verse: '\\[[0-9]+\\]' }
- * });
+ * // Undefined groups
+ * extractNamedCaptures(undefined, ['num'])
+ * // → undefined
 */
-
-	if (
-
-
-
-
-	if (!config.pattern) throw new Error("pattern marker must provide either a template or pattern");
-	return new RegExp(config.pattern, "u");
-}
+const extractNamedCaptures = (groups, captureNames) => {
+	if (!groups || captureNames.length === 0) return;
+	const namedCaptures = {};
+	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+};
 /**
- *
+ * Gets the last defined positional capture group from a match array.
 *
- *
- *
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
+ * is always at the end of the pattern. Named captures may shift the
+ * positional indices, so we iterate backward to find the actual content.
 *
- * @
+ * @param match - RegExp exec result array
+ * @returns The last defined capture group value, or `undefined` if none
 *
 * @example
- *
- *
- *
- * //
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+ * getLastPositionalCapture(match)
+ * // → 'content'
+ *
+ * @example
+ * // No captures
+ * getLastPositionalCapture(['full match'])
+ * // → undefined
 */
-
-
-
-
-}
+const getLastPositionalCapture = (match) => {
+	if (match.length <= 1) return;
+	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+};
 /**
- *
+ * Filters matches to only include those within page ID constraints.
 *
- *
- *
- * All phrases are made diacritic-insensitive.
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+ * matches that occur on pages outside the allowed range or explicitly excluded.
 *
- * @param
- * @
+ * @param matches - Array of match results to filter
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+ * @param getId - Function that returns the page ID for a given offset
+ * @returns Filtered array containing only matches within constraints
 *
 * @example
- *
- *
- *
+ * const matches = [
+ *   { start: 0, end: 10 },    // Page 1
+ *   { start: 100, end: 110 }, // Page 5
+ *   { start: 200, end: 210 }, // Page 10
+ * ];
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
+ */
+const filterByConstraints = (matches, rule, getId) => {
+	return matches.filter((m) => {
+		const id = getId(m.start);
+		if (rule.min !== void 0 && id < rule.min) return false;
+		if (rule.max !== void 0 && id > rule.max) return false;
+		if (isPageExcluded(id, rule.exclude)) return false;
+		return true;
+	});
+};
+/**
+ * Filters matches based on occurrence setting (first, last, or all).
+ *
+ * Applies occurrence-based selection to a list of matches:
+ * - `'all'` or `undefined`: Return all matches (default)
+ * - `'first'`: Return only the first match
+ * - `'last'`: Return only the last match
+ *
+ * @param matches - Array of match results to filter
+ * @param occurrence - Which occurrence(s) to keep
+ * @returns Filtered array based on occurrence setting
 *
 * @example
- *
- *
- *
- *
- *
+ * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
+ *
+ * filterByOccurrence(matches, 'first')
+ * // → [{ start: 0 }]
+ *
+ * filterByOccurrence(matches, 'last')
+ * // → [{ start: 20 }]
+ *
+ * filterByOccurrence(matches, 'all')
+ * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
+ *
+ * filterByOccurrence(matches, undefined)
+ * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
 */
-
-
-
-
-
+const filterByOccurrence = (matches, occurrence) => {
+	if (!matches.length) return [];
+	if (occurrence === "first") return [matches[0]];
+	if (occurrence === "last") return [matches[matches.length - 1]];
+	return matches;
+};
 /**
- *
+ * Checks if any rule in the list allows the given page ID.
+ *
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+ * Rules without constraints allow all page IDs.
+ *
+ * This is used to determine whether to create a segment for content
+ * that appears before any split points (the "first segment").
 *
- *
- * -
- *
- *
+ * @param rules - Array of rules with optional `min` and `max` constraints
+ * @param pageId - Page ID to check
+ * @returns `true` if at least one rule allows the page ID
+ *
+ * @example
+ * const rules = [
+ *   { min: 5, max: 10 }, // Allows pages 5-10
+ *   { min: 20 },         // Allows pages 20+
+ * ];
 *
- *
+ * anyRuleAllowsId(rules, 7)  // → true (first rule allows)
+ * anyRuleAllowsId(rules, 3)  // → false (no rule allows)
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
 *
 * @example
- *
- *
- * // match.groups.marker -> 'بسم الله'
+ * // Rules without constraints allow everything
+ * anyRuleAllowsId([{}], 999) // → true
 */
-
-
-
-
-
+const anyRuleAllowsId = (rules, pageId) => {
+	return rules.some((r) => {
+		const minOk = r.min === void 0 || pageId >= r.min;
+		const maxOk = r.max === void 0 || pageId <= r.max;
+		return minOk && maxOk;
+	});
+};
+
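How the two filters compose in practice — a sketch restating the JSDoc, where `getId` stands for whatever offset→page-ID mapping the caller supplies:

    const matches = [{ start: 0 }, { start: 100 }, { start: 200 }];
    const inRange = filterByConstraints(matches, { min: 3, max: 8 }, getId);
    const chosen = filterByOccurrence(inRange, 'first'); // keeps at most one match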
+//#endregion
+//#region src/segmentation/textUtils.ts
 /**
- *
+ * Strip all HTML tags from content, keeping only text.
 *
- *
- *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+const stripHtmlTags = (html) => {
+	return html.replace(/<[^>]*>/g, "");
+};
+/**
+ * Normalizes line endings to Unix-style (`\n`).
 *
- *
- *
- * @throws {Error} When `phrases` is undefined or empty
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
 *
- * @
- *
- *   type: 'phrase',
- *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
- * });
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
 */
-
-
-
-
-	return new RegExp(pattern, "u");
-}
+const normalizeLineEndings = (content) => content.replace(/\r\n?/g, "\n");
+
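Both helpers are one-liners; their behavior on mixed input, per the implementations above:

    stripHtmlTags('<p>نص <b>مهم</b></p>'); // → 'نص مهم'
    normalizeLineEndings('a\r\nb\rc');      // → 'a\nb\nc' (Windows and old-Mac endings)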
+//#endregion
+//#region src/segmentation/tokens.ts
 /**
- *
+ * Token-based template system for Arabic text pattern matching.
 *
- *
- *
- *
- * - ° [٦٥] - With degree prefix
+ * This module provides a human-readable way to define regex patterns using
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
+ * named capture groups for extracting matched values into metadata.
 *
- * @
+ * @module tokens
 *
 * @example
- *
- *
- * //
+ * // Simple token expansion
+ * expandTokens('{{raqms}} {{dash}}')
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
+ *
+ * @example
+ * // Named capture groups
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
 */
-function generateSquareBracketRegex() {
-	const markerPattern = String.raw`[•°]?\s?\[[\u0660-\u0669]+\]\s?`;
-	const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
-	return new RegExp(pattern, "u");
-}
 /**
- *
+ * Token definitions mapping human-readable token names to regex patterns.
 *
- *
- * -
- * -
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
 *
- * @
- *
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
 * @example
- *
- *
- *
- *
- *
+ * // Using tokens in a split rule
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ *
+ * @example
+ * // Using tokens with named captures
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
 */
-function generateNumLetterRegex(config) {
-	const numPattern = NUMBERING_PATTERNS[config.numbering];
-	const sepPattern = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
-	const markerPattern = String.raw`${numPattern} [أ-ي]\s?${sepPattern}`;
-	const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
-	return new RegExp(pattern, "u");
-}
 /**
- *
+ * Base token definitions mapping human-readable token names to regex patterns.
 *
- *
- *
- * - 5 (٦) - (number with parenthetical number)
+ * These tokens contain raw regex patterns and do not reference other tokens.
+ * For composite tokens that build on these, see `COMPOSITE_TOKENS`.
 *
- * @
-
+ * @internal
+ */
+const BASE_TOKENS = {
+	bab: "باب",
+	basmalah: "بسم الله|﷽",
+	bullet: "[•*°]",
+	dash: "[-–—ـ]",
+	fasl: "فصل|مسألة",
+	harf: "[أ-ي]",
+	kitab: "كتاب",
+	naql: "حدثنا|أخبرنا|حدثني|وحدثنا|أنبأنا|سمعت",
+	raqm: "[\\u0660-\\u0669]",
+	raqms: "[\\u0660-\\u0669]+",
+	tarqim: "[.!?؟؛]"
+};
+/**
+ * Composite token definitions using template syntax.
+ *
+ * These tokens reference base tokens using `{{token}}` syntax and are
+ * automatically expanded to their final regex patterns at module load time.
+ *
+ * This provides better abstraction - if base tokens change, composites
+ * automatically update on the next build.
+ *
+ * @internal
+ */
+const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
+/**
+ * Expands base tokens in a template string.
+ * Used internally to pre-expand composite tokens.
+ *
+ * @param template - Template string with `{{token}}` placeholders
+ * @returns Expanded pattern with base tokens replaced
+ * @internal
+ */
+const expandBaseTokens = (template) => {
+	return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
+		return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
+	});
+};
+/**
+ * Token definitions mapping human-readable token names to regex patterns.
+ *
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
+ *
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
 * @example
- *
- *
- *
- *
- *
+ * // Using tokens in a split rule
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ *
+ * @example
+ * // Using tokens with named captures
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+ *
+ * @example
+ * // Using the numbered convenience token
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
+ */
+const TOKEN_PATTERNS = {
+	...BASE_TOKENS,
+	...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
+};
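Since `COMPOSITE_TOKENS` is expanded at module load, the published map contains only raw regex strings. Following the base-token definitions above:

    TOKEN_PATTERNS.numbered; // '[\\u0660-\\u0669]+ [-–—ـ] ' — {{raqms}} and {{dash}} already substituted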
+/**
+ * Regex pattern for matching tokens with optional named capture syntax.
+ *
+ * Matches:
+ * - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
+ * - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
+ * - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
+ *
+ * @internal
 */
-
-	const numPattern = NUMBERING_PATTERNS[config.numbering];
-	const sepPattern = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
-	const markerPattern = String.raw`${numPattern}\s*\([\u0600-\u06FF\u0660-\u0669\s]+\)\s?${sepPattern}`;
-	const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
-	return new RegExp(pattern, "u");
-}
+const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
 /**
- *
+ * Regex pattern for simple token matching (no capture syntax).
+ *
+ * Matches only `{{token}}` format where token is one or more word characters.
+ * Used by `containsTokens()` for quick detection.
 *
- *
-
-
+ * @internal
+ */
+const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
+/**
+ * Checks if a query string contains template tokens.
 *
- *
+ * Performs a quick test for `{{token}}` patterns without actually
+ * expanding them. Useful for determining whether to apply token
+ * expansion to a string.
 *
- * @param
- * @returns
+ * @param query - String to check for tokens
+ * @returns `true` if the string contains at least one `{{token}}` pattern
 *
 * @example
- *
- *
- *
- * });
- * const match1 = regex.exec('٥/٦ - نص');
- * const match2 = regex.exec('٥ - نص'); // Also matches
+ * containsTokens('{{raqms}} {{dash}}') // → true
+ * containsTokens('plain text') // → false
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
 */
-
-
-
-
-	const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
-	return new RegExp(pattern, "u");
-}
+const containsTokens = (query) => {
+	SIMPLE_TOKEN_REGEX.lastIndex = 0;
+	return SIMPLE_TOKEN_REGEX.test(query);
+};
 /**
- *
+ * Expands template tokens with support for named captures.
 *
- *
- *
- *
+ * This is the primary token expansion function that handles all token syntax:
+ * - `{{token}}` → Expands to the token's pattern (no capture group)
+ * - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
+ * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
 *
- *
- * - Separator 'none' generates pattern without separator
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
 *
- * @param
- * @
+ * @param query - The template string containing tokens
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+ *   Applied to both token patterns and plain Arabic text between tokens.
+ *   Typically `makeDiacriticInsensitive` from the fuzzy module.
+ * @returns Object with expanded pattern, capture names, and capture flag
 *
 * @example
- * //
- *
- *
- *   separator: 'dash',
- *   format: '{bullet}+ {num} {dash}'
- * });
+ * // Simple token expansion
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
 *
 * @example
- * //
- *
- *
- *   separator: 'dash'
- * });
- * const match = regex.exec('٥ - نص');
+ * // Named capture
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
 *
 * @example
- * //
- *
- *
- *
- *
- *
+ * // Capture-only token
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
+ *
+ * @example
+ * // With fuzzy transform
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
 */
-
-
-
-
-
+const expandTokensWithCaptures = (query, fuzzyTransform) => {
+	const captureNames = [];
+	const segments = [];
+	let lastIndex = 0;
+	TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
+	let match;
+	while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
+		if (match.index > lastIndex) segments.push({
+			type: "text",
+			value: query.slice(lastIndex, match.index)
+		});
+		segments.push({
+			type: "token",
+			value: match[0]
+		});
+		lastIndex = match.index + match[0].length;
 	}
-
-
-
-
-	const
-
-
+	if (lastIndex < query.length) segments.push({
+		type: "text",
+		value: query.slice(lastIndex)
+	});
+	const processedParts = segments.map((segment) => {
+		if (segment.type === "text") {
+			if (fuzzyTransform && /[\u0600-\u06FF]/.test(segment.value)) return fuzzyTransform(segment.value);
+			return segment.value;
+		}
+		TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
+		const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(segment.value);
+		if (!tokenMatch) return segment.value;
+		const [, tokenName, captureName] = tokenMatch;
+		if (!tokenName && captureName) {
+			captureNames.push(captureName);
+			return `(?<${captureName}>.+)`;
+		}
+		let tokenPattern = TOKEN_PATTERNS[tokenName];
+		if (!tokenPattern) return segment.value;
+		if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
+		if (captureName) {
+			captureNames.push(captureName);
+			return `(?<${captureName}>${tokenPattern})`;
+		}
+		return tokenPattern;
+	});
+	return {
+		captureNames,
+		hasCaptures: captureNames.length > 0,
+		pattern: processedParts.join("")
+	};
+};
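A worked expansion combining all three capture syntaxes, following the JSDoc above (the capture names `num` and `rest` are arbitrary):

    expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:rest}}');
    // → {
    //     pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ] (?<rest>.+)',
    //     captureNames: ['num', 'rest'],
    //     hasCaptures: true
    //   }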
 /**
- *
+ * Expands template tokens in a query string to their regex equivalents.
 *
- *
- *
- * - * (asterisk)
- * - ° (degree)
- * - - (dash)
+ * This is the simple version without capture support. It returns only the
+ * expanded pattern string, not capture metadata.
 *
- *
+ * Unknown tokens are left as-is, allowing for partial templates.
+ *
+ * @param query - Template string containing `{{token}}` placeholders
+ * @returns Expanded regex pattern string
 *
 * @example
- *
- *
- * //
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
+ *
+ * @see expandTokensWithCaptures for full capture group support
 */
-
-	return new RegExp("^(?<full>(?<marker>[•*°\\-]\\s?)(?<content>[\\s\\S]*))", "u");
-}
+const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
 /**
- *
+ * Converts a template string to a compiled RegExp.
 *
- *
- *
- * - ## Heading 2
- * - ### Heading 3
- * - etc.
+ * Expands all tokens and attempts to compile the result as a RegExp
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
 *
- * @
+ * @remarks
+ * This function dynamically compiles regular expressions from template strings.
+ * If templates may come from untrusted sources, be aware of potential ReDoS
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+ * Consider validating pattern complexity or applying execution timeouts when
+ * running user-submitted patterns.
+ *
+ * @param template - Template string containing `{{token}}` placeholders
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
+ *
+ * @example
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
+ * templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
+ * templateToRegex('(((') // → null (invalid regex)
+ */
+const templateToRegex = (template) => {
+	const expanded = expandTokens(template);
+	try {
+		return new RegExp(expanded, "u");
+	} catch {
+		return null;
+	}
+};
+/**
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
+ *
+ * Useful for documentation, validation, or building user interfaces
+ * that show available tokens.
+ *
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
+ *
+ * @example
+ * getAvailableTokens()
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
+ */
+const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
+/**
+ * Gets the regex pattern for a specific token name.
+ *
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+ * without any expansion or capture group wrapping.
+ *
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
 *
 * @example
- *
- *
- * //
- * // match.groups.content -> 'عنوان فرعي'
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash') // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
 */
-
-	return new RegExp("^(?<full>(?<marker>#+\\s?)(?<content>[\\s\\S]*))", "u");
-}
+const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];

 //#endregion
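`templateToRegex` returns `null` rather than throwing, so it pairs naturally with `getAvailableTokens` for validating user-supplied templates — a usage sketch:

    const re = templateToRegex('{{bab}} {{:title}}');
    if (re === null) {
      console.warn('Invalid template. Known tokens:', getAvailableTokens().join(', '));
    }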
-//#region src/
-/**
- *
- *
- *
- *
- *
- *
- *
-
-
- *
- *
- *
- *
- *
- *
- *
- *
-
-
-
-
-
-
+//#region src/segmentation/segmenter.ts
+/**
+ * Core segmentation engine for splitting Arabic text pages into logical segments.
+ *
+ * The segmenter takes an array of pages and applies pattern-based rules to
+ * identify split points, producing segments with content, page references,
+ * and optional metadata.
+ *
+ * @module segmenter
+ */
+/**
+ * Checks if a regex pattern contains standard (anonymous) capturing groups.
+ *
+ * Detects standard capturing groups `(...)` while excluding:
+ * - Non-capturing groups `(?:...)`
+ * - Lookahead assertions `(?=...)` and `(?!...)`
+ * - Lookbehind assertions `(?<=...)` and `(?<!...)`
+ * - Named groups `(?<name>...)` (start with `(?` so excluded here)
+ *
+ * **Note**: Named capture groups `(?<name>...)` ARE capturing groups but are
+ * excluded by this check because they are tracked separately via the
+ * `captureNames` array from token expansion. This function only detects
+ * anonymous capturing groups like `(.*)`.
+ *
+ * @param pattern - Regex pattern string to analyze
+ * @returns `true` if the pattern contains at least one anonymous capturing group
+ */
+const hasCapturingGroup = (pattern) => {
+	return /\((?!\?)/.test(pattern);
+};
+/**
+ * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
+ *
+ * Fuzzy matching makes Arabic text diacritic-insensitive. When enabled, the
+ * transform is applied to token patterns BEFORE wrapping with capture groups,
+ * ensuring regex metacharacters (`(`, `)`, `|`, etc.) are not corrupted.
+ *
+ * @param pattern - Pattern string potentially containing `{{token}}` placeholders
+ * @param fuzzy - Whether to apply diacritic-insensitive transformation
+ * @returns Processed pattern with expanded tokens and capture names
+ *
+ * @example
+ * processPattern('{{raqms:num}} {{dash}}', false)
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ]', captureNames: ['num'] }
+ *
+ * @example
+ * processPattern('{{naql}}', true)
+ * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
+ */
+const processPattern = (pattern, fuzzy) => {
+	const { pattern: expanded, captureNames } = expandTokensWithCaptures(pattern, fuzzy ? makeDiacriticInsensitive : void 0);
+	return {
+		captureNames,
+		pattern: expanded
 	};
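`processPattern` is the bridge between rule options and token expansion: when a rule sets `fuzzy: true`, `makeDiacriticInsensitive` is threaded through as the transform. A rule exercising both features, echoing the tokens-module examples:

    // The named capture feeds segment meta; fuzzy matching tolerates diacritics.
    { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at', fuzzy: true }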
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
1008
|
+
};
|
|
+ /**
+  * Builds a compiled regex and metadata from a split rule.
+  *
+  * Handles all pattern types:
+  * - `regex`: Used as-is (no token expansion)
+  * - `template`: Tokens expanded via `expandTokensWithCaptures`
+  * - `lineStartsWith`: Converted to `^(?:patterns...)`
+  * - `lineStartsAfter`: Converted to `^(?:patterns...)(.*)`
+  * - `lineEndsWith`: Converted to `(?:patterns...)$`
+  *
+  * @param rule - Split rule containing pattern and options
+  * @returns Compiled regex with capture metadata
+  */
+ const buildRuleRegex = (rule) => {
+   const s = { ...rule };
+   const fuzzy = rule.fuzzy ?? false;
+   let allCaptureNames = [];
+   /**
+    * Safely compiles a regex pattern, throwing a helpful error if invalid.
+    *
+    * @remarks
+    * This catches syntax errors only. It does NOT protect against ReDoS
+    * (catastrophic backtracking) from pathological patterns. Avoid compiling
+    * patterns from untrusted sources.
+    */
+   const compileRegex = (pattern) => {
+     try {
+       return new RegExp(pattern, "gmu");
+     } catch (error) {
+       const message = error instanceof Error ? error.message : String(error);
+       throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
      }
+   };
+   if (s.lineStartsAfter?.length) {
+     const processed = s.lineStartsAfter.map((p) => processPattern(p, fuzzy));
+     const patterns = processed.map((p) => p.pattern).join("|");
+     allCaptureNames = processed.flatMap((p) => p.captureNames);
+     s.regex = `^(?:${patterns})(.*)`;
+     return {
+       captureNames: allCaptureNames,
+       regex: compileRegex(s.regex),
+       usesCapture: true,
+       usesLineStartsAfter: true
+     };
+   }
+   if (s.lineStartsWith?.length) {
+     const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
+     const patterns = processed.map((p) => p.pattern).join("|");
+     allCaptureNames = processed.flatMap((p) => p.captureNames);
+     s.template = `^(?:${patterns})`;
+   }
+   if (s.lineEndsWith?.length) {
+     const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
+     const patterns = processed.map((p) => p.pattern).join("|");
+     allCaptureNames = processed.flatMap((p) => p.captureNames);
+     s.template = `(?:${patterns})$`;
+   }
+   if (s.template) {
+     const { pattern, captureNames } = expandTokensWithCaptures(s.template);
+     s.regex = pattern;
+     allCaptureNames = [...allCaptureNames, ...captureNames];
+   }
+   if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
+   const usesCapture = hasCapturingGroup(s.regex) || allCaptureNames.length > 0;
+   return {
+     captureNames: allCaptureNames,
+     regex: compileRegex(s.regex),
+     usesCapture,
+     usesLineStartsAfter: false
+   };
+ };
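
To make the conversion list above concrete, here is roughly what each rule shape compiles to; an editorial sketch (buildRuleRegex is internal, but the regexes follow directly from the template strings in the code):

```js
// lineStartsWith: ['## ']  → /^(?:## )/gmu
// lineStartsAfter: ['## '] → /^(?:## )(.*)/gmu   (the (.*) capture becomes the content)
// lineEndsWith: [':']      → /(?::)$/gmu
const re = /^(?:## )(.*)/gmu;
console.log(re.exec('## Chapter One')?.[1]); // → 'Chapter One'
```
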
+ /**
+  * Builds a concatenated content string and page mapping from input pages.
+  *
+  * Pages are joined with newline characters, and a page map is created to
+  * track which page each offset belongs to. This allows pattern matching
+  * across page boundaries while preserving page reference information.
+  *
+  * @param pages - Array of input pages with id and content
+  * @returns Concatenated content string and page mapping utilities
+  *
+  * @example
+  * const pages = [
+  *   { id: 1, content: 'Page 1 text' },
+  *   { id: 2, content: 'Page 2 text' }
+  * ];
+  * const { content, pageMap } = buildPageMap(pages);
+  * // content = 'Page 1 text\nPage 2 text'
+  * // pageMap.getId(0) = 1
+  * // pageMap.getId(12) = 2
+  */
+ const buildPageMap = (pages) => {
+   const boundaries = [];
+   const pageBreaks = [];
+   let offset = 0;
+   const parts = [];
+   for (let i = 0; i < pages.length; i++) {
+     const normalized = normalizeLineEndings(pages[i].content);
+     boundaries.push({
+       end: offset + normalized.length,
+       id: pages[i].id,
+       start: offset
+     });
+     parts.push(normalized);
+     if (i < pages.length - 1) {
+       pageBreaks.push(offset + normalized.length);
+       offset += normalized.length + 1;
+     } else offset += normalized.length;
+   }
+   /**
+    * Finds the page boundary containing the given offset using binary search.
+    * O(log n) complexity for efficient lookup with many pages.
+    *
+    * @param off - Character offset to look up
+    * @returns Page boundary or the last boundary as fallback
+    */
+   const findBoundary = (off) => {
+     let lo = 0;
+     let hi = boundaries.length - 1;
+     while (lo <= hi) {
+       const mid = lo + hi >>> 1;
+       const b = boundaries[mid];
+       if (off < b.start) hi = mid - 1;
+       else if (off > b.end) lo = mid + 1;
+       else return b;
+     }
+     return boundaries[boundaries.length - 1];
+   };
+   return {
+     content: parts.join("\n"),
+     normalizedPages: parts,
+     pageMap: {
+       boundaries,
+       getId: (off) => findBoundary(off)?.id ?? 0,
+       pageBreaks,
+       pageIds: boundaries.map((b) => b.id)
+     }
+   };
+ };
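
Extending the JSDoc example, the same two pages also yield break offsets and page ids; an editorial walkthrough of the internals (not exported API), with values that follow from the concatenation logic:

```js
// 'Page 1 text' is 11 characters, so the joining '\n' lands at offset 11:
// pageMap.pageBreaks → [11]
// pageMap.pageIds    → [1, 2]
// pageMap.getId(11)  → 1   (a boundary's `end` is inclusive in findBoundary)
```
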
+ /**
+  * Executes a regex against content and extracts match results with capture information.
+  *
+  * @param content - Full content string to search
+  * @param regex - Compiled regex with 'g' flag
+  * @param usesCapture - Whether to extract captured content
+  * @param captureNames - Names of expected named capture groups
+  * @returns Array of match results with positions and captures
+  */
+ const findMatches = (content, regex, usesCapture, captureNames) => {
+   const matches = [];
+   regex.lastIndex = 0;
+   let m = regex.exec(content);
+   while (m !== null) {
+     const result = {
+       end: m.index + m[0].length,
+       start: m.index
+     };
+     result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
+     if (usesCapture) result.captured = getLastPositionalCapture(m);
+     matches.push(result);
+     if (m[0].length === 0) regex.lastIndex++;
+     m = regex.exec(content);
+   }
+   return matches;
+ };
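
The `regex.lastIndex++` line is the standard guard against infinite loops on zero-length matches; without it, `exec` with the `g` flag would keep returning the same empty match at the same position. A self-contained illustration:

```js
const re = /a*/g; // can match the empty string
const text = 'bab';
const out = [];
let m;
while ((m = re.exec(text)) !== null) {
  out.push([m.index, m[0]]);
  if (m[0].length === 0) re.lastIndex++; // step past the zero-length match
}
console.log(out); // [[0, ''], [1, 'a'], [2, ''], [3, '']]
```
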
+ /**
+  * Finds page breaks within a given offset range using binary search.
+  * O(log n + k) where n = total breaks, k = breaks in range.
+  *
+  * @param startOffset - Start of range (inclusive)
+  * @param endOffset - End of range (exclusive)
+  * @param sortedBreaks - Sorted array of page break offsets
+  * @returns Array of break offsets relative to startOffset
+  */
+ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
+   if (sortedBreaks.length === 0) return [];
+   let lo = 0;
+   let hi = sortedBreaks.length;
+   while (lo < hi) {
+     const mid = lo + hi >>> 1;
+     if (sortedBreaks[mid] < startOffset) lo = mid + 1;
+     else hi = mid;
+   }
+   const result = [];
+   for (let i = lo; i < sortedBreaks.length && sortedBreaks[i] < endOffset; i++) result.push(sortedBreaks[i] - startOffset);
+   return result;
+ };
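
The loop above is a textbook lower-bound search followed by a linear scan; a worked example with arbitrary values:

```js
// findBreaksInRange(5, 30, [11, 25, 40])
// lower bound: the first break >= 5 is at index 0
// collect breaks < 30, shifted to be relative to 5:
// → [11 - 5, 25 - 5] = [6, 20]   (40 is past the range and never visited)
```
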
+ /**
+  * Converts page-break newlines to spaces in segment content.
+  *
+  * When a segment spans multiple pages, the newline characters that were
+  * inserted as page separators during concatenation are converted to spaces
+  * for more natural reading.
+  *
+  * Uses binary search for O(log n + k) lookup instead of O(n) iteration.
+  *
+  * @param content - Segment content string
+  * @param startOffset - Starting offset of this content in concatenated string
+  * @param pageBreaks - Sorted array of page break offsets
+  * @returns Content with page-break newlines converted to spaces
+  */
+ const convertPageBreaks = (content, startOffset, pageBreaks) => {
+   const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
+   if (breaksInRange.length === 0) return content;
+   const breakSet = new Set(breaksInRange);
+   return content.replace(/\n/g, (match, offset) => breakSet.has(offset) ? " " : match);
+ };
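
Only newlines whose offsets coincide with page breaks are touched; a minimal standalone version of the replacement step, with assumed offsets:

```js
const content = 'abc\ndef';          // segment slice whose absolute start is 10
const breakSet = new Set([13 - 10]); // page break at absolute offset 13 → relative 3
const out = content.replace(/\n/g, (m, off) => (breakSet.has(off) ? ' ' : m));
console.log(out); // 'abc def'  (a '\n' that is not a page break stays a newline)
```
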
+ /**
+  * Applies breakpoints to oversized segments.
+  *
+  * For each segment that spans more than maxPages, tries the breakpoint patterns
+  * in order to find a suitable split point. Structural markers (from rules) are
+  * always respected - segments are only broken within their boundaries.
+  *
+  * @param segments - Initial segments from rule processing
+  * @param pages - Original pages for page lookup
+  * @param normalizedContent - Normalized page contents, parallel to `pages`
+  * @param maxPages - Maximum pages before breakpoints apply
+  * @param breakpoints - Patterns to try in order (tokens supported)
+  * @param prefer - 'longer' for last match, 'shorter' for first match
+  * @returns Processed segments with oversized ones broken up
+  */
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer) => {
+   const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
+     const startingPageId = pageIds$1[currentFromIdx];
+     if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
+     for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
+       const pageId = pageIds$1[pageIdx];
+       if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets$1[pageIdx] - cumulativeOffsets$1[currentFromIdx];
+     }
+     return -1;
+   };
+   const pageIds = pages.map((p) => p.id);
+   const pageIdToIndex = new Map(pageIds.map((id, i) => [id, i]));
+   const normalizedPages = /* @__PURE__ */ new Map();
+   for (let i = 0; i < pages.length; i++) {
+     const content = normalizedContent[i];
+     normalizedPages.set(pages[i].id, {
+       content,
+       index: i,
+       length: content.length
+     });
+   }
+   const cumulativeOffsets = [0];
+   let totalOffset = 0;
+   for (let i = 0; i < pageIds.length; i++) {
+     const pageData = normalizedPages.get(pageIds[i]);
+     totalOffset += pageData ? pageData.length : 0;
+     if (i < pageIds.length - 1) totalOffset += 1;
+     cumulativeOffsets.push(totalOffset);
    }
-
+   const patternProcessor = (p) => processPattern(p, false).pattern;
+   const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
+   const result = [];
+   for (const segment of segments) {
+     const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
+     const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
+     const segmentSpan = (segment.to ?? segment.from) - segment.from;
+     const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
+     if (segmentSpan <= maxPages && !hasExclusions) {
+       result.push(segment);
+       continue;
+     }
+     let remainingContent = segment.content;
+     let currentFromIdx = fromIdx;
+     let isFirstPiece = true;
+     while (currentFromIdx <= toIdx) {
+       const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
+       const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
+       if (remainingSpan <= maxPages && !remainingHasExclusions) {
+         const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
+         if (finalSeg) result.push(finalSeg);
+         break;
+       }
+       const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
+       let windowEndIdx = currentFromIdx;
+       for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
+       else break;
+       const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
+       let breakPosition = -1;
+       if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+       if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, {
+         cumulativeOffsets,
+         expandedBreakpoints,
+         normalizedPages,
+         pageIds,
+         prefer
+       });
+       if (breakPosition <= 0) {
+         if (windowEndIdx === currentFromIdx) {
+           const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
+           const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
+           if (pageSeg) result.push(pageSeg);
+           remainingContent = remainingContent.slice(pageContent.length).trim();
+           currentFromIdx++;
+           isFirstPiece = false;
+           continue;
+         }
+         breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
+       }
+       const pieceContent = remainingContent.slice(0, breakPosition).trim();
+       const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
+       const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
+       if (pieceContent) {
+         const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
+         if (pieceSeg) result.push(pieceSeg);
+       }
+       remainingContent = remainingContent.slice(breakPosition).trim();
+       let nextFromIdx = actualEndIdx;
+       if (remainingContent && actualEndIdx + 1 <= toIdx) {
+         const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
+         if (nextPageData) {
+           const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
+           if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
+         }
+       }
+       currentFromIdx = nextFromIdx;
+       isFirstPiece = false;
+     }
+   }
+   return result;
+ };
|
+
/**
|
|
1331
|
+
* Segments pages of content based on pattern-matching rules.
|
|
1332
|
+
*
|
|
1333
|
+
* This is the main entry point for the segmentation engine. It takes an array
|
|
1334
|
+
* of pages and applies the provided rules to identify split points, producing
|
|
1335
|
+
* an array of segments with content, page references, and metadata.
|
|
1336
|
+
*
|
|
1337
|
+
* @param pages - Array of pages with id and content
|
|
1338
|
+
* @param options - Segmentation options including splitting rules
|
|
1339
|
+
* @returns Array of segments with content, from/to page references, and optional metadata
|
|
1340
|
+
*
|
|
1341
|
+
* @example
|
|
1342
|
+
* // Split markdown by headers
|
|
1343
|
+
* const segments = segmentPages(pages, {
|
|
1344
|
+
* rules: [
|
|
1345
|
+
* { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
|
|
1346
|
+
* ]
|
|
1347
|
+
* });
|
|
1348
|
+
*
|
|
1349
|
+
* @example
|
|
1350
|
+
* // Split Arabic hadith text with number extraction
|
|
1351
|
+
* const segments = segmentPages(pages, {
|
|
1352
|
+
* rules: [
|
|
1353
|
+
* {
|
|
1354
|
+
* lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
|
|
1355
|
+
* split: 'at',
|
|
1356
|
+
* fuzzy: true,
|
|
1357
|
+
* meta: { type: 'hadith' }
|
|
1358
|
+
* }
|
|
1359
|
+
* ]
|
|
1360
|
+
* });
|
|
1361
|
+
*
|
|
1362
|
+
* @example
|
|
1363
|
+
* // Multiple rules with page constraints
|
|
1364
|
+
* const segments = segmentPages(pages, {
|
|
1365
|
+
* rules: [
|
|
1366
|
+
* { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
|
|
1367
|
+
* { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
|
|
1368
|
+
* { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
|
|
1369
|
+
* ]
|
|
1370
|
+
* });
|
|
1371
|
+
*/
|
|
1372
|
+
const segmentPages = (pages, options) => {
|
|
1373
|
+
const { rules = [], maxPages, breakpoints, prefer = "longer" } = options;
|
|
1374
|
+
if (!pages.length) return [];
|
|
1375
|
+
const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
|
|
1376
|
+
const splitPoints = [];
|
|
1377
|
+
for (const rule of rules) {
|
|
1378
|
+
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
1379
|
+
const finalMatches = filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence);
|
|
1380
|
+
for (const m of finalMatches) {
|
|
1381
|
+
const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
|
|
1382
|
+
const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
|
|
1383
|
+
splitPoints.push({
|
|
1384
|
+
capturedContent: isLineStartsAfter ? void 0 : m.captured,
|
|
1385
|
+
contentStartOffset: isLineStartsAfter ? markerLength : void 0,
|
|
1386
|
+
index: rule.split === "at" ? m.start : m.end,
|
|
1387
|
+
meta: rule.meta,
|
|
1388
|
+
namedCaptures: m.namedCaptures
|
|
1389
|
+
});
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
const byIndex = /* @__PURE__ */ new Map();
|
|
1393
|
+
for (const p of splitPoints) {
|
|
1394
|
+
const existing = byIndex.get(p.index);
|
|
1395
|
+
if (!existing) byIndex.set(p.index, p);
|
|
1396
|
+
else if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
|
|
1397
|
+
}
|
|
1398
|
+
const unique = [...byIndex.values()];
|
|
1399
|
+
unique.sort((a, b) => a.index - b.index);
|
|
1400
|
+
let segments = buildSegments(unique, matchContent, pageMap, rules);
|
|
1401
|
+
if (segments.length === 0 && pages.length > 0) {
|
|
1402
|
+
const firstPage = pages[0];
|
|
1403
|
+
const lastPage = pages[pages.length - 1];
|
|
1404
|
+
const initialSeg = {
|
|
1405
|
+
content: pages.map((p) => normalizeLineEndings(p.content)).join("\n").trim(),
|
|
1406
|
+
from: firstPage.id
|
|
1407
|
+
};
|
|
1408
|
+
if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
|
|
1409
|
+
if (initialSeg.content) segments = [initialSeg];
|
|
1410
|
+
}
|
|
1411
|
+
if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer);
|
|
1412
|
+
return segments;
|
|
1413
|
+
};
|
|
1414
|
+
/**
|
|
1415
|
+
* Creates segment objects from split points.
|
|
1416
|
+
*
|
|
1417
|
+
* Handles segment creation including:
|
|
1418
|
+
* - Content extraction (with captured content for `lineStartsAfter`)
|
|
1419
|
+
* - Page break conversion to spaces
|
|
1420
|
+
* - From/to page reference calculation
|
|
1421
|
+
* - Metadata merging (static + named captures)
|
|
1422
|
+
*
|
|
1423
|
+
* @param splitPoints - Sorted, unique split points
|
|
1424
|
+
* @param content - Full concatenated content string
|
|
1425
|
+
* @param pageMap - Page mapping utilities
|
|
1426
|
+
* @param rules - Original rules (for constraint checking on first segment)
|
|
1427
|
+
* @returns Array of segment objects
|
|
1428
|
+
*/
|
|
1429
|
+
const buildSegments = (splitPoints, content, pageMap, rules) => {
|
|
1430
|
+
/**
|
|
1431
|
+
* Creates a single segment from a content range.
|
|
1432
|
+
*/
|
|
1433
|
+
const createSegment$1 = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
|
|
1434
|
+
const actualStart = start + (contentStartOffset ?? 0);
|
|
1435
|
+
const sliced = content.slice(actualStart, end);
|
|
1436
|
+
let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
|
|
1437
|
+
if (!text) return null;
|
|
1438
|
+
if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks);
|
|
1439
|
+
const from = pageMap.getId(actualStart);
|
|
1440
|
+
const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(actualStart + text.length - 1);
|
|
1441
|
+
const seg = {
|
|
1442
|
+
content: text,
|
|
1443
|
+
from
|
|
1444
|
+
};
|
|
1445
|
+
if (to !== from) seg.to = to;
|
|
1446
|
+
if (meta || namedCaptures) seg.meta = {
|
|
1447
|
+
...meta,
|
|
1448
|
+
...namedCaptures
|
|
1449
|
+
};
|
|
1450
|
+
return seg;
|
|
1451
|
+
};
|
|
1452
|
+
/**
|
|
1453
|
+
* Creates segments from an array of split points.
|
|
1454
|
+
*/
|
|
1455
|
+
const createSegmentsFromSplitPoints = () => {
|
|
1456
|
+
const result = [];
|
|
1457
|
+
for (let i = 0; i < splitPoints.length; i++) {
|
|
1458
|
+
const sp = splitPoints[i];
|
|
1459
|
+
const end = i < splitPoints.length - 1 ? splitPoints[i + 1].index : content.length;
|
|
1460
|
+
const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
|
|
1461
|
+
if (s) result.push(s);
|
|
1462
|
+
}
|
|
1463
|
+
return result;
|
|
1464
|
+
};
|
|
1465
|
+
const segments = [];
|
|
1466
|
+
if (!splitPoints.length) {
|
|
1467
|
+
if (anyRuleAllowsId(rules, pageMap.getId(0))) {
|
|
1468
|
+
const s = createSegment$1(0, content.length);
|
|
1469
|
+
if (s) segments.push(s);
|
|
1470
|
+
}
|
|
1471
|
+
return segments;
|
|
1472
|
+
}
|
|
1473
|
+
if (splitPoints[0].index > 0) {
|
|
1474
|
+
if (anyRuleAllowsId(rules, pageMap.getId(0))) {
|
|
1475
|
+
const s = createSegment$1(0, splitPoints[0].index);
|
|
1476
|
+
if (s) segments.push(s);
|
|
1477
|
+
}
|
|
1478
|
+
}
|
|
1479
|
+
return [...segments, ...createSegmentsFromSplitPoints()];
|
|
1480
|
+
};
|
|
 
  //#endregion
- export {
+ export { TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
  //# sourceMappingURL=index.mjs.map
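
For completeness, the widened export surface can be consumed directly; a small hedged usage sketch based only on the names in the export list above (normalizeLineEndings is assumed to collapse CRLF to LF, consistent with its use in buildPageMap):

```js
import { segmentPages, normalizeLineEndings } from 'flappa-doormal';

const pages = [{ id: 1, content: '## Intro\r\nSome text' }];
console.log(normalizeLineEndings(pages[0].content)); // assumed: '## Intro\nSome text'
console.log(segmentPages(pages, {
  rules: [{ lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }],
}));
```
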