flappa-doormal 1.0.0 → 2.0.0

package/dist/index.mjs CHANGED
@@ -1,517 +1,1484 @@
1
- import { makeDiacriticInsensitive } from "bitaboom";
2
-
3
- //#region src/markers/defaults.ts
1
+ //#region src/segmentation/fuzzy.ts
2
+ /**
3
+ * Fuzzy matching utilities for Arabic text.
4
+ *
5
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
6
+ * This allows matching text regardless of:
7
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
8
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
9
+ *
10
+ * @module fuzzy
11
+ *
12
+ * @example
13
+ * // Make a pattern diacritic-insensitive
14
+ * const pattern = makeDiacriticInsensitive('حدثنا');
15
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
16
+ */
17
+ /**
18
+ * Character class matching all Arabic diacritics (Tashkeel/Harakat).
19
+ *
20
+ * Includes the following diacritical marks:
21
+ * - U+064B: ً (fathatan - double fatha)
22
+ * - U+064C: ٌ (dammatan - double damma)
23
+ * - U+064D: ٍ (kasratan - double kasra)
24
+ * - U+064E: َ (fatha - short a)
25
+ * - U+064F: ُ (damma - short u)
26
+ * - U+0650: ِ (kasra - short i)
27
+ * - U+0651: ّ (shadda - gemination)
28
+ * - U+0652: ْ (sukun - no vowel)
29
+ *
30
+ * @internal
31
+ */
32
+ const DIACRITICS_CLASS = "[ًٌٍَُِّْ]";
4
33
  /**
5
- * Default numbering style for markers
34
+ * Groups of equivalent Arabic characters.
35
+ *
36
+ * Characters within the same group are considered equivalent for matching purposes.
37
+ * This handles common variations in Arabic text where different characters are
38
+ * used interchangeably or have the same underlying meaning.
39
+ *
40
+ * Equivalence groups:
41
+ * - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
42
+ * - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
43
+ * - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
44
+ *
45
+ * @internal
6
46
  */
7
- const DEFAULT_NUMBERING = "arabic-indic";
47
+ const EQUIV_GROUPS = [
48
+ [
49
+ "ا",
50
+ "آ",
51
+ "أ",
52
+ "إ"
53
+ ],
54
+ ["ة", "ه"],
55
+ ["ى", "ي"]
56
+ ];
8
57
  /**
9
- * Default separator style for markers
58
+ * Escapes a string for safe inclusion in a regular expression.
59
+ *
60
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
61
+ *
62
+ * @param s - Any string to escape
63
+ * @returns String with regex metacharacters escaped
64
+ *
65
+ * @example
66
+ * escapeRegex('hello.world') // → 'hello\\.world'
67
+ * escapeRegex('[test]') // → '\\[test\\]'
68
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
10
69
  */
11
- const DEFAULT_SEPARATOR = "dash";
70
+ const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
12
71
  /**
13
- * Default separator pattern (used when separator is a custom string)
72
+ * Returns a regex character class for all equivalents of a given character.
73
+ *
74
+ * If the character belongs to one of the predefined equivalence groups
75
+ * (e.g., ا/آ/أ/إ), the returned class will match any member of that group.
76
+ * Otherwise, the original character is simply escaped for safe regex inclusion.
77
+ *
78
+ * @param ch - A single character to expand into its equivalence class
79
+ * @returns A RegExp-safe string representing the character and its equivalents
80
+ *
81
+ * @example
82
+ * getEquivClass('ا') // → '[اآأإ]' (matches any alef variant)
83
+ * getEquivClass('ب') // → 'ب' (no equivalents, just escaped)
84
+ * getEquivClass('.') // → '\\.' (regex metachar escaped)
85
+ *
86
+ * @internal
14
87
  */
15
- const DEFAULT_SEPARATOR_PATTERN = "[-–—ـ]";
88
+ const getEquivClass = (ch) => {
89
+ for (const group of EQUIV_GROUPS) if (group.includes(ch)) return `[${group.map((c) => escapeRegex(c)).join("")}]`;
90
+ return escapeRegex(ch);
91
+ };
16
92
  /**
17
- * Numbering patterns mapped by style
93
+ * Performs light normalization on Arabic text for consistent matching.
94
+ *
95
+ * Normalization steps:
96
+ * 1. NFC normalization (canonical decomposition then composition)
97
+ * 2. Remove Zero-Width Joiner (U+200D) and Zero-Width Non-Joiner (U+200C)
98
+ * 3. Collapse multiple whitespace characters to single space
99
+ * 4. Trim leading and trailing whitespace
100
+ *
101
+ * This normalization preserves diacritics and letter forms while removing
102
+ * invisible characters that could interfere with matching.
103
+ *
104
+ * @param str - Arabic text to normalize
105
+ * @returns Normalized string
106
+ *
107
+ * @example
108
+ * normalizeArabicLight('حَدَّثَنَا') // → 'حَدَّثَنَا' (diacritics preserved)
109
+ * normalizeArabicLight('بسم الله') // → 'بسم الله' (spaces collapsed)
110
+ * normalizeArabicLight(' text ') // → 'text' (trimmed)
111
+ *
112
+ * @internal
18
113
  */
19
- const NUMBERING_PATTERNS = {
20
- "arabic-indic": "[\\u0660-\\u0669]+",
21
- "latin": "\\d+"
114
+ const normalizeArabicLight = (str) => {
115
+ return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
22
116
  };
23
117
  /**
24
- * Separator patterns mapped by style
118
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
119
+ *
120
+ * Transforms input text into a regex pattern that matches the text regardless
121
+ * of diacritical marks (harakat) and character variations. Each character in
122
+ * the input is:
123
+ * 1. Expanded to its equivalence class (if applicable)
124
+ * 2. Followed by an optional diacritics matcher
125
+ *
126
+ * This allows matching:
127
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
128
+ * - `الإيمان` with `الايمان` (alef variants)
129
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
130
+ *
131
+ * @param text - Input Arabic text to make diacritic-insensitive
132
+ * @returns Regex pattern string that matches the text with or without diacritics
133
+ *
134
+ * @example
135
+ * const pattern = makeDiacriticInsensitive('حدثنا');
136
+ * // Each char gets equivalence class + optional diacritics
137
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
138
+ *
139
+ * @example
140
+ * const pattern = makeDiacriticInsensitive('باب');
141
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
142
+ * new RegExp(pattern, 'u').test('باب') // → true
143
+ *
144
+ * @example
145
+ * // Using with split rules
146
+ * {
147
+ * lineStartsWith: ['باب'],
148
+ * split: 'at',
149
+ * fuzzy: true // Applies makeDiacriticInsensitive internally
150
+ * }
25
151
  */
26
- const SEPARATOR_PATTERNS = {
27
- "colon": ":",
28
- "dash": "[-–—ـ]",
29
- "dot": "\\.",
30
- "none": "",
31
- "paren": "\\)"
152
+ const makeDiacriticInsensitive = (text) => {
153
+ const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
154
+ const norm = normalizeArabicLight(text);
155
+ return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
32
156
  };
33
157
 
34
158
  //#endregion
35
- //#region src/markers/presets.ts
159
+ //#region src/segmentation/breakpoint-utils.ts
36
160
  /**
37
- * Default phrase lists for preset marker types.
38
- * Export these so users can extend them.
161
+ * Normalizes a breakpoint to the object form.
162
+ * Strings are converted to { pattern: str } with no constraints.
163
+ *
164
+ * @param bp - Breakpoint as string or object
165
+ * @returns Normalized BreakpointRule object
166
+ *
167
+ * @example
168
+ * normalizeBreakpoint('\\n\\n')
169
+ * // → { pattern: '\\n\\n' }
170
+ *
171
+ * normalizeBreakpoint({ pattern: '\\n', min: 10 })
172
+ * // → { pattern: '\\n', min: 10 }
39
173
  */
174
+ const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
40
175
  /**
41
- * Common hadith narrator phrases (diacritic-insensitive)
42
- * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
176
+ * Checks if a page ID is in an excluded list (single pages or ranges).
177
+ *
178
+ * @param pageId - Page ID to check
179
+ * @param excludeList - List of page IDs or [from, to] ranges to exclude
180
+ * @returns True if page is excluded
181
+ *
182
+ * @example
183
+ * isPageExcluded(5, [1, 5, 10])
184
+ * // → true
185
+ *
186
+ * isPageExcluded(5, [[3, 7]])
187
+ * // → true
188
+ *
189
+ * isPageExcluded(5, [[10, 20]])
190
+ * // → false
43
191
  */
44
- const DEFAULT_HADITH_PHRASES = [
45
- "حَدَّثَنَا",
46
- "حدثنا",
47
- "أَخْبَرَنَا",
48
- "حدثني",
49
- "حدَّثني",
50
- "وحدثنا",
51
- "حُدِّثت عن",
52
- "وحَدَّثَنَا"
53
- ];
192
+ const isPageExcluded = (pageId, excludeList) => {
193
+ if (!excludeList || excludeList.length === 0) return false;
194
+ for (const item of excludeList) if (typeof item === "number") {
195
+ if (pageId === item) return true;
196
+ } else {
197
+ const [from, to] = item;
198
+ if (pageId >= from && pageId <= to) return true;
199
+ }
200
+ return false;
201
+ };
54
202
  /**
55
- * Common basmala patterns
56
- * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
203
+ * Checks if a page ID is within a breakpoint's min/max range and not excluded.
204
+ *
205
+ * @param pageId - Page ID to check
206
+ * @param rule - Breakpoint rule with optional min/max/exclude constraints
207
+ * @returns True if page is within valid range
208
+ *
209
+ * @example
210
+ * isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
211
+ * // → true
212
+ *
213
+ * isInBreakpointRange(5, { pattern: '\\n', min: 10 })
214
+ * // → false (below min)
57
215
  */
58
- const DEFAULT_BASMALA_PATTERNS = [
59
- "بسم الله",
60
- "\\[بسم",
61
- "\\[تم"
62
- ];
63
-
64
- //#endregion
65
- //#region src/markers/tokens.ts
216
+ const isInBreakpointRange = (pageId, rule) => {
217
+ if (rule.min !== void 0 && pageId < rule.min) return false;
218
+ if (rule.max !== void 0 && pageId > rule.max) return false;
219
+ return !isPageExcluded(pageId, rule.exclude);
220
+ };
66
221
  /**
67
- * Token definitions for pattern templates.
68
- * Tokens provide a readable alternative to raw regex patterns.
222
+ * Builds an exclude set from a PageRange array for O(1) lookups.
223
+ *
224
+ * @param excludeList - List of page IDs or [from, to] ranges
225
+ * @returns Set of all excluded page IDs
226
+ *
227
+ * @remarks
228
+ * This expands ranges into explicit page IDs for fast membership checks. For typical
229
+ * book-scale inputs (thousands of pages), this is small and keeps downstream logic
230
+ * simple and fast. If you expect extremely large ranges (e.g., millions of pages),
231
+ * consider avoiding broad excludes or introducing a range-based membership structure.
232
+ *
233
+ * @example
234
+ * buildExcludeSet([1, 5, [10, 12]])
235
+ * // → Set { 1, 5, 10, 11, 12 }
69
236
  */
237
+ const buildExcludeSet = (excludeList) => {
238
+ const excludeSet = /* @__PURE__ */ new Set();
239
+ for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
240
+ else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
241
+ return excludeSet;
242
+ };
70
243
  /**
71
- * Standard tokens for building marker patterns.
72
- * Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
244
+ * Creates a segment with optional to and meta fields.
245
+ * Returns null if content is empty after trimming.
246
+ *
247
+ * @param content - Segment content
248
+ * @param fromPageId - Starting page ID
249
+ * @param toPageId - Optional ending page ID (omitted if same as from)
250
+ * @param meta - Optional metadata to attach
251
+ * @returns Segment object or null if empty
252
+ *
253
+ * @example
254
+ * createSegment('Hello world', 1, 3, { chapter: 1 })
255
+ * // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
256
+ *
257
+ * createSegment(' ', 1, undefined, undefined)
258
+ * // → null (empty content)
73
259
  */
74
- const TOKENS = {
75
- bullet: "[•*°]",
76
- colon: ":",
77
- comma: "،",
78
- content: "(.*)",
79
- dash: "[-–—ـ]",
80
- dot: "\\.",
81
- latin: "\\d+",
82
- letter: "[أ-ي]",
83
- num: "[\\u0660-\\u0669]+",
84
- paren: "\\)",
85
- s: "\\s?",
86
- slash: "/",
87
- space: "\\s+"
260
+ const createSegment = (content, fromPageId, toPageId, meta) => {
261
+ const trimmed = content.trim();
262
+ if (!trimmed) return null;
263
+ const seg = {
264
+ content: trimmed,
265
+ from: fromPageId
266
+ };
267
+ if (toPageId !== void 0 && toPageId !== fromPageId) seg.to = toPageId;
268
+ if (meta) seg.meta = meta;
269
+ return seg;
88
270
  };
89
-
90
- //#endregion
91
- //#region src/markers/template-parser.ts
92
- /**
93
- * Expands a template string into a regex pattern using named capture groups.
94
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
95
- *
96
- * The content group uses [\s\S]*? (non-greedy) to match across newlines but stop at next marker.
97
- *
98
- * @param template - Template string with {token} placeholders
99
- * @param options - Optional configuration
100
- * @returns Regex pattern string with named groups
101
- *
102
- * @example
103
- * expandTemplate('{num} {dash}')
104
- * // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
105
- */
106
- function expandTemplate(template, options) {
107
- const tokenMap = options?.tokens || TOKENS;
108
- let expandedMarker = template;
109
- for (const [token, pattern] of Object.entries(tokenMap)) {
110
- const placeholder = `{${token}}`;
111
- expandedMarker = expandedMarker.replaceAll(placeholder, pattern);
112
- }
113
- return String.raw`^(?<full>(?<marker>${expandedMarker})(?<content>[\s\S]*))`;
114
- }
115
- /**
116
- * Create a custom token map by extending the base tokens.
117
- *
118
- * @param customTokens - Custom token definitions
119
- * @returns Combined token map
120
- *
121
- * @example
122
- * const myTokens = createTokenMap({
123
- * verse: '\\[[\\u0660-\\u0669]+\\]',
124
- * tafsir: 'تفسير'
125
- * });
271
+ /**
272
+ * Expands breakpoint patterns and pre-computes exclude sets.
273
+ *
274
+ * @param breakpoints - Array of breakpoint patterns or rules
275
+ * @param processPattern - Function to expand tokens in patterns
276
+ * @returns Array of expanded breakpoints with compiled regexes
277
+ *
278
+ * @remarks
279
+ * This function compiles regex patterns dynamically. This can be a ReDoS vector
280
+ * if patterns come from untrusted sources. In typical usage, breakpoint rules
281
+ * are application configuration, not user input.
126
282
  */
127
- function createTokenMap(customTokens) {
128
- return {
129
- ...TOKENS,
130
- ...customTokens
131
- };
132
- }
133
- /**
134
- * Validates a template string.
135
- *
136
- * @param template - Template to validate
137
- * @param tokens - Token map to validate against
138
- * @returns Validation result with errors if invalid
139
- *
140
- * @example
141
- * validateTemplate('{num} {dash}')
142
- * // Returns: { valid: true }
143
- *
144
- * validateTemplate('{invalid}')
145
- * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
146
- */
147
- function validateTemplate(template, tokens = TOKENS) {
148
- const unknownTokens = (template.match(/\{(\w+)\}/g) || []).map((t) => t.slice(1, -1)).filter((name) => !tokens[name]);
149
- if (unknownTokens.length > 0) return {
150
- valid: false,
151
- errors: [`Unknown tokens: ${unknownTokens.map((t) => `{${t}}`).join(", ")}`, `Available tokens: ${Object.keys(tokens).map((t) => `{${t}}`).join(", ")}`]
283
+ const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
284
+ const rule = normalizeBreakpoint(bp);
285
+ const excludeSet = buildExcludeSet(rule.exclude);
286
+ const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
287
+ const expandedSkip = processPattern$1(rule.skipWhen);
288
+ try {
289
+ return new RegExp(expandedSkip, "mu");
290
+ } catch (error) {
291
+ const message = error instanceof Error ? error.message : String(error);
292
+ throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n Cause: ${message}`);
293
+ }
294
+ })() : null;
295
+ if (rule.pattern === "") return {
296
+ excludeSet,
297
+ regex: null,
298
+ rule,
299
+ skipWhenRegex
152
300
  };
153
- return { valid: true };
154
- }
301
+ const expanded = processPattern$1(rule.pattern);
302
+ try {
303
+ return {
304
+ excludeSet,
305
+ regex: new RegExp(expanded, "gmu"),
306
+ rule,
307
+ skipWhenRegex
308
+ };
309
+ } catch (error) {
310
+ const message = error instanceof Error ? error.message : String(error);
311
+ throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n Cause: ${message}`);
312
+ }
313
+ });
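Editor's note: a minimal usage sketch (not from the package), with the identity function standing in for the token-expanding processPattern:

const expanded = expandBreakpoints(["\\n\\n", { pattern: "", min: 5, exclude: [[10, 12]] }], (p) => p);
expanded[0].regex; // → /\n\n/gmu (string form compiled with the g, m, u flags)
expanded[1].regex; // → null (an empty pattern means "break at the page boundary")
expanded[1].excludeSet; // → Set { 10, 11, 12 } (ranges pre-expanded by buildExcludeSet)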
314
+ /**
315
+ * Finds the actual ending page index by searching backwards for page content prefix.
316
+ * Used to determine which page a segment actually ends on based on content matching.
317
+ *
318
+ * @param pieceContent - Content of the segment piece
319
+ * @param currentFromIdx - Current starting index in pageIds
320
+ * @param toIdx - Maximum ending index to search
321
+ * @param pageIds - Array of page IDs
322
+ * @param normalizedPages - Map of page ID to normalized content
323
+ * @returns The actual ending page index
324
+ */
325
+ const findActualEndPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
326
+ for (let pi = toIdx; pi > currentFromIdx; pi--) {
327
+ const pageData = normalizedPages.get(pageIds[pi]);
328
+ if (pageData) {
329
+ const checkPortion = pageData.content.slice(0, Math.min(30, pageData.length));
330
+ if (checkPortion.length > 0 && pieceContent.indexOf(checkPortion) > 0) return pi;
331
+ }
332
+ }
333
+ return currentFromIdx;
334
+ };
335
+ /**
336
+ * Finds the actual starting page index by searching forwards for page content prefix.
337
+ * Used to determine which page content actually starts from based on content matching.
338
+ *
339
+ * This is the counterpart to findActualEndPage - it searches forward to find which
340
+ * page the content starts on, rather than which page it ends on.
341
+ *
342
+ * @param pieceContent - Content of the segment piece
343
+ * @param currentFromIdx - Current starting index in pageIds
344
+ * @param toIdx - Maximum ending index to search
345
+ * @param pageIds - Array of page IDs
346
+ * @param normalizedPages - Map of page ID to normalized content
347
+ * @returns The actual starting page index
348
+ */
349
+ const findActualStartPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
350
+ const trimmedPiece = pieceContent.trimStart();
351
+ if (!trimmedPiece) return currentFromIdx;
352
+ for (let pi = currentFromIdx; pi <= toIdx; pi++) {
353
+ const pageData = normalizedPages.get(pageIds[pi]);
354
+ if (pageData) {
355
+ const pagePrefix = pageData.content.slice(0, Math.min(30, pageData.length)).trim();
356
+ const piecePrefix = trimmedPiece.slice(0, Math.min(30, trimmedPiece.length));
357
+ if (pagePrefix.length > 0) {
358
+ if (trimmedPiece.startsWith(pagePrefix)) return pi;
359
+ if (pageData.content.trimStart().startsWith(piecePrefix)) return pi;
360
+ }
361
+ }
362
+ }
363
+ return currentFromIdx;
364
+ };
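Editor's note: a sketch of the 30-character prefix matching both helpers rely on; the page map shape ({ content, length }) and all values are hypothetical:

const pageIds = [10, 11];
const pages = new Map([
  [10, { content: "first page text", length: 15 }],
  [11, { content: "second page text", length: 16 }],
]);
findActualEndPage("first page text second page text", 0, 1, pageIds, pages); // → 1 (page 11's prefix occurs past position 0)
findActualStartPage("second page text and more", 0, 1, pageIds, pages); // → 1 (piece starts with page 11's prefix)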
365
+ /**
366
+ * Checks if any page in a range is excluded by the given exclude set.
367
+ *
368
+ * @param excludeSet - Set of excluded page IDs
369
+ * @param pageIds - Array of page IDs
370
+ * @param fromIdx - Start index (inclusive)
371
+ * @param toIdx - End index (inclusive)
372
+ * @returns True if any page in range is excluded
373
+ */
374
+ const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
375
+ if (excludeSet.size === 0) return false;
376
+ for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
377
+ return false;
378
+ };
379
+ /**
380
+ * Finds the position of the next page content within remaining content.
381
+ * Returns -1 if not found.
382
+ *
383
+ * @param remainingContent - Content to search in
384
+ * @param nextPageData - Normalized data for the next page
385
+ * @returns Position of next page content, or -1 if not found
386
+ */
387
+ const findNextPagePosition = (remainingContent, nextPageData) => {
388
+ const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
389
+ if (searchPrefix.length === 0) return -1;
390
+ const pos = remainingContent.indexOf(searchPrefix);
391
+ return pos > 0 ? pos : -1;
392
+ };
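Editor's note: a small sketch (hypothetical data) showing why a hit at position 0 counts as "not found": breaking there would produce an empty first piece.

findNextPagePosition("tail of page one. PAGE TWO starts", { content: " PAGE TWO starts", length: 16 }); // → 18
findNextPagePosition("PAGE TWO starts here", { content: "PAGE TWO starts", length: 15 }); // → -1 (match at position 0 is rejected)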
393
+ /**
394
+ * Finds matches within a window and returns the selected position based on preference.
395
+ *
396
+ * @param windowContent - Content to search
397
+ * @param regex - Regex to match
398
+ * @param prefer - 'longer' for last match, 'shorter' for first match
399
+ * @returns Break position after the selected match, or -1 if no matches
400
+ */
401
+ const findPatternBreakPosition = (windowContent, regex, prefer) => {
402
+ const matches = [];
403
+ for (const m of windowContent.matchAll(regex)) matches.push({
404
+ index: m.index,
405
+ length: m[0].length
406
+ });
407
+ if (matches.length === 0) return -1;
408
+ const selected = prefer === "longer" ? matches[matches.length - 1] : matches[0];
409
+ return selected.index + selected.length;
410
+ };
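Editor's note: a minimal sketch; the regex must carry the g flag because the helper uses matchAll:

findPatternBreakPosition("para one\n\npara two\n\npara three", /\n\n/gmu, "shorter"); // → 10 (end of the first match)
findPatternBreakPosition("para one\n\npara two\n\npara three", /\n\n/gmu, "longer"); // → 20 (end of the last match)
findPatternBreakPosition("no breaks here", /\n\n/gmu, "shorter"); // → -1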
411
+ /**
412
+ * Tries to find a break position within the current window using breakpoint patterns.
413
+ * Returns the break position or -1 if no suitable break was found.
414
+ *
415
+ * @param remainingContent - Content remaining to be segmented
416
+ * @param currentFromIdx - Current starting page index
417
+ * @param toIdx - Ending page index
418
+ * @param windowEndIdx - Maximum window end index
419
+ * @param ctx - Breakpoint context with page data and patterns
420
+ * @returns Break position in the content, or -1 if no break found
421
+ */
422
+ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, ctx) => {
423
+ const { pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, prefer } = ctx;
424
+ for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
425
+ if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
426
+ if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
427
+ if (skipWhenRegex?.test(remainingContent)) continue;
428
+ if (regex === null) {
429
+ const nextPageIdx = windowEndIdx + 1;
430
+ if (nextPageIdx <= toIdx) {
431
+ const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
432
+ if (nextPageData) {
433
+ const pos = findNextPagePosition(remainingContent, nextPageData);
434
+ if (pos > 0) return pos;
435
+ }
436
+ }
437
+ return Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
438
+ }
439
+ const windowEndPosition = Math.min(cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx], remainingContent.length);
440
+ const breakPos = findPatternBreakPosition(remainingContent.slice(0, windowEndPosition), regex, prefer);
441
+ if (breakPos > 0) return breakPos;
442
+ }
443
+ return -1;
444
+ };
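Editor's note: a hypothetical single-page sketch of the ctx shape this function consumes; cumulativeOffsets[i] is the offset at which page i starts in the joined content:

const ctx = {
  pageIds: [1],
  normalizedPages: new Map([[1, { content: "para one\n\npara two", length: 18 }]]),
  cumulativeOffsets: [0, 18],
  expandedBreakpoints: expandBreakpoints(["\\n\\n"], (p) => p),
  prefer: "shorter",
};
findBreakPosition("para one\n\npara two", 0, 0, 0, ctx); // → 10 (break just after the blank line)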
155
445
 
156
446
  //#endregion
157
- //#region src/markers/type-generators.ts
447
+ //#region src/segmentation/match-utils.ts
158
448
  /**
159
- * Generates a regular expression for pattern-type markers.
449
+ * Utility functions for regex matching and result processing.
160
450
  *
161
- * Supports two modes:
162
- * 1. Template-based: Uses the `template` field with token expansion
163
- * 2. Pattern-based: Uses the raw `pattern` field as-is
451
+ * These functions were extracted from `segmenter.ts` to reduce complexity
452
+ * and enable independent testing. They handle match filtering, capture
453
+ * extraction, and occurrence-based selection.
164
454
  *
165
- * @param config - Marker configuration with either `template` or `pattern` field
166
- * @returns A compiled RegExp object for matching the pattern
167
- * @throws {Error} When neither `template` nor `pattern` is provided
455
+ * @module match-utils
456
+ */
457
+ /**
458
+ * Extracts named capture groups from a regex match.
459
+ *
460
+ * Only includes groups that are in the `captureNames` list and have
461
+ * defined values. This filters out positional captures and ensures
462
+ * only explicitly requested named captures are returned.
463
+ *
464
+ * @param groups - The `match.groups` object from `RegExp.exec()`
465
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
466
+ * @returns Object with capture name → value pairs, or `undefined` if none found
168
467
  *
169
468
  * @example
170
- * // Using template
171
- * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
469
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
470
+ * extractNamedCaptures(match.groups, ['num'])
471
+ * // → { num: '٦٦٩٦' }
172
472
  *
173
473
  * @example
174
- * // Using raw pattern
175
- * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
474
+ * // No matching captures
475
+ * extractNamedCaptures({}, ['num'])
476
+ * // → undefined
176
477
  *
177
478
  * @example
178
- * // Using custom tokens
179
- * const regex = generatePatternRegex({
180
- * type: 'pattern',
181
- * template: '{verse}',
182
- * tokens: { verse: '\\[[0-9]+\\]' }
183
- * });
479
+ * // Undefined groups
480
+ * extractNamedCaptures(undefined, ['num'])
481
+ * // → undefined
184
482
  */
185
- function generatePatternRegex(config) {
186
- if (config.template) {
187
- const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;
188
- const pattern = expandTemplate(config.template, { tokens: tokenMap });
189
- return new RegExp(pattern, "u");
190
- }
191
- if (!config.pattern) throw new Error("pattern marker must provide either a template or pattern");
192
- return new RegExp(config.pattern, "u");
193
- }
483
+ const extractNamedCaptures = (groups, captureNames) => {
484
+ if (!groups || captureNames.length === 0) return;
485
+ const namedCaptures = {};
486
+ for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
487
+ return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
488
+ };
194
489
  /**
195
- * Generates a regular expression for 'bab' (chapter) markers.
490
+ * Gets the last defined positional capture group from a match array.
196
491
  *
197
- * Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.
198
- * The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.
492
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
493
+ * is always at the end of the pattern. Named captures may shift the
494
+ * positional indices, so we iterate backward to find the actual content.
199
495
  *
200
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
496
+ * @param match - RegExp exec result array
497
+ * @returns The last defined capture group value, or `undefined` if none
201
498
  *
202
499
  * @example
203
- * const regex = generateBabRegex();
204
- * const match = regex.exec('باب الصلاة');
205
- * // match.groups.marker -> 'باب'
206
- * // match.groups.content -> ' الصلاة'
500
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
501
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
502
+ * getLastPositionalCapture(match)
503
+ * // → 'content'
504
+ *
505
+ * @example
506
+ * // No captures
507
+ * getLastPositionalCapture(['full match'])
508
+ * // → undefined
207
509
  */
208
- function generateBabRegex() {
209
- const babPattern = makeDiacriticInsensitive("باب");
210
- const pattern = String.raw`^(?<full>(?<marker>${babPattern}[ًٌٍَُ]?)(?<content>[\s\S]*))`;
211
- return new RegExp(pattern, "u");
212
- }
510
+ const getLastPositionalCapture = (match) => {
511
+ if (match.length <= 1) return;
512
+ for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
513
+ };
213
514
  /**
214
- * Generates a regular expression for hadith chain (isnad) markers.
515
+ * Filters matches to only include those within page ID constraints.
215
516
  *
216
- * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
217
- * Uses default phrases from presets or custom phrases from config.
218
- * All phrases are made diacritic-insensitive.
517
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
518
+ * matches that occur on pages outside the allowed range or explicitly excluded.
219
519
  *
220
- * @param config - Marker configuration with optional `phrases` array
221
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
520
+ * @param matches - Array of match results to filter
521
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
522
+ * @param getId - Function that returns the page ID for a given offset
523
+ * @returns Filtered array containing only matches within constraints
222
524
  *
223
525
  * @example
224
- * // Using default phrases
225
- * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
226
- * const match = regex.exec('حَدَّثَنَا أبو بكر');
526
+ * const matches = [
527
+ * { start: 0, end: 10 }, // Page 1
528
+ * { start: 100, end: 110 }, // Page 5
529
+ * { start: 200, end: 210 }, // Page 10
530
+ * ];
531
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
532
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
533
+ */
534
+ const filterByConstraints = (matches, rule, getId) => {
535
+ return matches.filter((m) => {
536
+ const id = getId(m.start);
537
+ if (rule.min !== void 0 && id < rule.min) return false;
538
+ if (rule.max !== void 0 && id > rule.max) return false;
539
+ if (isPageExcluded(id, rule.exclude)) return false;
540
+ return true;
541
+ });
542
+ };
543
+ /**
544
+ * Filters matches based on occurrence setting (first, last, or all).
545
+ *
546
+ * Applies occurrence-based selection to a list of matches:
547
+ * - `'all'` or `undefined`: Return all matches (default)
548
+ * - `'first'`: Return only the first match
549
+ * - `'last'`: Return only the last match
550
+ *
551
+ * @param matches - Array of match results to filter
552
+ * @param occurrence - Which occurrence(s) to keep
553
+ * @returns Filtered array based on occurrence setting
227
554
  *
228
555
  * @example
229
- * // Using custom phrases
230
- * const regex = generateHadithChainRegex({
231
- * type: 'hadith-chain',
232
- * phrases: ['قَالَ', 'رَوَى']
233
- * });
556
+ * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
557
+ *
558
+ * filterByOccurrence(matches, 'first')
559
+ * // → [{ start: 0 }]
560
+ *
561
+ * filterByOccurrence(matches, 'last')
562
+ * // → [{ start: 20 }]
563
+ *
564
+ * filterByOccurrence(matches, 'all')
565
+ * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
566
+ *
567
+ * filterByOccurrence(matches, undefined)
568
+ * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
234
569
  */
235
- function generateHadithChainRegex(config) {
236
- const phrasesPattern = (config.phrases || DEFAULT_HADITH_PHRASES).map((p) => makeDiacriticInsensitive(p)).join("|");
237
- const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\s\S]*))`;
238
- return new RegExp(pattern, "u");
239
- }
570
+ const filterByOccurrence = (matches, occurrence) => {
571
+ if (!matches.length) return [];
572
+ if (occurrence === "first") return [matches[0]];
573
+ if (occurrence === "last") return [matches[matches.length - 1]];
574
+ return matches;
575
+ };
240
576
  /**
241
- * Generates a regular expression for basmala markers.
577
+ * Checks if any rule in the list allows the given page ID.
578
+ *
579
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
580
+ * Rules without constraints allow all page IDs.
581
+ *
582
+ * This is used to determine whether to create a segment for content
583
+ * that appears before any split points (the "first segment").
242
584
  *
243
- * Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):
244
- * - بسم الله (without diacritics)
245
- * - بِسْمِ اللَّهِ (with diacritics)
246
- * - Special patterns like [بسم, [تم
585
+ * @param rules - Array of rules with optional `min` and `max` constraints
586
+ * @param pageId - Page ID to check
587
+ * @returns `true` if at least one rule allows the page ID
588
+ *
589
+ * @example
590
+ * const rules = [
591
+ * { min: 5, max: 10 }, // Allows pages 5-10
592
+ * { min: 20 }, // Allows pages 20+
593
+ * ];
247
594
  *
248
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
595
+ * anyRuleAllowsId(rules, 7) // → true (first rule allows)
596
+ * anyRuleAllowsId(rules, 3) // → false (no rule allows)
597
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
249
598
  *
250
599
  * @example
251
- * const regex = generateBasmalaRegex();
252
- * const match = regex.exec('بسم الله الرحمن الرحيم');
253
- * // match.groups.marker -> 'بسم الله'
600
+ * // Rules without constraints allow everything
601
+ * anyRuleAllowsId([{}], 999) // → true
254
602
  */
255
- function generateBasmalaRegex() {
256
- const combinedPattern = DEFAULT_BASMALA_PATTERNS.map((p) => makeDiacriticInsensitive(p)).join("|");
257
- const pattern = String.raw`^(?<full>(?<marker>${combinedPattern})(?<content>[\s\S]*))`;
258
- return new RegExp(pattern, "u");
259
- }
603
+ const anyRuleAllowsId = (rules, pageId) => {
604
+ return rules.some((r) => {
605
+ const minOk = r.min === void 0 || pageId >= r.min;
606
+ const maxOk = r.max === void 0 || pageId <= r.max;
607
+ return minOk && maxOk;
608
+ });
609
+ };
610
+
611
+ //#endregion
612
+ //#region src/segmentation/textUtils.ts
260
613
  /**
261
- * Generates a regular expression for custom phrase markers.
614
+ * Strip all HTML tags from content, keeping only text.
262
615
  *
263
- * Similar to hadith-chain markers but requires explicit phrase list.
264
- * All phrases are made diacritic-insensitive.
616
+ * @param html - HTML content
617
+ * @returns Plain text content
618
+ */
619
+ const stripHtmlTags = (html) => {
620
+ return html.replace(/<[^>]*>/g, "");
621
+ };
622
+ /**
623
+ * Normalizes line endings to Unix-style (`\n`).
265
624
  *
266
- * @param config - Marker configuration with required `phrases` array
267
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
268
- * @throws {Error} When `phrases` is undefined or empty
625
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
626
+ * for consistent pattern matching across platforms.
269
627
  *
270
- * @example
271
- * const regex = generatePhraseRegex({
272
- * type: 'phrase',
273
- * phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
274
- * });
628
+ * @param content - Raw content with potentially mixed line endings
629
+ * @returns Content with all line endings normalized to `\n`
275
630
  */
276
- function generatePhraseRegex(config) {
277
- if (!config.phrases || config.phrases.length === 0) throw new Error("phrase marker requires phrases array");
278
- const phrasesPattern = config.phrases.map((p) => makeDiacriticInsensitive(p)).join("|");
279
- const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\s\S]*))`;
280
- return new RegExp(pattern, "u");
281
- }
631
+ const normalizeLineEndings = (content) => content.replace(/\r\n?/g, "\n");
632
+
633
+ //#endregion
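Editor's note: a quick sketch of the two text utilities together; the tag stripping is regex-based and intentionally simple:

stripHtmlTags("<p>بسم الله</p>"); // → 'بسم الله'
normalizeLineEndings("line one\r\nline two\rline three"); // → 'line one\nline two\nline three'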
634
+ //#region src/segmentation/tokens.ts
282
635
  /**
283
- * Generates a regular expression for square bracket markers.
636
+ * Token-based template system for Arabic text pattern matching.
284
637
  *
285
- * Matches verse or hadith reference numbers in square brackets:
286
- * - [٦٥] - Simple bracket
287
- * - [٦٥] - With bullet prefix
288
- * - ° [٦٥] - With degree prefix
638
+ * This module provides a human-readable way to define regex patterns using
639
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
640
+ * named capture groups for extracting matched values into metadata.
289
641
  *
290
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
642
+ * @module tokens
291
643
  *
292
644
  * @example
293
- * const regex = generateSquareBracketRegex();
294
- * const match = regex.exec('[٦٥] نص الحديث');
295
- * // match.groups.content -> ' نص الحديث'
645
+ * // Simple token expansion
646
+ * expandTokens('{{raqms}} {{dash}}')
647
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
648
+ *
649
+ * @example
650
+ * // Named capture groups
651
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
652
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
296
653
  */
297
- function generateSquareBracketRegex() {
298
- const markerPattern = String.raw`[•°]?\s?\[[\u0660-\u0669]+\]\s?`;
299
- const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
300
- return new RegExp(pattern, "u");
301
- }
302
654
  /**
303
- * Generates a regular expression for number-letter-separator markers.
655
+ * Token definitions mapping human-readable token names to regex patterns.
304
656
  *
305
- * Matches patterns like:
306
- * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
307
- * - 5 ب. (Latin number, Arabic letter, dot)
657
+ * Tokens are used in template strings with double-brace syntax:
658
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
659
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
660
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
308
661
  *
309
- * @param config - Configuration with required `numbering` and `separator` fields
310
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
662
+ * @remarks
663
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
664
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
665
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
311
666
  *
312
667
  * @example
313
- * const regex = generateNumLetterRegex({
314
- * numbering: 'arabic-indic',
315
- * separator: 'dash'
316
- * });
317
- * const match = regex.exec('٥ أ - نص');
668
+ * // Using tokens in a split rule
669
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
670
+ *
671
+ * @example
672
+ * // Using tokens with named captures
673
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
318
674
  */
319
- function generateNumLetterRegex(config) {
320
- const numPattern = NUMBERING_PATTERNS[config.numbering];
321
- const sepPattern = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
322
- const markerPattern = String.raw`${numPattern} [أ-ي]\s?${sepPattern}`;
323
- const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
324
- return new RegExp(pattern, "u");
325
- }
326
675
  /**
327
- * Generates a regular expression for number-parenthetical-separator markers.
676
+ * Base token definitions mapping human-readable token names to regex patterns.
328
677
  *
329
- * Matches patterns like:
330
- * - ٥ (أ) - (number, parenthetical content, separator)
331
- * - 5 (٦) - (number with parenthetical number)
678
+ * These tokens contain raw regex patterns and do not reference other tokens.
679
+ * For composite tokens that build on these, see `COMPOSITE_TOKENS`.
332
680
  *
333
- * @param config - Configuration with required `numbering` and `separator` fields
334
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
681
+ * @internal
682
+ */
683
+ const BASE_TOKENS = {
684
+ bab: "باب",
685
+ basmalah: "بسم الله|﷽",
686
+ bullet: "[•*°]",
687
+ dash: "[-–—ـ]",
688
+ fasl: "فصل|مسألة",
689
+ harf: "[أ-ي]",
690
+ kitab: "كتاب",
691
+ naql: "حدثنا|أخبرنا|حدثني|وحدثنا|أنبأنا|سمعت",
692
+ raqm: "[\\u0660-\\u0669]",
693
+ raqms: "[\\u0660-\\u0669]+",
694
+ tarqim: "[.!?؟؛]"
695
+ };
696
+ /**
697
+ * Composite token definitions using template syntax.
698
+ *
699
+ * These tokens reference base tokens using `{{token}}` syntax and are
700
+ * automatically expanded to their final regex patterns at module load time.
701
+ *
702
+ * This provides better abstraction - if base tokens change, composites
703
+ * automatically update on the next build.
704
+ *
705
+ * @internal
706
+ */
707
+ const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
708
+ /**
709
+ * Expands base tokens in a template string.
710
+ * Used internally to pre-expand composite tokens.
711
+ *
712
+ * @param template - Template string with `{{token}}` placeholders
713
+ * @returns Expanded pattern with base tokens replaced
714
+ * @internal
715
+ */
716
+ const expandBaseTokens = (template) => {
717
+ return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
718
+ return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
719
+ });
720
+ };
721
+ /**
722
+ * Token definitions mapping human-readable token names to regex patterns.
723
+ *
724
+ * Tokens are used in template strings with double-brace syntax:
725
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
726
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
727
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
728
+ *
729
+ * @remarks
730
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
731
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
732
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
335
733
  *
336
734
  * @example
337
- * const regex = generateNumParenRegex({
338
- * numbering: 'arabic-indic',
339
- * separator: 'dash'
340
- * });
341
- * const match = regex.exec('٥ (أ) - نص');
735
+ * // Using tokens in a split rule
736
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
737
+ *
738
+ * @example
739
+ * // Using tokens with named captures
740
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
741
+ *
742
+ * @example
743
+ * // Using the numbered convenience token
744
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
745
+ */
746
+ const TOKEN_PATTERNS = {
747
+ ...BASE_TOKENS,
748
+ ...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
749
+ };
750
+ /**
751
+ * Regex pattern for matching tokens with optional named capture syntax.
752
+ *
753
+ * Matches:
754
+ * - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
755
+ * - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
756
+ * - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
757
+ *
758
+ * @internal
342
759
  */
343
- function generateNumParenRegex(config) {
344
- const numPattern = NUMBERING_PATTERNS[config.numbering];
345
- const sepPattern = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
346
- const markerPattern = String.raw`${numPattern}\s*\([\u0600-\u06FF\u0660-\u0669\s]+\)\s?${sepPattern}`;
347
- const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
348
- return new RegExp(pattern, "u");
349
- }
760
+ const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
350
761
  /**
351
- * Generates a regular expression for number-slash-number markers.
762
+ * Regex pattern for simple token matching (no capture syntax).
763
+ *
764
+ * Matches only `{{token}}` format where token is one or more word characters.
765
+ * Used by `containsTokens()` for quick detection.
352
766
  *
353
- * Matches patterns like:
354
- * - ٥/٦ - (number slash number, separator)
355
- * - ٥ - (single number, separator)
767
+ * @internal
768
+ */
769
+ const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
770
+ /**
771
+ * Checks if a query string contains template tokens.
356
772
  *
357
- * The second number after the slash is optional.
773
+ * Performs a quick test for `{{token}}` patterns without actually
774
+ * expanding them. Useful for determining whether to apply token
775
+ * expansion to a string.
358
776
  *
359
- * @param config - Configuration with required `numbering` and `separator` fields
360
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
777
+ * @param query - String to check for tokens
778
+ * @returns `true` if the string contains at least one `{{token}}` pattern
361
779
  *
362
780
  * @example
363
- * const regex = generateNumSlashRegex({
364
- * numbering: 'arabic-indic',
365
- * separator: 'dash'
366
- * });
367
- * const match1 = regex.exec('٥/٦ - نص');
368
- * const match2 = regex.exec('٥ - نص'); // Also matches
781
+ * containsTokens('{{raqms}} {{dash}}') // → true
782
+ * containsTokens('plain text') // → false
783
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
369
784
  */
370
- function generateNumSlashRegex(config) {
371
- const numPattern = NUMBERING_PATTERNS[config.numbering];
372
- const sepPattern = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
373
- const markerPattern = String.raw`${numPattern}(?:\s?/\s?${numPattern})?\s?${sepPattern}`;
374
- const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
375
- return new RegExp(pattern, "u");
376
- }
785
+ const containsTokens = (query) => {
786
+ SIMPLE_TOKEN_REGEX.lastIndex = 0;
787
+ return SIMPLE_TOKEN_REGEX.test(query);
788
+ };
377
789
  /**
378
- * Generates a regular expression for numbered markers with optional format template.
790
+ * Expands template tokens with support for named captures.
379
791
  *
380
- * Supports two modes:
381
- * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
382
- * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
792
+ * This is the primary token expansion function that handles all token syntax:
793
+ * - `{{token}}` Expands to the token's pattern (no capture group)
794
+ * - `{{token:name}}` Expands to `(?<name>pattern)` (named capture)
795
+ * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
383
796
  *
384
- * When using default pattern:
385
- * - Separator 'none' generates pattern without separator
386
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
797
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
387
798
  *
388
- * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
389
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
799
+ * @param query - The template string containing tokens
800
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
801
+ * Applied to both token patterns and plain Arabic text between tokens.
802
+ * Typically `makeDiacriticInsensitive` from the fuzzy module.
803
+ * @returns Object with expanded pattern, capture names, and capture flag
390
804
  *
391
805
  * @example
392
- * // Using format template
393
- * const regex = generateNumberedRegex({
394
- * numbering: 'arabic-indic',
395
- * separator: 'dash',
396
- * format: '{bullet}+ {num} {dash}'
397
- * });
806
+ * // Simple token expansion
807
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
808
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
398
809
  *
399
810
  * @example
400
- * // Using default pattern
401
- * const regex = generateNumberedRegex({
402
- * numbering: 'arabic-indic',
403
- * separator: 'dash'
404
- * });
405
- * const match = regex.exec('٥ - نص');
811
+ * // Named capture
812
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
813
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
406
814
  *
407
815
  * @example
408
- * // With 'none' separator
409
- * const regex = generateNumberedRegex({
410
- * numbering: 'latin',
411
- * separator: 'none'
412
- * });
413
- * const match = regex.exec('5 text');
816
+ * // Capture-only token
817
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
818
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
819
+ *
820
+ * @example
821
+ * // With fuzzy transform
822
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
823
+ * // → { pattern: 'ب[ًٌٍَُِّْ]*[اآأإ][ًٌٍَُِّْ]*ب[ًٌٍَُِّْ]*', captureNames: [], hasCaptures: false }
414
824
  */
415
- function generateNumberedRegex(config) {
416
- if (config.format) {
417
- const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;
418
- const expandedPattern = expandTemplate(config.format, { tokens: tokenMap });
419
- return new RegExp(expandedPattern, "u");
825
+ const expandTokensWithCaptures = (query, fuzzyTransform) => {
826
+ const captureNames = [];
827
+ const segments = [];
828
+ let lastIndex = 0;
829
+ TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
830
+ let match;
831
+ while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
832
+ if (match.index > lastIndex) segments.push({
833
+ type: "text",
834
+ value: query.slice(lastIndex, match.index)
835
+ });
836
+ segments.push({
837
+ type: "token",
838
+ value: match[0]
839
+ });
840
+ lastIndex = match.index + match[0].length;
420
841
  }
421
- const numPattern = NUMBERING_PATTERNS[config.numbering];
422
- const separator = config.separator;
423
- const sepPattern = separator !== "none" ? SEPARATOR_PATTERNS[separator] ?? separator : "";
424
- const markerPattern = sepPattern ? String.raw`${numPattern}\s?${sepPattern}` : numPattern;
425
- const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
426
- return new RegExp(pattern, "u");
427
- }
842
+ if (lastIndex < query.length) segments.push({
843
+ type: "text",
844
+ value: query.slice(lastIndex)
845
+ });
846
+ const processedParts = segments.map((segment) => {
847
+ if (segment.type === "text") {
848
+ if (fuzzyTransform && /[\u0600-\u06FF]/.test(segment.value)) return fuzzyTransform(segment.value);
849
+ return segment.value;
850
+ }
851
+ TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
852
+ const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(segment.value);
853
+ if (!tokenMatch) return segment.value;
854
+ const [, tokenName, captureName] = tokenMatch;
855
+ if (!tokenName && captureName) {
856
+ captureNames.push(captureName);
857
+ return `(?<${captureName}>.+)`;
858
+ }
859
+ let tokenPattern = TOKEN_PATTERNS[tokenName];
860
+ if (!tokenPattern) return segment.value;
861
+ if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
862
+ if (captureName) {
863
+ captureNames.push(captureName);
864
+ return `(?<${captureName}>${tokenPattern})`;
865
+ }
866
+ return tokenPattern;
867
+ });
868
+ return {
869
+ captureNames,
870
+ hasCaptures: captureNames.length > 0,
871
+ pattern: processedParts.join("")
872
+ };
873
+ };
428
874
  /**
429
- * Generates a regular expression for bullet-point markers.
875
+ * Expands template tokens in a query string to their regex equivalents.
430
876
  *
431
- * Matches common bullet characters:
432
- * - (bullet)
433
- * - * (asterisk)
434
- * - ° (degree)
435
- * - - (dash)
877
+ * This is the simple version without capture support. It returns only the
878
+ * expanded pattern string, not capture metadata.
436
879
  *
437
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
880
+ * Unknown tokens are left as-is, allowing for partial templates.
881
+ *
882
+ * @param query - Template string containing `{{token}}` placeholders
883
+ * @returns Expanded regex pattern string
438
884
  *
439
885
  * @example
440
- * const regex = generateBulletRegex();
441
- * const match = regex.exec(' نقطة');
442
- * // match.groups.content -> 'نقطة'
886
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
887
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
888
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
889
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
890
+ *
891
+ * @see expandTokensWithCaptures for full capture group support
443
892
  */
444
- function generateBulletRegex() {
445
- return new RegExp("^(?<full>(?<marker>[•*°\\-]\\s?)(?<content>[\\s\\S]*))", "u");
446
- }
893
+ const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
447
894
  /**
448
- * Generates a regular expression for Markdown-style heading markers.
895
+ * Converts a template string to a compiled RegExp.
449
896
  *
450
- * Matches heading levels using hash symbols:
451
- * - # Heading 1
452
- * - ## Heading 2
453
- * - ### Heading 3
454
- * - etc.
897
+ * Expands all tokens and attempts to compile the result as a RegExp
898
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
455
899
  *
456
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
900
+ * @remarks
901
+ * This function dynamically compiles regular expressions from template strings.
902
+ * If templates may come from untrusted sources, be aware of potential ReDoS
903
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
904
+ * Consider validating pattern complexity or applying execution timeouts when
905
+ * running user-submitted patterns.
906
+ *
907
+ * @param template - Template string containing `{{token}}` placeholders
908
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
909
+ *
910
+ * @example
911
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
912
+ * templateToRegex('{{raqms}}+') // → null ('++' is an invalid quantifier in JavaScript regexes)
913
+ * templateToRegex('(((') // → null (invalid regex)
914
+ */
915
+ const templateToRegex = (template) => {
916
+ const expanded = expandTokens(template);
917
+ try {
918
+ return new RegExp(expanded, "u");
919
+ } catch {
920
+ return null;
921
+ }
922
+ };
923
+ /**
924
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
925
+ *
926
+ * Useful for documentation, validation, or building user interfaces
927
+ * that show available tokens.
928
+ *
929
+ * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
930
+ *
931
+ * @example
932
+ * getAvailableTokens()
933
+ * // → ['bab', 'basmalah', 'bullet', 'dash', 'fasl', 'harf', 'kitab', 'naql', 'raqm', 'raqms', 'tarqim', 'numbered']
934
+ */
935
+ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
936
+ /**
937
+ * Gets the regex pattern for a specific token name.
938
+ *
939
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
940
+ * without any expansion or capture group wrapping.
941
+ *
942
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
943
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
457
944
  *
458
945
  * @example
459
- * const regex = generateHeadingRegex();
460
- * const match = regex.exec('## عنوان فرعي');
461
- * // match.groups.marker -> '## '
462
- * // match.groups.content -> 'عنوان فرعي'
946
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
947
+ * getTokenPattern('dash') // → '[-–—ـ]'
948
+ * getTokenPattern('unknown') // → undefined
463
949
  */
464
- function generateHeadingRegex() {
465
- return new RegExp("^(?<full>(?<marker>#+\\s?)(?<content>[\\s\\S]*))", "u");
466
- }
950
+ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
467
951
 
468
952
  //#endregion
469
- //#region src/markers/generator.ts
470
- /**
471
- * Generates a regex pattern from a marker configuration.
472
- * Always returns a regex with three named capture groups:
473
- * - full: Complete match including marker
474
- * - marker: Just the marker part (for metadata/indexing)
475
- * - content: Clean content without marker (for LLM processing)
476
- *
477
- * This function applies all default values before delegating to type-specific generators.
478
- *
479
- * @param config - Marker configuration
480
- * @returns Regular expression with named groups
481
- *
482
- * @example
483
- * const regex = generateRegexFromMarker({ type: 'numbered' });
484
- * const match = regex.exec('٥ - نص');
485
- * match.groups.full // "٥ - نص"
486
- * match.groups.marker // "٥ -"
487
- * match.groups.content // "نص"
488
- */
489
- function generateRegexFromMarker(config) {
490
- const normalized = {
491
- numbering: config.numbering ?? DEFAULT_NUMBERING,
492
- separator: config.separator ?? DEFAULT_SEPARATOR,
493
- ...config
953
+ //#region src/segmentation/segmenter.ts
954
+ /**
955
+ * Core segmentation engine for splitting Arabic text pages into logical segments.
956
+ *
957
+ * The segmenter takes an array of pages and applies pattern-based rules to
958
+ * identify split points, producing segments with content, page references,
959
+ * and optional metadata.
960
+ *
961
+ * @module segmenter
962
+ */
963
+ /**
964
+ * Checks if a regex pattern contains standard (anonymous) capturing groups.
965
+ *
966
+ * Detects standard capturing groups `(...)` while excluding:
967
+ * - Non-capturing groups `(?:...)`
968
+ * - Lookahead assertions `(?=...)` and `(?!...)`
969
+ * - Lookbehind assertions `(?<=...)` and `(?<!...)`
970
+ * - Named groups `(?<name>...)` (start with `(?` so excluded here)
971
+ *
972
+ * **Note**: Named capture groups `(?<name>...)` ARE capturing groups but are
973
+ * excluded by this check because they are tracked separately via the
974
+ * `captureNames` array from token expansion. This function only detects
975
+ * anonymous capturing groups like `(.*)`.
976
+ *
977
+ * @param pattern - Regex pattern string to analyze
978
+ * @returns `true` if the pattern contains at least one anonymous capturing group
979
+ */
980
+ const hasCapturingGroup = (pattern) => {
981
+ return /\((?!\?)/.test(pattern);
982
+ };
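Concrete cases for this heuristic (editor's sketch; the last case shows the known false positive of a plain string test, an escaped literal parenthesis):

hasCapturingGroup('(.*)');         // true: anonymous capturing group
hasCapturingGroup('(?:abc)|x');    // false: non-capturing group only
hasCapturingGroup('(?<n>[0-9])');  // false: named groups are tracked via captureNames instead
hasCapturingGroup('a\\(b');        // true: the escaped '(' still satisfies /\((?!\?)/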
983
+ /**
984
+ * Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
985
+ *
986
+ * Fuzzy matching makes Arabic text diacritic-insensitive. When enabled, the
987
+ * transform is applied to token patterns BEFORE wrapping with capture groups,
988
+ * ensuring regex metacharacters (`(`, `)`, `|`, etc.) are not corrupted.
989
+ *
990
+ * @param pattern - Pattern string potentially containing `{{token}}` placeholders
991
+ * @param fuzzy - Whether to apply diacritic-insensitive transformation
992
+ * @returns Processed pattern with expanded tokens and capture names
993
+ *
994
+ * @example
995
+ * processPattern('{{raqms:num}} {{dash}}', false)
996
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ]', captureNames: ['num'] }
997
+ *
998
+ * @example
999
+ * processPattern('{{naql}}', true)
1000
+ * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
1001
+ */
1002
+ const processPattern = (pattern, fuzzy) => {
1003
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(pattern, fuzzy ? makeDiacriticInsensitive : void 0);
1004
+ return {
1005
+ captureNames,
1006
+ pattern: expanded
494
1007
  };
495
- switch (normalized.type) {
496
- case "pattern": return generatePatternRegex(normalized);
497
- case "bab": return generateBabRegex();
498
- case "hadith-chain": return generateHadithChainRegex(normalized);
499
- case "basmala": return generateBasmalaRegex();
500
- case "phrase": return generatePhraseRegex(normalized);
501
- case "square-bracket": return generateSquareBracketRegex();
502
- case "num-letter": return generateNumLetterRegex(normalized);
503
- case "num-paren": return generateNumParenRegex(normalized);
504
- case "num-slash": return generateNumSlashRegex(normalized);
505
- case "numbered": return generateNumberedRegex(normalized);
506
- case "bullet": return generateBulletRegex();
507
- case "heading": return generateHeadingRegex();
508
- default: {
509
- const _exhaustive = normalized.type;
510
- throw new Error(`Unknown marker type: ${_exhaustive}`);
1008
+ };
1009
+ /**
1010
+ * Builds a compiled regex and metadata from a split rule.
1011
+ *
1012
+ * Handles all pattern types:
1013
+ * - `regex`: Used as-is (no token expansion)
1014
+ * - `template`: Tokens expanded via `expandTokensWithCaptures`
1015
+ * - `lineStartsWith`: Converted to `^(?:patterns...)`
1016
+ * - `lineStartsAfter`: Converted to `^(?:patterns...)(.*)`
1017
+ * - `lineEndsWith`: Converted to `(?:patterns...)$`
1018
+ *
1019
+ * @param rule - Split rule containing pattern and options
1020
+ * @returns Compiled regex with capture metadata
1021
+ */
1022
+ const buildRuleRegex = (rule) => {
1023
+ const s = { ...rule };
1024
+ const fuzzy = rule.fuzzy ?? false;
1025
+ let allCaptureNames = [];
1026
+ /**
1027
+ * Safely compiles a regex pattern, throwing a helpful error if invalid.
1028
+ *
1029
+ * @remarks
1030
+ * This catches syntax errors only. It does NOT protect against ReDoS
1031
+ * (catastrophic backtracking) from pathological patterns. Avoid compiling
1032
+ * patterns from untrusted sources.
1033
+ */
1034
+ const compileRegex = (pattern) => {
1035
+ try {
1036
+ return new RegExp(pattern, "gmu");
1037
+ } catch (error) {
1038
+ const message = error instanceof Error ? error.message : String(error);
1039
+ throw new Error(`Invalid regex pattern: ${pattern}\n Cause: ${message}`);
511
1040
  }
1041
+ };
1042
+ if (s.lineStartsAfter?.length) {
1043
+ const processed = s.lineStartsAfter.map((p) => processPattern(p, fuzzy));
1044
+ const patterns = processed.map((p) => p.pattern).join("|");
1045
+ allCaptureNames = processed.flatMap((p) => p.captureNames);
1046
+ s.regex = `^(?:${patterns})(.*)`;
1047
+ return {
1048
+ captureNames: allCaptureNames,
1049
+ regex: compileRegex(s.regex),
1050
+ usesCapture: true,
1051
+ usesLineStartsAfter: true
1052
+ };
1053
+ }
1054
+ if (s.lineStartsWith?.length) {
1055
+ const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
1056
+ const patterns = processed.map((p) => p.pattern).join("|");
1057
+ allCaptureNames = processed.flatMap((p) => p.captureNames);
1058
+ s.template = `^(?:${patterns})`;
1059
+ }
1060
+ if (s.lineEndsWith?.length) {
1061
+ const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
1062
+ const patterns = processed.map((p) => p.pattern).join("|");
1063
+ allCaptureNames = processed.flatMap((p) => p.captureNames);
1064
+ s.template = `(?:${patterns})$`;
1065
+ }
1066
+ if (s.template) {
1067
+ const { pattern, captureNames } = expandTokensWithCaptures(s.template);
1068
+ s.regex = pattern;
1069
+ allCaptureNames = [...allCaptureNames, ...captureNames];
1070
+ }
1071
+ if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
1072
+ const usesCapture = hasCapturingGroup(s.regex) || allCaptureNames.length > 0;
1073
+ return {
1074
+ captureNames: allCaptureNames,
1075
+ regex: compileRegex(s.regex),
1076
+ usesCapture,
1077
+ usesLineStartsAfter: false
1078
+ };
1079
+ };
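To make the conversion concrete, here is what the lineStartsAfter branch yields for the hadith-number rule used in the segmentPages examples below (editor's sketch; the expanded token patterns are the ones shown earlier for raqms and dash):

const { regex, usesCapture, captureNames } = buildRuleRegex({
  lineStartsAfter: ['{{raqms:num}} {{dash}} '],
  split: 'at'
});
// regex.source === '^(?:(?<num>[٠-٩]+) [-–—ـ] )(.*)' with flags 'gmu'
// usesCapture === true, captureNames === ['num']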
1080
+ /**
1081
+ * Builds a concatenated content string and page mapping from input pages.
1082
+ *
1083
+ * Pages are joined with newline characters, and a page map is created to
1084
+ * track which page each offset belongs to. This allows pattern matching
1085
+ * across page boundaries while preserving page reference information.
1086
+ *
1087
+ * @param pages - Array of input pages with id and content
1088
+ * @returns Concatenated content string and page mapping utilities
1089
+ *
1090
+ * @example
1091
+ * const pages = [
1092
+ * { id: 1, content: 'Page 1 text' },
1093
+ * { id: 2, content: 'Page 2 text' }
1094
+ * ];
1095
+ * const { content, pageMap } = buildPageMap(pages);
1096
+ * // content = 'Page 1 text\nPage 2 text'
1097
+ * // pageMap.getId(0) = 1
1098
+ * // pageMap.getId(12) = 2
1099
+ */
1100
+ const buildPageMap = (pages) => {
1101
+ const boundaries = [];
1102
+ const pageBreaks = [];
1103
+ let offset = 0;
1104
+ const parts = [];
1105
+ for (let i = 0; i < pages.length; i++) {
1106
+ const normalized = normalizeLineEndings(pages[i].content);
1107
+ boundaries.push({
1108
+ end: offset + normalized.length,
1109
+ id: pages[i].id,
1110
+ start: offset
1111
+ });
1112
+ parts.push(normalized);
1113
+ if (i < pages.length - 1) {
1114
+ pageBreaks.push(offset + normalized.length);
1115
+ offset += normalized.length + 1;
1116
+ } else offset += normalized.length;
1117
+ }
1118
+ /**
1119
+ * Finds the page boundary containing the given offset using binary search.
1120
+ * O(log n) complexity for efficient lookup with many pages.
1121
+ *
1122
+ * @param off - Character offset to look up
1123
+ * @returns Page boundary or the last boundary as fallback
1124
+ */
1125
+ const findBoundary = (off) => {
1126
+ let lo = 0;
1127
+ let hi = boundaries.length - 1;
1128
+ while (lo <= hi) {
1129
+ const mid = lo + hi >>> 1;
1130
+ const b = boundaries[mid];
1131
+ if (off < b.start) hi = mid - 1;
1132
+ else if (off > b.end) lo = mid + 1;
1133
+ else return b;
1134
+ }
1135
+ return boundaries[boundaries.length - 1];
1136
+ };
1137
+ return {
1138
+ content: parts.join("\n"),
1139
+ normalizedPages: parts,
1140
+ pageMap: {
1141
+ boundaries,
1142
+ getId: (off) => findBoundary(off)?.id ?? 0,
1143
+ pageBreaks,
1144
+ pageIds: boundaries.map((b) => b.id)
1145
+ }
1146
+ };
1147
+ };
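A tiny worked example, following directly from the loop above (editor's sketch; note that page ids need not be contiguous, which matters for the maxPages window later):

const { content, pageMap } = buildPageMap([
  { id: 7, content: 'أول' },
  { id: 9, content: 'ثان' }
]);
// content === 'أول\nثان', pageMap.pageBreaks === [3]
// pageMap.getId(0) === 7, pageMap.getId(4) === 9, pageMap.pageIds === [7, 9]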
1148
+ /**
1149
+ * Executes a regex against content and extracts match results with capture information.
1150
+ *
1151
+ * @param content - Full content string to search
1152
+ * @param regex - Compiled regex with 'g' flag
1153
+ * @param usesCapture - Whether to extract captured content
1154
+ * @param captureNames - Names of expected named capture groups
1155
+ * @returns Array of match results with positions and captures
1156
+ */
1157
+ const findMatches = (content, regex, usesCapture, captureNames) => {
1158
+ const matches = [];
1159
+ regex.lastIndex = 0;
1160
+ let m = regex.exec(content);
1161
+ while (m !== null) {
1162
+ const result = {
1163
+ end: m.index + m[0].length,
1164
+ start: m.index
1165
+ };
1166
+ result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
1167
+ if (usesCapture) result.captured = getLastPositionalCapture(m);
1168
+ matches.push(result);
1169
+ if (m[0].length === 0) regex.lastIndex++;
1170
+ m = regex.exec(content);
1171
+ }
1172
+ return matches;
1173
+ };
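The zero-length guard near the end is what keeps this loop from spinning on patterns that can match the empty string (editor's sketch):

const ms = findMatches('a\nb', /^/gm, false, []);
// Two zero-width matches, one per line start: { start: 0, end: 0 } and { start: 2, end: 2 }.
// Without the lastIndex increment, exec() would return the offset-0 match forever.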
1174
+ /**
1175
+ * Finds page breaks within a given offset range using binary search.
1176
+ * O(log n + k) where n = total breaks, k = breaks in range.
1177
+ *
1178
+ * @param startOffset - Start of range (inclusive)
1179
+ * @param endOffset - End of range (exclusive)
1180
+ * @param sortedBreaks - Sorted array of page break offsets
1181
+ * @returns Array of break offsets relative to startOffset
1182
+ */
1183
+ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
1184
+ if (sortedBreaks.length === 0) return [];
1185
+ let lo = 0;
1186
+ let hi = sortedBreaks.length;
1187
+ while (lo < hi) {
1188
+ const mid = lo + hi >>> 1;
1189
+ if (sortedBreaks[mid] < startOffset) lo = mid + 1;
1190
+ else hi = mid;
1191
+ }
1192
+ const result = [];
1193
+ for (let i = lo; i < sortedBreaks.length && sortedBreaks[i] < endOffset; i++) result.push(sortedBreaks[i] - startOffset);
1194
+ return result;
1195
+ };
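The lower-bound search plus the bounded scan can be verified by hand (editor's sketch):

findBreaksInRange(4, 26, [3, 10, 25]); // → [6, 21]: breaks 10 and 25 fall in [4, 26), rebased to startOffset
findBreaksInRange(4, 10, [3, 10, 25]); // → []: break 10 is excluded because endOffset is exclusive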
1196
+ /**
1197
+ * Converts page-break newlines to spaces in segment content.
1198
+ *
1199
+ * When a segment spans multiple pages, the newline characters that were
1200
+ * inserted as page separators during concatenation are converted to spaces
1201
+ * for more natural reading.
1202
+ *
1203
+ * Uses binary search for O(log n + k) lookup instead of O(n) iteration.
1204
+ *
1205
+ * @param content - Segment content string
1206
+ * @param startOffset - Starting offset of this content in concatenated string
1207
+ * @param pageBreaks - Sorted array of page break offsets
1208
+ * @returns Content with page-break newlines converted to spaces
1209
+ */
1210
+ const convertPageBreaks = (content, startOffset, pageBreaks) => {
1211
+ const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
1212
+ if (breaksInRange.length === 0) return content;
1213
+ const breakSet = new Set(breaksInRange);
1214
+ return content.replace(/\n/g, (match, offset) => breakSet.has(offset) ? " " : match);
1215
+ };
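Only a newline sitting exactly on a recorded page break is rewritten; newlines inside a page survive. Editor's sketch (the string 'سطر\nتابع' has its newline at relative offset 3):

convertPageBreaks('سطر\nتابع', 5, [8]); // 5 + 3 = 8 is a recorded break → 'سطر تابع'
convertPageBreaks('سطر\nتابع', 0, [8]); // the newline sits at absolute offset 3, not a break → unchanged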
1216
+ /**
1217
+ * Applies breakpoints to oversized segments.
1218
+ *
1219
+ * For each segment that spans more than maxPages, the breakpoint patterns are tried
1220
+ * in order to find a suitable split point. Structural markers (from rules) are
1221
+ * always respected - segments are only broken within their boundaries.
1222
+ *
1223
+ * @param segments - Initial segments from rule processing
1224
+ * @param pages - Original pages for page lookup
1225
+ * @param maxPages - Maximum pages before breakpoints apply
1226
+ * @param breakpoints - Patterns to try in order (tokens supported)
1227
+ * @param prefer - 'longer' for last match, 'shorter' for first match
1228
+ * @returns Processed segments with oversized ones broken up
1229
+ */
1230
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer) => {
1231
+ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
1232
+ const startingPageId = pageIds$1[currentFromIdx];
1233
+ if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
1234
+ for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
1235
+ const pageId = pageIds$1[pageIdx];
1236
+ if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets$1[pageIdx] - cumulativeOffsets$1[currentFromIdx];
1237
+ }
1238
+ return -1;
1239
+ };
1240
+ const pageIds = pages.map((p) => p.id);
1241
+ const pageIdToIndex = new Map(pageIds.map((id, i) => [id, i]));
1242
+ const normalizedPages = /* @__PURE__ */ new Map();
1243
+ for (let i = 0; i < pages.length; i++) {
1244
+ const content = normalizedContent[i];
1245
+ normalizedPages.set(pages[i].id, {
1246
+ content,
1247
+ index: i,
1248
+ length: content.length
1249
+ });
1250
+ }
1251
+ const cumulativeOffsets = [0];
1252
+ let totalOffset = 0;
1253
+ for (let i = 0; i < pageIds.length; i++) {
1254
+ const pageData = normalizedPages.get(pageIds[i]);
1255
+ totalOffset += pageData ? pageData.length : 0;
1256
+ if (i < pageIds.length - 1) totalOffset += 1;
1257
+ cumulativeOffsets.push(totalOffset);
512
1258
  }
513
- }
1259
+ const patternProcessor = (p) => processPattern(p, false).pattern;
1260
+ const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
1261
+ const result = [];
1262
+ for (const segment of segments) {
1263
+ const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
1264
+ const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
1265
+ const segmentSpan = (segment.to ?? segment.from) - segment.from;
1266
+ const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
1267
+ if (segmentSpan <= maxPages && !hasExclusions) {
1268
+ result.push(segment);
1269
+ continue;
1270
+ }
1271
+ let remainingContent = segment.content;
1272
+ let currentFromIdx = fromIdx;
1273
+ let isFirstPiece = true;
1274
+ while (currentFromIdx <= toIdx) {
1275
+ const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
1276
+ const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
1277
+ if (remainingSpan <= maxPages && !remainingHasExclusions) {
1278
+ const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1279
+ if (finalSeg) result.push(finalSeg);
1280
+ break;
1281
+ }
1282
+ const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
1283
+ let windowEndIdx = currentFromIdx;
1284
+ for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
1285
+ else break;
1286
+ const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
1287
+ let breakPosition = -1;
1288
+ if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
1289
+ if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, {
1290
+ cumulativeOffsets,
1291
+ expandedBreakpoints,
1292
+ normalizedPages,
1293
+ pageIds,
1294
+ prefer
1295
+ });
1296
+ if (breakPosition <= 0) {
1297
+ if (windowEndIdx === currentFromIdx) {
1298
+ const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
1299
+ const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
1300
+ if (pageSeg) result.push(pageSeg);
1301
+ remainingContent = remainingContent.slice(pageContent.length).trim();
1302
+ currentFromIdx++;
1303
+ isFirstPiece = false;
1304
+ continue;
1305
+ }
1306
+ breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
1307
+ }
1308
+ const pieceContent = remainingContent.slice(0, breakPosition).trim();
1309
+ const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
1310
+ const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
1311
+ if (pieceContent) {
1312
+ const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1313
+ if (pieceSeg) result.push(pieceSeg);
1314
+ }
1315
+ remainingContent = remainingContent.slice(breakPosition).trim();
1316
+ let nextFromIdx = actualEndIdx;
1317
+ if (remainingContent && actualEndIdx + 1 <= toIdx) {
1318
+ const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
1319
+ if (nextPageData) {
1320
+ const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
1321
+ if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
1322
+ }
1323
+ }
1324
+ currentFromIdx = nextFromIdx;
1325
+ isFirstPiece = false;
1326
+ }
1327
+ }
1328
+ return result;
1329
+ };
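Callers never invoke applyBreakpoints directly; it is driven by the maxPages, breakpoints, and prefer options of segmentPages below. A hedged sketch of that wiring (editor's addition; the breakpoint patterns here are illustrative, and plain pattern strings are assumed to be accepted since each entry is run through processPattern):

const segs = segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{kitab}}'], split: 'at' }],
  maxPages: 5,                    // re-break any segment spanning more than 5 pages
  breakpoints: ['\\n\\n', '\\.'], // try paragraph gaps first, then sentence ends
  prefer: 'longer'                // take the last match in the window, not the first
});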
1330
+ /**
1331
+ * Segments pages of content based on pattern-matching rules.
1332
+ *
1333
+ * This is the main entry point for the segmentation engine. It takes an array
1334
+ * of pages and applies the provided rules to identify split points, producing
1335
+ * an array of segments with content, page references, and metadata.
1336
+ *
1337
+ * @param pages - Array of pages with id and content
1338
+ * @param options - Segmentation options including splitting rules
1339
+ * @returns Array of segments with content, from/to page references, and optional metadata
1340
+ *
1341
+ * @example
1342
+ * // Split markdown by headers
1343
+ * const segments = segmentPages(pages, {
1344
+ * rules: [
1345
+ * { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
1346
+ * ]
1347
+ * });
1348
+ *
1349
+ * @example
1350
+ * // Split Arabic hadith text with number extraction
1351
+ * const segments = segmentPages(pages, {
1352
+ * rules: [
1353
+ * {
1354
+ * lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
1355
+ * split: 'at',
1356
+ * fuzzy: true,
1357
+ * meta: { type: 'hadith' }
1358
+ * }
1359
+ * ]
1360
+ * });
1361
+ *
1362
+ * @example
1363
+ * // Multiple rules with page constraints
1364
+ * const segments = segmentPages(pages, {
1365
+ * rules: [
1366
+ * { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
1367
+ * { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
1368
+ * { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
1369
+ * ]
1370
+ * });
1371
+ */
1372
+ const segmentPages = (pages, options) => {
1373
+ const { rules = [], maxPages, breakpoints, prefer = "longer" } = options;
1374
+ if (!pages.length) return [];
1375
+ const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
1376
+ const splitPoints = [];
1377
+ for (const rule of rules) {
1378
+ const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
1379
+ const finalMatches = filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence);
1380
+ for (const m of finalMatches) {
1381
+ const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
1382
+ const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
1383
+ splitPoints.push({
1384
+ capturedContent: isLineStartsAfter ? void 0 : m.captured,
1385
+ contentStartOffset: isLineStartsAfter ? markerLength : void 0,
1386
+ index: rule.split === "at" ? m.start : m.end,
1387
+ meta: rule.meta,
1388
+ namedCaptures: m.namedCaptures
1389
+ });
1390
+ }
1391
+ }
1392
+ const byIndex = /* @__PURE__ */ new Map();
1393
+ for (const p of splitPoints) {
1394
+ const existing = byIndex.get(p.index);
1395
+ if (!existing) byIndex.set(p.index, p);
1396
+ else if (p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
1397
+ }
1398
+ const unique = [...byIndex.values()];
1399
+ unique.sort((a, b) => a.index - b.index);
1400
+ let segments = buildSegments(unique, matchContent, pageMap, rules);
1401
+ if (segments.length === 0 && pages.length > 0) {
1402
+ const firstPage = pages[0];
1403
+ const lastPage = pages[pages.length - 1];
1404
+ const initialSeg = {
1405
+ content: pages.map((p) => normalizeLineEndings(p.content)).join("\n").trim(),
1406
+ from: firstPage.id
1407
+ };
1408
+ if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
1409
+ if (initialSeg.content) segments = [initialSeg];
1410
+ }
1411
+ if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer);
1412
+ return segments;
1413
+ };
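Putting the pipeline together end to end (editor's sketch; the expected output shape follows from buildSegments below, assuming the capture helpers behave as their names suggest):

const pages = [
  { id: 1, content: '١ - حديث أول' },
  { id: 2, content: '٢ - حديث ثان' }
];
segmentPages(pages, {
  rules: [{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }]
});
// → [
//   { content: 'حديث أول', from: 1, meta: { num: '١' } },
//   { content: 'حديث ثان', from: 2, meta: { num: '٢' } }
// ]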
1414
+ /**
1415
+ * Creates segment objects from split points.
1416
+ *
1417
+ * Handles segment creation including:
1418
+ * - Content extraction (with captured content for `lineStartsAfter`)
1419
+ * - Page break conversion to spaces
1420
+ * - From/to page reference calculation
1421
+ * - Metadata merging (static + named captures)
1422
+ *
1423
+ * @param splitPoints - Sorted, unique split points
1424
+ * @param content - Full concatenated content string
1425
+ * @param pageMap - Page mapping utilities
1426
+ * @param rules - Original rules (for constraint checking on first segment)
1427
+ * @returns Array of segment objects
1428
+ */
1429
+ const buildSegments = (splitPoints, content, pageMap, rules) => {
1430
+ /**
1431
+ * Creates a single segment from a content range.
1432
+ */
1433
+ const createSegment$1 = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
1434
+ const actualStart = start + (contentStartOffset ?? 0);
1435
+ const sliced = content.slice(actualStart, end);
1436
+ let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
1437
+ if (!text) return null;
1438
+ if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks);
1439
+ const from = pageMap.getId(actualStart);
1440
+ const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(actualStart + text.length - 1);
1441
+ const seg = {
1442
+ content: text,
1443
+ from
1444
+ };
1445
+ if (to !== from) seg.to = to;
1446
+ if (meta || namedCaptures) seg.meta = {
1447
+ ...meta,
1448
+ ...namedCaptures
1449
+ };
1450
+ return seg;
1451
+ };
1452
+ /**
1453
+ * Creates segments from an array of split points.
1454
+ */
1455
+ const createSegmentsFromSplitPoints = () => {
1456
+ const result = [];
1457
+ for (let i = 0; i < splitPoints.length; i++) {
1458
+ const sp = splitPoints[i];
1459
+ const end = i < splitPoints.length - 1 ? splitPoints[i + 1].index : content.length;
1460
+ const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
1461
+ if (s) result.push(s);
1462
+ }
1463
+ return result;
1464
+ };
1465
+ const segments = [];
1466
+ if (!splitPoints.length) {
1467
+ if (anyRuleAllowsId(rules, pageMap.getId(0))) {
1468
+ const s = createSegment$1(0, content.length);
1469
+ if (s) segments.push(s);
1470
+ }
1471
+ return segments;
1472
+ }
1473
+ if (splitPoints[0].index > 0) {
1474
+ if (anyRuleAllowsId(rules, pageMap.getId(0))) {
1475
+ const s = createSegment$1(0, splitPoints[0].index);
1476
+ if (s) segments.push(s);
1477
+ }
1478
+ }
1479
+ return [...segments, ...createSegmentsFromSplitPoints()];
1480
+ };
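One consequence of the leading-segment branch above: content that precedes the first split point is kept rather than dropped (editor's sketch, assuming a rule without page constraints so that anyRuleAllowsId passes):

segmentPages([{ id: 1, content: 'مقدمة\nباب الأول' }], {
  rules: [{ lineStartsWith: ['باب'], split: 'at' }]
});
// → [{ content: 'مقدمة', from: 1 }, { content: 'باب الأول', from: 1 }]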
514
1481
 
515
1482
  //#endregion
516
- export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, NUMBERING_PATTERNS, SEPARATOR_PATTERNS, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
1483
+ export { TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
517
1484
  //# sourceMappingURL=index.mjs.map