flappa-doormal 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,460 +1,850 @@
1
- //#region src/types.d.ts
1
+ //#region src/segmentation/fuzzy.d.ts
2
2
  /**
3
- * Numbering styles for markers
4
- */
5
- type NumberingStyle = 'arabic-indic' | 'latin';
6
- /**
7
- * Separator styles for markers
3
+ * Fuzzy matching utilities for Arabic text.
4
+ *
5
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
6
+ * This allows matching text regardless of:
7
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
8
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
9
+ *
10
+ * @module fuzzy
11
+ *
12
+ * @example
13
+ * // Make a pattern diacritic-insensitive
14
+ * const pattern = makeDiacriticInsensitive('حدثنا');
15
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
8
16
  */
9
- type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';
10
17
  /**
11
- * Marker types for text segmentation
18
+ * Escapes a string for safe inclusion in a regular expression.
19
+ *
20
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
21
+ *
22
+ * @param s - Any string to escape
23
+ * @returns String with regex metacharacters escaped
24
+ *
25
+ * @example
26
+ * escapeRegex('hello.world') // → 'hello\\.world'
27
+ * escapeRegex('[test]') // → '\\[test\\]'
28
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
12
29
  */
13
- type MarkerType = 'numbered' | 'bullet' | 'heading' | 'pattern' | 'bab' | 'hadith-chain' | 'basmala' | 'phrase' | 'square-bracket' | 'num-letter' | 'num-paren' | 'num-slash';
30
+ declare const escapeRegex: (s: string) => string;
14
31
  /**
15
- * Configuration for a single marker pattern
16
- */
17
- type MarkerConfig = {
18
- /** The type of marker to look for */
19
- type: MarkerType;
20
- /** For numbered markers, the digit style */
21
- numbering?: NumberingStyle;
22
- /** The separator that follows the marker */
23
- separator?: SeparatorStyle | string;
24
- /**
25
- * Template format for numbered markers using token syntax.
26
- * Example: '{bullet}+ {num} {dash}'
27
- * Only valid when type is 'numbered'.
28
- */
29
- format?: string;
30
- /**
31
- * For 'pattern' type, provide a template using tokens like {num}, {dash}, {bullet}.
32
- * For raw regex patterns that don't use templates, provide the raw pattern string here.
33
- * Example: '{bullet}? {num}+ {s}{dash}' or '^[•*°]? ([\\u0660-\\u0669]+\\s?[-–—ـ].*)'
34
- */
35
- template?: string;
36
- /**
37
- * Alternative to template: raw regex pattern string (for 'pattern' type only).
38
- * Use this for complex patterns that can't be expressed with templates.
39
- * The pattern should have a capture group for the content.
40
- * Example: '^CUSTOM: (.*)'
41
- */
42
- pattern?: string;
43
- /**
44
- * Custom token map for advanced users.
45
- * Extends the default TOKENS with additional definitions.
46
- */
47
- tokens?: Record<string, string>;
48
- /**
49
- * List of phrases for 'phrase' and 'hadith-chain' types.
50
- * For 'hadith-chain', defaults to common narrator patterns if not provided.
51
- */
52
- phrases?: string[];
53
- /**
54
- * Optional: Only apply this marker after a specific page number.
55
- * Useful for books with different formatting in front matter vs main content.
56
- */
57
- minPage?: number;
58
- /**
59
- * Optional: Arbitrary metadata to attach to entries matched by this marker.
60
- * This allows for agnostic handling of entry properties.
61
- * Example: { type: 0, category: 'hadith' }
62
- */
63
- metadata?: Record<string, any>;
64
- };
32
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
33
+ *
34
+ * Transforms input text into a regex pattern that matches the text regardless
35
+ * of diacritical marks (harakat) and character variations. Each character in
36
+ * the input is:
37
+ * 1. Expanded to its equivalence class (if applicable)
38
+ * 2. Followed by an optional diacritics matcher
39
+ *
40
+ * This allows matching:
41
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
42
+ * - `الإيمان` with `الايمان` (alef variants)
43
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
44
+ *
45
+ * @param text - Input Arabic text to make diacritic-insensitive
46
+ * @returns Regex pattern string that matches the text with or without diacritics
47
+ *
48
+ * @example
49
+ * const pattern = makeDiacriticInsensitive('حدثنا');
50
+ * // Each char gets equivalence class + optional diacritics
51
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
52
+ *
53
+ * @example
54
+ * const pattern = makeDiacriticInsensitive('باب');
55
+ * new RegExp(pattern, 'u').test('بَابٌ') // true
56
+ * new RegExp(pattern, 'u').test('باب') // true
57
+ *
58
+ * @example
59
+ * // Using with split rules
60
+ * {
61
+ * lineStartsWith: ['باب'],
62
+ * split: 'at',
63
+ * fuzzy: true // Applies makeDiacriticInsensitive internally
64
+ * }
65
+ */
66
+ declare const makeDiacriticInsensitive: (text: string) => string;
65
67
  //#endregion
66
- //#region src/markers/defaults.d.ts
68
+ //#region src/segmentation/types.d.ts
67
69
  /**
68
- * Default numbering style for markers
69
- */
70
- declare const DEFAULT_NUMBERING: NumberingStyle;
71
- /**
72
- * Default separator style for markers
73
- */
74
- declare const DEFAULT_SEPARATOR: SeparatorStyle;
75
- /**
76
- * Default separator pattern (used when separator is a custom string)
70
+ * Literal regex pattern rule - no token expansion is applied.
71
+ *
72
+ * Use this when you need full control over the regex pattern.
73
+ * If the regex contains capturing groups, the captured content
74
+ * will be used as the segment content.
75
+ *
76
+ * @example
77
+ * // Match Arabic-Indic numbers followed by a dash
78
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
79
+ *
80
+ * @example
81
+ * // Capture group - content after the marker becomes segment content
82
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
77
83
  */
78
- declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";
84
+ type RegexPattern = {
85
+ /** Raw regex pattern string (no token expansion) */
86
+ regex: string;
87
+ };
79
88
  /**
80
- * Numbering patterns mapped by style
89
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
90
+ *
91
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
92
+ *
93
+ * @example
94
+ * // Using tokens for Arabic-Indic digits
95
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
96
+ *
97
+ * @example
98
+ * // Named capture to extract hadith number into metadata
99
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
100
+ *
101
+ * @see TOKEN_PATTERNS for available tokens
81
102
  */
82
- declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;
103
+ type TemplatePattern = {
104
+ /** Template string with `{{token}}` or `{{token:name}}` placeholders */
105
+ template: string;
106
+ };
83
107
  /**
84
- * Separator patterns mapped by style
108
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
109
+ *
110
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker
111
+ * is **included** in the segment content.
112
+ *
113
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
114
+ * diacritic-insensitive Arabic matching.
115
+ *
116
+ * @example
117
+ * // Split at chapter headings (marker included in content)
118
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
119
+ *
120
+ * @example
121
+ * // Split at Arabic book/chapter markers with fuzzy matching
122
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
85
123
  */
86
- declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
87
- //#endregion
88
- //#region src/markers/generator.d.ts
124
+ type LineStartsWithPattern = {
125
+ /** Array of patterns that mark line beginnings (marker included in content) */
126
+ lineStartsWith: string[];
127
+ };
89
128
  /**
90
- * Generates a regex pattern from a marker configuration.
91
- * Always returns a regex with three named capture groups:
92
- * - full: Complete match including marker
93
- * - marker: Just the marker part (for metadata/indexing)
94
- * - content: Clean content without marker (for LLM processing)
129
+ * Line-start-after pattern rule - matches lines starting with patterns,
130
+ * but **excludes** the marker from the segment content.
131
+ *
132
+ * Behaves like `lineStartsWith` but strips the marker from the output.
133
+ * The segment content starts after the marker and extends to the next split point
134
+ * (not just the end of the matching line).
95
135
  *
96
- * This function applies all default values before delegating to type-specific generators.
136
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
137
+ * diacritic-insensitive Arabic matching.
97
138
  *
98
- * @param config - Marker configuration
99
- * @returns Regular expression with named groups
139
+ * @example
140
+ * // Split at numbered hadiths, capturing content without the number prefix
141
+ * // Content extends to next split, not just end of that line
142
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
100
143
  *
101
144
  * @example
102
- * const regex = generateRegexFromMarker({ type: 'numbered' });
103
- * const match = regex.exec('٥ - نص');
104
- * match.groups.full // "٥ - نص"
105
- * match.groups.marker // "٥ -"
106
- * match.groups.content // "نص"
145
+ * // Extract hadith number to metadata while stripping the prefix
146
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
107
147
  */
108
- declare function generateRegexFromMarker(config: MarkerConfig): RegExp;
109
- //#endregion
110
- //#region src/markers/presets.d.ts
148
+ type LineStartsAfterPattern = {
149
+ /** Array of patterns that mark line beginnings (marker excluded from content) */
150
+ lineStartsAfter: string[];
151
+ };
111
152
  /**
112
- * Default phrase lists for preset marker types.
113
- * Export these so users can extend them.
153
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
154
+ *
155
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`.
156
+ *
157
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
158
+ * diacritic-insensitive Arabic matching.
159
+ *
160
+ * @example
161
+ * // Split at lines ending with Arabic sentence-ending punctuation
162
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
114
163
  */
164
+ type LineEndsWithPattern = {
165
+ /** Array of patterns that mark line endings */
166
+ lineEndsWith: string[];
167
+ };
115
168
  /**
116
- * Common hadith narrator phrases (diacritic-insensitive)
117
- * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
169
+ * Union of all pattern types for split rules.
170
+ *
171
+ * Each rule must have exactly ONE pattern type:
172
+ * - `regex` - Raw regex pattern (no token expansion)
173
+ * - `template` - Pattern with `{{token}}` expansion
174
+ * - `lineStartsWith` - Match line beginnings (marker included)
175
+ * - `lineStartsAfter` - Match line beginnings (marker excluded)
176
+ * - `lineEndsWith` - Match line endings
118
177
  */
119
- declare const DEFAULT_HADITH_PHRASES: readonly ["حَدَّثَنَا", "حدثنا", "أَخْبَرَنَا", "حدثني", "حدَّثني", "وحدثنا", "حُدِّثت عن", "وحَدَّثَنَا"];
178
+ type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
120
179
  /**
121
- * Common basmala patterns
122
- * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
180
+ * Configuration for how and where to split content when a pattern matches.
181
+ *
182
+ * Controls the split position relative to matches, which occurrences to
183
+ * split on, page span limits, and fuzzy matching for Arabic text.
123
184
  */
124
- declare const DEFAULT_BASMALA_PATTERNS: readonly ["بسم الله", "\\[بسم", "\\[تم"];
125
- //#endregion
126
- //#region src/markers/tokens.d.ts
185
+ type SplitBehavior = {
186
+ /**
187
+ * Where to split relative to the match.
188
+ * - `'at'`: New segment starts at the match position
189
+ * - `'after'`: New segment starts after the match ends
190
+ */
191
+ split: 'at' | 'after';
192
+ /**
193
+ * Which occurrence(s) to split on.
194
+ * - `'all'`: Split at every match (default)
195
+ * - `'first'`: Only split at the first match
196
+ * - `'last'`: Only split at the last match
197
+ *
198
+ * When `maxSpan` is set, occurrence filtering is applied per sliding
199
+ * window rather than globally. With `'last'`, the algorithm prefers
200
+ * longer segments by looking as far ahead as allowed before selecting
201
+ * the last match in the window.
202
+ *
203
+ * @default 'all'
204
+ */
205
+ occurrence?: 'first' | 'last' | 'all';
206
+ /**
207
+ * Maximum page ID difference allowed when looking ahead for split points.
208
+ *
209
+ * Uses a sliding window algorithm that prefers longer segments:
210
+ * 1. Start from the first page of the current segment
211
+ * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
212
+ * 3. Apply occurrence filter (e.g., 'last') to select a match
213
+ * 4. Next window starts from the page after the match
214
+ *
215
+ * Examples:
216
+ * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
217
+ * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
218
+ * - `undefined` = no limit (entire content treated as one group)
219
+ *
220
+ * Note: With non-consecutive page IDs, the algorithm uses actual ID
221
+ * difference, not array index. Pages 1 and 5 have a difference of 4.
222
+ *
223
+ * @example
224
+ * // Split at last period, looking up to 1 page ahead
225
+ * // Pages 1,2: split at page 2's last period
226
+ * // Page 3: split at page 3's last period
227
+ * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
228
+ */
229
+ maxSpan?: number;
230
+ /**
231
+ * Enable diacritic-insensitive matching for Arabic text.
232
+ *
233
+ * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
234
+ * `lineStartsAfter` are transformed to match text regardless of:
235
+ * - Diacritics (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، etc.
236
+ * - Character equivalences: ا/آ/أ/إ, ة/ه, ى/ي
237
+ *
238
+ * **Note**: Does NOT apply to `regex` or `template` patterns.
239
+ * For templates, apply fuzzy manually using `makeDiacriticInsensitive()`.
240
+ *
241
+ * @default false
242
+ */
243
+ fuzzy?: boolean;
244
+ };
127
245
  /**
128
- * Token definitions for pattern templates.
129
- * Tokens provide a readable alternative to raw regex patterns.
246
+ * A single page ID or a range of page IDs.
247
+ *
248
+ * - `number`: A single page ID
249
+ * - `[number, number]`: A range from first to second (inclusive)
250
+ *
251
+ * @example
252
+ * 5 // Single page 5
253
+ * [10, 20] // Pages 10 through 20 (inclusive)
130
254
  */
255
+ type PageRange = number | [number, number];
131
256
  /**
132
- * Standard tokens for building marker patterns.
133
- * Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
257
+ * Optional constraints and metadata for a split rule.
258
+ *
259
+ * Use constraints to limit which pages a rule applies to, and
260
+ * metadata to attach arbitrary data to resulting segments.
134
261
  */
135
- declare const TOKENS: {
136
- readonly bullet: "[•*°]";
137
- readonly colon: ":";
138
- readonly comma: "،";
139
- readonly content: "(.*)";
140
- readonly dash: "[-–—ـ]";
141
- readonly dot: "\\.";
142
- readonly latin: "\\d+";
143
- readonly letter: "[أ-ي]";
144
- readonly num: "[\\u0660-\\u0669]+";
145
- readonly paren: "\\)";
146
- readonly s: "\\s?";
147
- readonly slash: "/";
148
- readonly space: "\\s+";
262
+ type RuleConstraints = {
263
+ /**
264
+ * Minimum page ID for this rule to apply.
265
+ *
266
+ * Matches on pages with `id < min` are ignored.
267
+ *
268
+ * @example
269
+ * // Only apply rule starting from page 10
270
+ * { min: 10, lineStartsWith: ['##'], split: 'at' }
271
+ */
272
+ min?: number;
273
+ /**
274
+ * Maximum page ID for this rule to apply.
275
+ *
276
+ * Matches on pages with `id > max` are ignored.
277
+ *
278
+ * @example
279
+ * // Only apply rule up to page 100
280
+ * { max: 100, lineStartsWith: ['##'], split: 'at' }
281
+ */
282
+ max?: number;
283
+ /**
284
+ * Specific pages or page ranges to exclude from this rule.
285
+ *
286
+ * Use this to skip the rule for specific pages without needing
287
+ * to repeat the rule with different min/max values.
288
+ *
289
+ * @example
290
+ * // Exclude specific pages
291
+ * { exclude: [1, 2, 5] }
292
+ *
293
+ * @example
294
+ * // Exclude page ranges
295
+ * { exclude: [[1, 10], [50, 100]] }
296
+ *
297
+ * @example
298
+ * // Mix single pages and ranges
299
+ * { exclude: [1, [5, 10], 50] }
300
+ */
301
+ exclude?: PageRange[];
302
+ /**
303
+ * Arbitrary metadata attached to segments matching this rule.
304
+ *
305
+ * This metadata is merged with any named captures from the pattern.
306
+ * Named captures (e.g., `{{raqms:num}}`) take precedence over
307
+ * static metadata with the same key.
308
+ *
309
+ * @example
310
+ * // Tag segments as chapters
311
+ * { lineStartsWith: ['{{bab}}'], split: 'at', meta: { type: 'chapter' } }
312
+ */
313
+ meta?: Record<string, unknown>;
314
+ /**
315
+ * Fallback behavior when no matches are found within a maxSpan boundary.
316
+ * - 'page': Create split points at page boundaries
317
+ * - undefined: No fallback (default)
318
+ */
319
+ fallback?: 'page';
149
320
  };
150
- type TokenMap = Record<string, string>;
151
- //#endregion
152
- //#region src/markers/template-parser.d.ts
153
321
  /**
154
- * Result of template validation
322
+ * A complete split rule combining pattern, behavior, and constraints.
323
+ *
324
+ * Each rule must specify:
325
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
326
+ * `lineStartsAfter`, or `lineEndsWith`
327
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
328
+ * - **Constraints** (optional): `min`, `max`, `exclude`, `meta`, `fallback`
329
+ *
330
+ * @example
331
+ * // Basic rule: split at markdown headers
332
+ * const rule: SplitRule = {
333
+ * lineStartsWith: ['## ', '### '],
334
+ * split: 'at',
335
+ * meta: { type: 'section' }
336
+ * };
337
+ *
338
+ * @example
339
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
340
+ * const rule: SplitRule = {
341
+ * lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
342
+ * split: 'at',
343
+ * fuzzy: true,
344
+ * min: 5,
345
+ * max: 500,
346
+ * meta: { type: 'hadith' }
347
+ * };
155
348
  */
156
- interface ValidationResult {
157
- valid: boolean;
158
- errors?: string[];
159
- }
349
+ type SplitRule = PatternType & SplitBehavior & RuleConstraints;
160
350
  /**
161
- * Options for template expansion
351
+ * Input page structure for segmentation.
352
+ *
353
+ * Each page represents a logical unit of content (e.g., a book page,
354
+ * a document section) that can be tracked across segment boundaries.
355
+ *
356
+ * @example
357
+ * const pages: Page[] = [
358
+ * { id: 1, content: '## Chapter 1\nFirst paragraph...' },
359
+ * { id: 2, content: 'Continued text...\n## Chapter 2' },
360
+ * ];
162
361
  */
163
- interface ExpandOptions {
164
- /** Custom token map to use instead of default TOKENS */
165
- tokens?: TokenMap;
166
- }
362
+ type Page = {
363
+ /**
364
+ * Unique page/entry ID used for:
365
+ * - `maxSpan` grouping (segments spanning multiple pages)
366
+ * - `min`/`max` constraint filtering
367
+ * - `from`/`to` tracking in output segments
368
+ */
369
+ id: number;
370
+ /**
371
+ * Raw page content (may contain HTML).
372
+ *
373
+ * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
374
+ * Use a utility to convert HTML to Markdown, or `stripHtmlTags()`, to preprocess HTML.
375
+ */
376
+ content: string;
377
+ };
167
378
  /**
168
- * Expands a template string into a regex pattern using named capture groups.
169
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
379
+ * A breakpoint pattern with optional page constraints.
170
380
  *
171
- * The content group uses [\s\S]*? (non-greedy) to match across newlines but stop at next marker.
381
+ * Use this to control which pages a breakpoint pattern applies to.
382
+ * Patterns outside the specified range are skipped, allowing
383
+ * the next breakpoint pattern (or fallback) to be tried.
172
384
  *
173
- * @param template - Template string with {token} placeholders
174
- * @param options - Optional configuration
175
- * @returns Regex pattern string with named groups
385
+ * @example
386
+ * // Only apply punctuation-based breaking from page 10 onwards
387
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
176
388
  *
177
389
  * @example
178
- * expandTemplate('{num} {dash}')
179
- * // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
390
+ * // Apply to specific page range (pages 10-50)
391
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
180
392
  */
181
- declare function expandTemplate(template: string, options?: ExpandOptions): string;
393
+ type BreakpointRule = {
394
+ /**
395
+ * Regex pattern for breaking (supports token expansion).
396
+ * Empty string `''` means fall back to page boundary.
397
+ */
398
+ pattern: string;
399
+ /**
400
+ * Minimum page ID for this breakpoint to apply.
401
+ * Segments starting before this page skip this pattern.
402
+ */
403
+ min?: number;
404
+ /**
405
+ * Maximum page ID for this breakpoint to apply.
406
+ * Segments starting after this page skip this pattern.
407
+ */
408
+ max?: number;
409
+ /**
410
+ * Specific pages or page ranges to exclude from this breakpoint.
411
+ *
412
+ * Use this to skip the breakpoint for specific pages without needing
413
+ * to repeat the breakpoint with different min/max values.
414
+ *
415
+ * @example
416
+ * // Exclude specific pages
417
+ * { pattern: '\\.\\s*', exclude: [1, 2, 5] }
418
+ *
419
+ * @example
420
+ * // Exclude page ranges (front matter pages 1-10)
421
+ * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] }
422
+ *
423
+ * @example
424
+ * // Mix single pages and ranges
425
+ * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] }
426
+ */
427
+ exclude?: PageRange[];
428
+ /**
429
+ * Skip this breakpoint if the segment content matches this pattern.
430
+ *
431
+ * Supports token expansion (e.g., `{{kitab}}`). When the segment's
432
+ * remaining content matches this regex, the breakpoint pattern is
433
+ * skipped and the next breakpoint in the array is tried.
434
+ *
435
+ * Useful for excluding title pages or front matter without needing
436
+ * to specify explicit page ranges.
437
+ *
438
+ * @example
439
+ * // Skip punctuation breakpoint for short content (likely titles)
440
+ * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
441
+ *
442
+ * @example
443
+ * // Skip for content containing "kitab" (book) marker
444
+ * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
445
+ */
446
+ skipWhen?: string;
447
+ };
182
448
  /**
183
- * Create a custom token map by extending the base tokens.
449
+ * A breakpoint can be a simple string pattern or an object with constraints.
184
450
  *
185
- * @param customTokens - Custom token definitions
186
- * @returns Combined token map
451
+ * String breakpoints apply to all pages. Object breakpoints can specify
452
+ * `min`/`max`, `exclude`, or `skipWhen` to limit where they apply.
187
453
  *
188
454
  * @example
189
- * const myTokens = createTokenMap({
190
- * verse: '\\[[\\u0660-\\u0669]+\\]',
191
- * tafsir: 'تفسير'
192
- * });
455
+ * // String (applies everywhere)
456
+ * '{{tarqim}}\\s*'
457
+ *
458
+ * @example
459
+ * // Object with constraints (only from page 10+)
460
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
193
461
  */
194
- declare function createTokenMap(customTokens: Record<string, string>): TokenMap;
462
+ type Breakpoint = string | BreakpointRule;
195
463
  /**
196
- * Validates a template string.
464
+ * Segmentation options controlling how pages are split.
197
465
  *
198
- * @param template - Template to validate
199
- * @param tokens - Token map to validate against
200
- * @returns Validation result with errors if invalid
466
+ * @example
467
+ * // Basic structural rules only
468
+ * const options: SegmentationOptions = {
469
+ * rules: [
470
+ * { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
471
+ * { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
472
+ * ]
473
+ * };
201
474
  *
202
475
  * @example
203
- * validateTemplate('{num} {dash}')
204
- * // Returns: { valid: true }
476
+ * // With breakpoints for oversized segments
477
+ * const options: SegmentationOptions = {
478
+ * rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
479
+ * maxPages: 2,
480
+ * breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
481
+ * prefer: 'longer'
482
+ * };
483
+ */
484
+ type SegmentationOptions = {
485
+ /**
486
+ * Rules applied in order to find split points.
487
+ *
488
+ * All rules are evaluated against the content, and their matches
489
+ * are combined to determine final split points. The first matching
490
+ * rule's metadata is used for each segment.
491
+ */
492
+ rules?: SplitRule[];
493
+ /**
494
+ * Maximum pages per segment before breakpoints are applied.
495
+ *
496
+ * When a segment spans more pages than this limit, the `breakpoints`
497
+ * patterns are tried (in order) to find a suitable break point within
498
+ * the allowed window.
499
+ *
500
+ * Structural markers (from rules) always take precedence - segments
501
+ * are only broken within their rule-defined boundaries, never across them.
502
+ *
503
+ * @example
504
+ * // Break segments that exceed 2 pages
505
+ * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
506
+ */
507
+ maxPages?: number;
508
+ /**
509
+ * Patterns tried in order to break oversized segments.
510
+ *
511
+ * Each pattern is tried until one matches within the allowed page window.
512
+ * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
513
+ * matches the page boundary (always succeeds as ultimate fallback).
514
+ *
515
+ * Patterns can be simple strings (apply everywhere) or objects with
516
+ * `min`/`max` constraints to limit which pages they apply to.
517
+ *
518
+ * Patterns are checked in order - put preferred break styles first:
519
+ * - `{{tarqim}}\\s*` - Break at sentence-ending punctuation
520
+ * - `\\n` - Break at line breaks (useful for OCR content)
521
+ * - `''` - Break at page boundary (always works)
522
+ *
523
+ * Only applied to segments exceeding `maxPages`.
524
+ *
525
+ * @example
526
+ * // Simple patterns (backward compatible)
527
+ * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
528
+ *
529
+ * @example
530
+ * // Object patterns with page constraints
531
+ * breakpoints: [
532
+ * { pattern: '{{tarqim}}\\s*', min: 10 }, // Only from page 10+
533
+ * '' // Fallback for pages 1-9
534
+ * ]
535
+ */
536
+ breakpoints?: Breakpoint[];
537
+ /**
538
+ * When multiple matches exist for a breakpoint pattern, select:
539
+ * - `'longer'` - Last match in window (prefers longer segments)
540
+ * - `'shorter'` - First match in window (prefers shorter segments)
541
+ *
542
+ * @default 'longer'
543
+ */
544
+ prefer?: 'longer' | 'shorter';
545
+ };
546
+ /**
547
+ * Output segment produced by `segmentPages()`.
548
+ *
549
+ * Each segment contains extracted content, page references, and
550
+ * optional metadata from the matched rule and captured groups.
205
551
  *
206
- * validateTemplate('{invalid}')
207
- * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
552
+ * @example
553
+ * // Simple segment on a single page
554
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
555
+ *
556
+ * @example
557
+ * // Segment spanning pages 5-7 with captured hadith number
558
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
208
559
  */
209
- declare function validateTemplate(template: string, tokens?: TokenMap): ValidationResult;
560
+ type Segment = {
561
+ /**
562
+ * Segment content with:
563
+ * - Leading/trailing whitespace trimmed
564
+ * - Page breaks converted to spaces (for multi-page segments)
565
+ * - Markers stripped (for `lineStartsAfter` patterns)
566
+ */
567
+ content: string;
568
+ /**
569
+ * Starting page ID (from `Page.id`).
570
+ */
571
+ from: number;
572
+ /**
573
+ * Ending page ID if segment spans multiple pages.
574
+ *
575
+ * Only present when the segment content extends across page boundaries.
576
+ * When `undefined`, the segment is contained within a single page.
577
+ */
578
+ to?: number;
579
+ /**
580
+ * Combined metadata from:
581
+ * 1. Rule's `meta` property (static metadata)
582
+ * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
583
+ *
584
+ * Named captures override static metadata with the same key.
585
+ */
586
+ meta?: Record<string, unknown>;
587
+ };
210
588
  //#endregion
211
- //#region src/markers/type-generators.d.ts
589
+ //#region src/segmentation/segmenter.d.ts
212
590
  /**
213
- * Generates a regular expression for pattern-type markers.
591
+ * Segments pages of content based on pattern-matching rules.
214
592
  *
215
- * Supports two modes:
216
- * 1. Template-based: Uses the `template` field with token expansion
217
- * 2. Pattern-based: Uses the raw `pattern` field as-is
593
+ * This is the main entry point for the segmentation engine. It takes an array
594
+ * of pages and applies the provided rules to identify split points, producing
595
+ * an array of segments with content, page references, and metadata.
218
596
  *
219
- * @param config - Marker configuration with either `template` or `pattern` field
220
- * @returns A compiled RegExp object for matching the pattern
221
- * @throws {Error} When neither `template` nor `pattern` is provided
597
+ * @param pages - Array of pages with id and content
598
+ * @param options - Segmentation options including splitting rules
599
+ * @returns Array of segments with content, from/to page references, and optional metadata
222
600
  *
223
601
  * @example
224
- * // Using template
225
- * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
602
+ * // Split markdown by headers
603
+ * const segments = segmentPages(pages, {
604
+ * rules: [
605
+ * { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
606
+ * ]
607
+ * });
226
608
  *
227
609
  * @example
228
- * // Using raw pattern
229
- * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
610
+ * // Split Arabic hadith text with number extraction
611
+ * const segments = segmentPages(pages, {
612
+ * rules: [
613
+ * {
614
+ * lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
615
+ * split: 'at',
616
+ * fuzzy: true,
617
+ * meta: { type: 'hadith' }
618
+ * }
619
+ * ]
620
+ * });
230
621
  *
231
622
  * @example
232
- * // Using custom tokens
233
- * const regex = generatePatternRegex({
234
- * type: 'pattern',
235
- * template: '{verse}',
236
- * tokens: { verse: '\\[[0-9]+\\]' }
623
+ * // Multiple rules with page constraints
624
+ * const segments = segmentPages(pages, {
625
+ * rules: [
626
+ * { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
627
+ * { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
628
+ * { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
629
+ * ]
237
630
  * });
238
631
  */
239
- declare function generatePatternRegex(config: MarkerConfig): RegExp;
632
+ declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
633
+ //#endregion
634
+ //#region src/segmentation/textUtils.d.ts
240
635
  /**
241
- * Generates a regular expression for 'bab' (chapter) markers.
636
+ * Strip all HTML tags from content, keeping only text.
242
637
  *
243
- * Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.
244
- * The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.
638
+ * @param html - HTML content
639
+ * @returns Plain text content
640
+ */
641
+ declare const stripHtmlTags: (html: string) => string;
642
+ /**
643
+ * Normalizes line endings to Unix-style (`\n`).
245
644
  *
246
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
645
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
646
+ * for consistent pattern matching across platforms.
247
647
  *
248
- * @example
249
- * const regex = generateBabRegex();
250
- * const match = regex.exec('باب الصلاة');
251
- * // match.groups.marker -> 'باب'
252
- * // match.groups.content -> ' الصلاة'
648
+ * @param content - Raw content with potentially mixed line endings
649
+ * @returns Content with all line endings normalized to `\n`
253
650
  */
254
- declare function generateBabRegex(): RegExp;
651
+ declare const normalizeLineEndings: (content: string) => string;
652
+ //#endregion
653
+ //#region src/segmentation/tokens.d.ts
255
654
  /**
256
- * Generates a regular expression for hadith chain (isnad) markers.
655
+ * Token-based template system for Arabic text pattern matching.
257
656
  *
258
- * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
259
- * Uses default phrases from presets or custom phrases from config.
260
- * All phrases are made diacritic-insensitive.
657
+ * This module provides a human-readable way to define regex patterns using
658
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
659
+ * named capture groups for extracting matched values into metadata.
261
660
  *
262
- * @param config - Marker configuration with optional `phrases` array
263
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
661
+ * @module tokens
264
662
  *
265
663
  * @example
266
- * // Using default phrases
267
- * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
268
- * const match = regex.exec('حَدَّثَنَا أبو بكر');
664
+ * // Simple token expansion
665
+ * expandTokens('{{raqms}} {{dash}}')
666
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
269
667
  *
270
668
  * @example
271
- * // Using custom phrases
272
- * const regex = generateHadithChainRegex({
273
- * type: 'hadith-chain',
274
- * phrases: ['قَالَ', 'رَوَى']
275
- * });
669
+ * // Named capture groups
670
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
671
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
276
672
  */
277
- declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
278
673
  /**
279
- * Generates a regular expression for basmala markers.
674
+ * Token definitions mapping human-readable token names to regex patterns.
280
675
  *
281
- * Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):
282
- * - بسم الله (without diacritics)
283
- * - بِسْمِ اللَّهِ (with diacritics)
284
- * - Special patterns like [بسم, [تم
676
+ * Tokens are used in template strings with double-brace syntax:
677
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
678
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
679
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
285
680
  *
286
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
681
+ * @remarks
682
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
683
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
684
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
287
685
  *
288
686
  * @example
289
- * const regex = generateBasmalaRegex();
290
- * const match = regex.exec('بسم الله الرحمن الرحيم');
291
- * // match.groups.marker -> 'بسم الله'
292
- */
293
- declare function generateBasmalaRegex(): RegExp;
294
- /**
295
- * Generates a regular expression for custom phrase markers.
296
- *
297
- * Similar to hadith-chain markers but requires explicit phrase list.
298
- * All phrases are made diacritic-insensitive.
687
+ * // Using tokens in a split rule
688
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
299
689
  *
300
- * @param config - Marker configuration with required `phrases` array
301
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
302
- * @throws {Error} When `phrases` is undefined or empty
690
+ * @example
691
+ * // Using tokens with named captures
692
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
303
693
  *
304
694
  * @example
305
- * const regex = generatePhraseRegex({
306
- * type: 'phrase',
307
- * phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
308
- * });
695
+ * // Using the numbered convenience token
696
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
309
697
  */
310
- declare function generatePhraseRegex(config: MarkerConfig): RegExp;
698
+ declare const TOKEN_PATTERNS: Record<string, string>;
311
699
  /**
312
- * Generates a regular expression for square bracket markers.
700
+ * Checks if a query string contains template tokens.
313
701
  *
314
- * Matches verse or hadith reference numbers in square brackets:
315
- * - [٦٥] - Simple bracket
316
- * - • [٦٥] - With bullet prefix
317
- * - ° [٦٥] - With degree prefix
702
+ * Performs a quick test for `{{token}}` patterns without actually
703
+ * expanding them. Useful for determining whether to apply token
704
+ * expansion to a string.
318
705
  *
319
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
706
+ * @param query - String to check for tokens
707
+ * @returns `true` if the string contains at least one `{{token}}` pattern
320
708
  *
321
709
  * @example
322
- * const regex = generateSquareBracketRegex();
323
- * const match = regex.exec('[٦٥] نص الحديث');
324
- * // match.groups.content -> ' نص الحديث'
710
+ * containsTokens('{{raqms}} {{dash}}') // → true
711
+ * containsTokens('plain text') // → false
712
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
713
+ */
714
+ declare const containsTokens: (query: string) => boolean;
715
+ /**
716
+ * Result from expanding tokens with capture information.
717
+ *
718
+ * Contains the expanded pattern string along with metadata about
719
+ * any named capture groups that were created.
325
720
  */
326
- declare function generateSquareBracketRegex(): RegExp;
721
+ type ExpandResult = {
722
+ /**
723
+ * The expanded regex pattern string with all tokens replaced.
724
+ *
725
+ * Named captures use the `(?<name>pattern)` syntax.
726
+ */
727
+ pattern: string;
728
+ /**
729
+ * Names of captured groups extracted from `{{token:name}}` syntax.
730
+ *
731
+ * Empty array if no named captures were found.
732
+ */
733
+ captureNames: string[];
734
+ /**
735
+ * Whether the pattern has any named capturing groups.
736
+ *
737
+ * Equivalent to `captureNames.length > 0`.
738
+ */
739
+ hasCaptures: boolean;
740
+ };
327
741
  /**
328
- * Generates a regular expression for number-letter-separator markers.
742
+ * Expands template tokens with support for named captures.
743
+ *
744
+ * This is the primary token expansion function that handles all token syntax:
745
+ * - `{{token}}` → Expands to the token's pattern (no capture group)
746
+ * - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
747
+ * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
329
748
  *
330
- * Matches patterns like:
331
- * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
332
- * - 5 ب. (Latin number, Arabic letter, dot)
749
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
333
750
  *
334
- * @param config - Configuration with required `numbering` and `separator` fields
335
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
751
+ * @param query - The template string containing tokens
752
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
753
+ * Applied to both token patterns and plain Arabic text between tokens.
754
+ * Typically `makeDiacriticInsensitive` from the fuzzy module.
755
+ * @returns Object with expanded pattern, capture names, and capture flag
336
756
  *
337
757
  * @example
338
- * const regex = generateNumLetterRegex({
339
- * numbering: 'arabic-indic',
340
- * separator: 'dash'
341
- * });
342
- * const match = regex.exec('٥ أ - نص');
343
- */
344
- declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
345
- /**
346
- * Generates a regular expression for number-parenthetical-separator markers.
758
+ * // Simple token expansion
759
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
760
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
347
761
  *
348
- * Matches patterns like:
349
- * - ٥ (أ) - (number, parenthetical content, separator)
350
- * - 5 (٦) - (number with parenthetical number)
762
+ * @example
763
+ * // Named capture
764
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
765
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
351
766
  *
352
- * @param config - Configuration with required `numbering` and `separator` fields
353
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
767
+ * @example
768
+ * // Capture-only token
769
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
770
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
354
771
  *
355
772
  * @example
356
- * const regex = generateNumParenRegex({
357
- * numbering: 'arabic-indic',
358
- * separator: 'dash'
359
- * });
360
- * const match = regex.exec('٥ (أ) - نص');
773
+ * // With fuzzy transform
774
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
775
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
361
776
  */
362
- declare function generateNumParenRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
777
+ declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
363
778
  /**
364
- * Generates a regular expression for number-slash-number markers.
779
+ * Expands template tokens in a query string to their regex equivalents.
365
780
  *
366
- * Matches patterns like:
367
- * - ٥/٦ - (number slash number, separator)
368
- * - ٥ - (single number, separator)
781
+ * This is the simple version without capture support. It returns only the
782
+ * expanded pattern string, not capture metadata.
369
783
  *
370
- * The second number after the slash is optional.
784
+ * Unknown tokens are left as-is, allowing for partial templates.
371
785
  *
372
- * @param config - Configuration with required `numbering` and `separator` fields
373
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
786
+ * @param query - Template string containing `{{token}}` placeholders
787
+ * @returns Expanded regex pattern string
374
788
  *
375
789
  * @example
376
- * const regex = generateNumSlashRegex({
377
- * numbering: 'arabic-indic',
378
- * separator: 'dash'
379
- * });
380
- * const match1 = regex.exec('٥/٦ - نص');
381
- * const match2 = regex.exec('٥ - نص'); // Also matches
790
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
791
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
792
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
793
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
794
+ *
795
+ * @see expandTokensWithCaptures for full capture group support
382
796
  */
383
- declare function generateNumSlashRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
797
+ declare const expandTokens: (query: string) => string;
384
798
  /**
385
- * Generates a regular expression for numbered markers with optional format template.
799
+ * Converts a template string to a compiled RegExp.
386
800
  *
387
- * Supports two modes:
388
- * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
389
- * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
801
+ * Expands all tokens and attempts to compile the result as a RegExp
802
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
390
803
  *
391
- * When using default pattern:
392
- * - Separator 'none' generates pattern without separator
393
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
804
+ * @remarks
805
+ * This function dynamically compiles regular expressions from template strings.
806
+ * If templates may come from untrusted sources, be aware of potential ReDoS
807
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
808
+ * Consider validating pattern complexity or applying execution timeouts when
809
+ * running user-submitted patterns.
394
810
  *
395
- * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
396
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
811
+ * @param template - Template string containing `{{token}}` placeholders
812
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
397
813
  *
398
814
  * @example
399
- * // Using format template
400
- * const regex = generateNumberedRegex({
401
- * numbering: 'arabic-indic',
402
- * separator: 'dash',
403
- * format: '{bullet}+ {num} {dash}'
404
- * });
405
- *
406
- * @example
407
- * // Using default pattern
408
- * const regex = generateNumberedRegex({
409
- * numbering: 'arabic-indic',
410
- * separator: 'dash'
411
- * });
412
- * const match = regex.exec('٥ - نص');
413
- *
414
- * @example
415
- * // With 'none' separator
416
- * const regex = generateNumberedRegex({
417
- * numbering: 'latin',
418
- * separator: 'none'
419
- * });
420
- * const match = regex.exec('5 text');
815
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
816
+ * templateToRegex('{{raqms}}+') // → null ('[٠-٩]++' stacks two quantifiers, which JavaScript rejects as invalid)
817
+ * templateToRegex('(((') // → null (invalid regex)
421
818
  */
422
- declare function generateNumberedRegex(config: Pick<MarkerConfig, 'numbering' | 'separator' | 'format' | 'tokens'>): RegExp;
819
+ declare const templateToRegex: (template: string) => RegExp | null;
423
820
  /**
424
- * Generates a regular expression for bullet-point markers.
821
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
425
822
  *
426
- * Matches common bullet characters:
427
- * - • (bullet)
428
- * - * (asterisk)
429
- * - ° (degree)
430
- * - - (dash)
823
+ * Useful for documentation, validation, or building user interfaces
824
+ * that show available tokens.
431
825
  *
432
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
826
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
433
827
  *
434
828
  * @example
435
- * const regex = generateBulletRegex();
436
- * const match = regex.exec('• نقطة');
437
- * // match.groups.content -> 'نقطة'
829
+ * getAvailableTokens()
830
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms', ...]
438
831
  */
439
- declare function generateBulletRegex(): RegExp;
832
+ declare const getAvailableTokens: () => string[];
440
833
  /**
441
- * Generates a regular expression for Markdown-style heading markers.
834
+ * Gets the regex pattern for a specific token name.
442
835
  *
443
- * Matches heading levels using hash symbols:
444
- * - # Heading 1
445
- * - ## Heading 2
446
- * - ### Heading 3
447
- * - etc.
836
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
837
+ * without any expansion or capture group wrapping.
448
838
  *
449
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
839
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
840
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
450
841
  *
451
842
  * @example
452
- * const regex = generateHeadingRegex();
453
- * const match = regex.exec('## عنوان فرعي');
454
- * // match.groups.marker -> '## '
455
- * // match.groups.content -> 'عنوان فرعي'
843
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
844
+ * getTokenPattern('dash') // → '[-–—ـ]'
845
+ * getTokenPattern('unknown') // → undefined
456
846
  */
457
- declare function generateHeadingRegex(): RegExp;
847
+ declare const getTokenPattern: (tokenName: string) => string | undefined;
458
848
  //#endregion
459
- export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, type MarkerConfig, type MarkerType, NUMBERING_PATTERNS, type NumberingStyle, SEPARATOR_PATTERNS, type SeparatorStyle, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
849
+ export { type Breakpoint, type BreakpointRule, type ExpandResult, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
460
850
  //# sourceMappingURL=index.d.mts.map