flappa-doormal 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,460 +1,1004 @@
1
- //#region src/types.d.ts
1
+ //#region src/segmentation/fuzzy.d.ts
2
2
  /**
3
- * Numbering styles for markers
3
+ * Fuzzy matching utilities for Arabic text.
4
+ *
5
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
6
+ * This allows matching text regardless of:
7
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
8
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
9
+ *
10
+ * @module fuzzy
11
+ *
12
+ * @example
13
+ * // Make a pattern diacritic-insensitive
14
+ * const pattern = makeDiacriticInsensitive('حدثنا');
15
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
16
+ */
17
+ /**
18
+ * Escapes a string for safe inclusion in a regular expression.
19
+ *
20
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
21
+ *
22
+ * @param s - Any string to escape
23
+ * @returns String with regex metacharacters escaped
24
+ *
25
+ * @example
26
+ * escapeRegex('hello.world') // → 'hello\\.world'
27
+ * escapeRegex('[test]') // → '\\[test\\]'
28
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
29
+ */
30
+ declare const escapeRegex: (s: string) => string;
31
+ /**
32
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
33
+ *
34
+ * Transforms input text into a regex pattern that matches the text regardless
35
+ * of diacritical marks (harakat) and character variations. Each character in
36
+ * the input is:
37
+ * 1. Expanded to its equivalence class (if applicable)
38
+ * 2. Followed by an optional diacritics matcher
39
+ *
40
+ * This allows matching:
41
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
42
+ * - `الإيمان` with `الايمان` (alef variants)
43
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
44
+ *
45
+ * @param text - Input Arabic text to make diacritic-insensitive
46
+ * @returns Regex pattern string that matches the text with or without diacritics
47
+ *
48
+ * @example
49
+ * const pattern = makeDiacriticInsensitive('حدثنا');
50
+ * // Each char gets equivalence class + optional diacritics
51
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
52
+ *
53
+ * @example
54
+ * const pattern = makeDiacriticInsensitive('باب');
55
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
56
+ * new RegExp(pattern, 'u').test('باب') // → true
57
+ *
58
+ * @example
59
+ * // Using with split rules
60
+ * {
61
+ * lineStartsWith: ['باب'],
62
+ * split: 'at',
63
+ * fuzzy: true // Applies makeDiacriticInsensitive internally
64
+ * }
65
+ */
66
+ declare const makeDiacriticInsensitive: (text: string) => string;
67
+ //#endregion
68
+ //#region src/segmentation/types.d.ts
69
+ /**
70
+ * Literal regex pattern rule - no token expansion is applied.
71
+ *
72
+ * Use this when you need full control over the regex pattern.
73
+ * If the regex contains capturing groups, the captured content
74
+ * will be used as the segment content.
75
+ *
76
+ * @example
77
+ * // Match Arabic-Indic numbers followed by a dash
78
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
79
+ *
80
+ * @example
81
+ * // Capture group - content after the marker becomes segment content
82
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
83
+ */
84
+ type RegexPattern = {
85
+ /** Raw regex pattern string (no token expansion) */
86
+ regex: string;
87
+ };
88
+ /**
89
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
90
+ *
91
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
92
+ *
93
+ * @example
94
+ * // Using tokens for Arabic-Indic digits
95
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
96
+ *
97
+ * @example
98
+ * // Named capture to extract hadith number into metadata
99
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
100
+ *
101
+ * @see TOKEN_PATTERNS for available tokens
102
+ */
103
+ type TemplatePattern = {
104
+ /** Template string with `{{token}}` or `{{token:name}}` placeholders */
105
+ template: string;
106
+ };
107
+ /**
108
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
109
+ *
110
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker
111
+ * is **included** in the segment content.
112
+ *
113
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
114
+ * diacritic-insensitive Arabic matching.
115
+ *
116
+ * @example
117
+ * // Split at chapter headings (marker included in content)
118
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
119
+ *
120
+ * @example
121
+ * // Split at Arabic book/chapter markers with fuzzy matching
122
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
123
+ */
124
+ type LineStartsWithPattern = {
125
+ /** Array of patterns that mark line beginnings (marker included in content) */
126
+ lineStartsWith: string[];
127
+ };
128
+ /**
129
+ * Line-start-after pattern rule - matches lines starting with patterns,
130
+ * but **excludes** the marker from the segment content.
131
+ *
132
+ * Behaves like `lineStartsWith` but strips the marker from the output.
133
+ * The segment content starts after the marker and extends to the next split point
134
+ * (not just the end of the matching line).
135
+ *
136
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
137
+ * diacritic-insensitive Arabic matching.
138
+ *
139
+ * @example
140
+ * // Split at numbered hadiths, capturing content without the number prefix
141
+ * // Content extends to next split, not just end of that line
142
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
143
+ *
144
+ * @example
145
+ * // Extract hadith number to metadata while stripping the prefix
146
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
4
147
  */
5
- type NumberingStyle = 'arabic-indic' | 'latin';
148
+ type LineStartsAfterPattern = {
149
+ /** Array of patterns that mark line beginnings (marker excluded from content) */
150
+ lineStartsAfter: string[];
151
+ };
6
152
  /**
7
- * Separator styles for markers
153
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
154
+ *
155
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`.
156
+ *
157
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
158
+ * diacritic-insensitive Arabic matching.
159
+ *
160
+ * @example
161
+ * // Split at lines ending with Arabic sentence-ending punctuation
162
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
8
163
  */
9
- type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';
164
+ type LineEndsWithPattern = {
165
+ /** Array of patterns that mark line endings */
166
+ lineEndsWith: string[];
167
+ };
10
168
  /**
11
- * Marker types for text segmentation
169
+ * Union of all pattern types for split rules.
170
+ *
171
+ * Each rule must have exactly ONE pattern type:
172
+ * - `regex` - Raw regex pattern (no token expansion)
173
+ * - `template` - Pattern with `{{token}}` expansion
174
+ * - `lineStartsWith` - Match line beginnings (marker included)
175
+ * - `lineStartsAfter` - Match line beginnings (marker excluded)
176
+ * - `lineEndsWith` - Match line endings
12
177
  */
13
- type MarkerType = 'numbered' | 'bullet' | 'heading' | 'pattern' | 'bab' | 'hadith-chain' | 'basmala' | 'phrase' | 'square-bracket' | 'num-letter' | 'num-paren' | 'num-slash';
178
+ type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
14
179
  /**
15
- * Configuration for a single marker pattern
180
+ * Configuration for how and where to split content when a pattern matches.
181
+ *
182
+ * Controls the split position relative to matches, which occurrences to
183
+ * split on, page span limits, and fuzzy matching for Arabic text.
16
184
  */
17
- type MarkerConfig = {
18
- /** The type of marker to look for */
19
- type: MarkerType;
20
- /** For numbered markers, the digit style */
21
- numbering?: NumberingStyle;
22
- /** The separator that follows the marker */
23
- separator?: SeparatorStyle | string;
185
+ type SplitBehavior = {
24
186
  /**
25
- * Template format for numbered markers using token syntax.
26
- * Example: '{bullet}+ {num} {dash}'
27
- * Only valid when type is 'numbered'.
187
+ * Where to split relative to the match.
188
+ * - `'at'`: New segment starts at the match position
189
+ * - `'after'`: New segment starts after the match ends
28
190
  */
29
- format?: string;
191
+ split: 'at' | 'after';
30
192
  /**
31
- * For 'pattern' type, provide a template using tokens like {num}, {dash}, {bullet}.
32
- * For raw regex patterns that don't use templates, provide the raw pattern string here.
33
- * Example: '{bullet}? {num}+ {s}{dash}' or '^[•*°]? ([\\u0660-\\u0669]+\\s?[-–—ـ].*)'
193
+ * Which occurrence(s) to split on.
194
+ * - `'all'`: Split at every match (default)
195
+ * - `'first'`: Only split at the first match
196
+ * - `'last'`: Only split at the last match
197
+ *
198
+ * When `maxSpan` is set, occurrence filtering is applied per sliding
199
+ * window rather than globally. With `'last'`, the algorithm prefers
200
+ * longer segments by looking as far ahead as allowed before selecting
201
+ * the last match in the window.
202
+ *
203
+ * @default 'all'
34
204
  */
35
- template?: string;
205
+ occurrence?: 'first' | 'last' | 'all';
36
206
  /**
37
- * Alternative to template: raw regex pattern string (for 'pattern' type only).
38
- * Use this for complex patterns that can't be expressed with templates.
39
- * The pattern should have a capture group for the content.
40
- * Example: '^CUSTOM: (.*)'
207
+ * Maximum page ID difference allowed when looking ahead for split points.
208
+ *
209
+ * Uses a sliding window algorithm that prefers longer segments:
210
+ * 1. Start from the first page of the current segment
211
+ * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
212
+ * 3. Apply occurrence filter (e.g., 'last') to select a match
213
+ * 4. Next window starts from the page after the match
214
+ *
215
+ * Examples:
216
+ * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
217
+ * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
218
+ * - `undefined` = no limit (entire content treated as one group)
219
+ *
220
+ * Note: With non-consecutive page IDs, the algorithm uses actual ID
221
+ * difference, not array index. Pages 1 and 5 have a difference of 4.
222
+ *
223
+ * @example
224
+ * // Split at last period, looking up to 1 page ahead
225
+ * // Pages 1,2: split at page 2's last period
226
+ * // Page 3: split at page 3's last period
227
+ * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
41
228
  */
42
- pattern?: string;
229
+ maxSpan?: number;
43
230
  /**
44
- * Custom token map for advanced users.
45
- * Extends the default TOKENS with additional definitions.
231
+ * Enable diacritic-insensitive matching for Arabic text.
232
+ *
233
+ * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
234
+ * `lineStartsAfter` are transformed to match text regardless of:
235
+ * - Diacritics (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، etc.
236
+ * - Character equivalences: ا/آ/أ/إ, ة/ه, ى/ي
237
+ *
238
+ * **Note**: Does NOT apply to `regex` or `template` patterns.
239
+ * For templates, apply fuzzy manually using `makeDiacriticInsensitive()`.
240
+ *
241
+ * @default false
46
242
  */
47
- tokens?: Record<string, string>;
243
+ fuzzy?: boolean;
244
+ };
245
+ /**
246
+ * A single page ID or a range of page IDs.
247
+ *
248
+ * - `number`: A single page ID
249
+ * - `[number, number]`: A range from first to second (inclusive)
250
+ *
251
+ * @example
252
+ * 5 // Single page 5
253
+ * [10, 20] // Pages 10 through 20 (inclusive)
254
+ */
255
+ type PageRange = number | [number, number];
256
+ /**
257
+ * Optional constraints and metadata for a split rule.
258
+ *
259
+ * Use constraints to limit which pages a rule applies to, and
260
+ * metadata to attach arbitrary data to resulting segments.
261
+ */
262
+ type RuleConstraints = {
263
+ /**
264
+ * Minimum page ID for this rule to apply.
265
+ *
266
+ * Matches on pages with `id < min` are ignored.
267
+ *
268
+ * @example
269
+ * // Only apply rule starting from page 10
270
+ * { min: 10, lineStartsWith: ['##'], split: 'at' }
271
+ */
272
+ min?: number;
48
273
  /**
49
- * List of phrases for 'phrase' and 'hadith-chain' types.
50
- * For 'hadith-chain', defaults to common narrator patterns if not provided.
274
+ * Maximum page ID for this rule to apply.
275
+ *
276
+ * Matches on pages with `id > max` are ignored.
277
+ *
278
+ * @example
279
+ * // Only apply rule up to page 100
280
+ * { max: 100, lineStartsWith: ['##'], split: 'before' }
51
281
  */
52
- phrases?: string[];
282
+ max?: number;
53
283
  /**
54
- * Optional: Only apply this marker after a specific page number.
55
- * Useful for books with different formatting in front matter vs main content.
284
+ * Specific pages or page ranges to exclude from this rule.
285
+ *
286
+ * Use this to skip the rule for specific pages without needing
287
+ * to repeat the rule with different min/max values.
288
+ *
289
+ * @example
290
+ * // Exclude specific pages
291
+ * { exclude: [1, 2, 5] }
292
+ *
293
+ * @example
294
+ * // Exclude page ranges
295
+ * { exclude: [[1, 10], [50, 100]] }
296
+ *
297
+ * @example
298
+ * // Mix single pages and ranges
299
+ * { exclude: [1, [5, 10], 50] }
56
300
  */
57
- minPage?: number;
301
+ exclude?: PageRange[];
58
302
  /**
59
- * Optional: Arbitrary metadata to attach to entries matched by this marker.
60
- * This allows for agnostic handling of entry properties.
61
- * Example: { type: 0, category: 'hadith' }
303
+ * Arbitrary metadata attached to segments matching this rule.
304
+ *
305
+ * This metadata is merged with any named captures from the pattern.
306
+ * Named captures (e.g., `{{raqms:num}}`) take precedence over
307
+ * static metadata with the same key.
308
+ *
309
+ * @example
310
+ * // Tag segments as chapters
311
+ * { lineStartsWith: ['{{bab}}'], split: 'before', meta: { type: 'chapter' } }
62
312
  */
63
- metadata?: Record<string, any>;
313
+ meta?: Record<string, unknown>;
314
+ /**
315
+ * Fallback behavior when no matches are found within a maxSpan boundary.
316
+ * - 'page': Create split points at page boundaries
317
+ * - `undefined`: No fallback (default)
318
+ */
319
+ fallback?: 'page';
64
320
  };
65
- //#endregion
66
- //#region src/markers/defaults.d.ts
67
321
  /**
68
- * Default numbering style for markers
69
- */
70
- declare const DEFAULT_NUMBERING: NumberingStyle;
71
- /**
72
- * Default separator style for markers
73
- */
74
- declare const DEFAULT_SEPARATOR: SeparatorStyle;
75
- /**
76
- * Default separator pattern (used when separator is a custom string)
77
- */
78
- declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";
79
- /**
80
- * Numbering patterns mapped by style
81
- */
82
- declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;
83
- /**
84
- * Separator patterns mapped by style
85
- */
86
- declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
87
- //#endregion
88
- //#region src/markers/generator.d.ts
89
- /**
90
- * Generates a regex pattern from a marker configuration.
91
- * Always returns a regex with three named capture groups:
92
- * - full: Complete match including marker
93
- * - marker: Just the marker part (for metadata/indexing)
94
- * - content: Clean content without marker (for LLM processing)
322
+ * A complete split rule combining pattern, behavior, and constraints.
95
323
  *
96
- * This function applies all default values before delegating to type-specific generators.
324
+ * Each rule must specify:
325
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
326
+ * `lineStartsAfter`, or `lineEndsWith`
327
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
328
+ * - **Constraints** (optional): `min`, `max`, `meta`
97
329
  *
98
- * @param config - Marker configuration
99
- * @returns Regular expression with named groups
330
+ * @example
331
+ * // Basic rule: split at markdown headers
332
+ * const rule: SplitRule = {
333
+ * lineStartsWith: ['## ', '### '],
334
+ * split: 'at',
335
+ * meta: { type: 'section' }
336
+ * };
100
337
  *
101
338
  * @example
102
- * const regex = generateRegexFromMarker({ type: 'numbered' });
103
- * const match = regex.exec('٥ - نص');
104
- * match.groups.full // - نص"
105
- * match.groups.marker // "٥ -"
106
- * match.groups.content // "نص"
107
- */
108
- declare function generateRegexFromMarker(config: MarkerConfig): RegExp;
109
- //#endregion
110
- //#region src/markers/presets.d.ts
111
- /**
112
- * Default phrase lists for preset marker types.
113
- * Export these so users can extend them.
114
- */
115
- /**
116
- * Common hadith narrator phrases (diacritic-insensitive)
117
- * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
339
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
340
+ * const rule: SplitRule = {
341
+ * lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
342
+ * split: 'at',
343
+ * fuzzy: true,
344
+ * min: 5,
345
+ * max: 500,
346
+ * meta: { type: 'hadith' }
347
+ * };
118
348
  */
119
- declare const DEFAULT_HADITH_PHRASES: readonly ["حَدَّثَنَا", "حدثنا", "أَخْبَرَنَا", "حدثني", "حدَّثني", "وحدثنا", "حُدِّثت عن", "وحَدَّثَنَا"];
349
+ type SplitRule = PatternType & SplitBehavior & RuleConstraints;
120
350
  /**
121
- * Common basmala patterns
122
- * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
351
+ * Input page structure for segmentation.
352
+ *
353
+ * Each page represents a logical unit of content (e.g., a book page,
354
+ * a document section) that can be tracked across segment boundaries.
355
+ *
356
+ * @example
357
+ * const pages: Page[] = [
358
+ * { id: 1, content: '## Chapter 1\nFirst paragraph...' },
359
+ * { id: 2, content: 'Continued text...\n## Chapter 2' },
360
+ * ];
123
361
  */
124
- declare const DEFAULT_BASMALA_PATTERNS: readonly ["بسم الله", "\\[بسم", "\\[تم"];
125
- //#endregion
126
- //#region src/markers/tokens.d.ts
127
- /**
128
- * Token definitions for pattern templates.
129
- * Tokens provide a readable alternative to raw regex patterns.
130
- */
131
- /**
132
- * Standard tokens for building marker patterns.
133
- * Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
134
- */
135
- declare const TOKENS: {
136
- readonly bullet: "[•*°]";
137
- readonly colon: ":";
138
- readonly comma: "،";
139
- readonly content: "(.*)";
140
- readonly dash: "[-–—ـ]";
141
- readonly dot: "\\.";
142
- readonly latin: "\\d+";
143
- readonly letter: "[أ-ي]";
144
- readonly num: "[\\u0660-\\u0669]+";
145
- readonly paren: "\\)";
146
- readonly s: "\\s?";
147
- readonly slash: "/";
148
- readonly space: "\\s+";
362
+ type Page = {
363
+ /**
364
+ * Unique page/entry ID used for:
365
+ * - `maxSpan` grouping (segments spanning multiple pages)
366
+ * - `min`/`max` constraint filtering
367
+ * - `from`/`to` tracking in output segments
368
+ */
369
+ id: number;
370
+ /**
371
+ * Raw page content (may contain HTML).
372
+ *
373
+ * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
374
+ * To preprocess HTML, convert it to Markdown with a utility or strip it with `stripHtmlTags()`.
375
+ */
376
+ content: string;
149
377
  };
150
- type TokenMap = Record<string, string>;
151
- //#endregion
152
- //#region src/markers/template-parser.d.ts
153
378
  /**
154
- * Result of template validation
379
+ * A breakpoint pattern with optional page constraints.
380
+ *
381
+ * Use this to control which pages a breakpoint pattern applies to.
382
+ * Patterns outside the specified range are skipped, allowing
383
+ * the next breakpoint pattern (or fallback) to be tried.
384
+ *
385
+ * @example
386
+ * // Only apply punctuation-based breaking from page 10 onwards
387
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
388
+ *
389
+ * @example
390
+ * // Apply to specific page range (pages 10-50)
391
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
155
392
  */
156
- interface ValidationResult {
157
- valid: boolean;
158
- errors?: string[];
159
- }
393
+ type BreakpointRule = {
394
+ /**
395
+ * Regex pattern for breaking (supports token expansion).
396
+ * Empty string `''` means fall back to page boundary.
397
+ */
398
+ pattern: string;
399
+ /**
400
+ * Minimum page ID for this breakpoint to apply.
401
+ * Segments starting before this page skip this pattern.
402
+ */
403
+ min?: number;
404
+ /**
405
+ * Maximum page ID for this breakpoint to apply.
406
+ * Segments starting after this page skip this pattern.
407
+ */
408
+ max?: number;
409
+ /**
410
+ * Specific pages or page ranges to exclude from this breakpoint.
411
+ *
412
+ * Use this to skip the breakpoint for specific pages without needing
413
+ * to repeat the breakpoint with different min/max values.
414
+ *
415
+ * @example
416
+ * // Exclude specific pages
417
+ * { pattern: '\\.\\s*', exclude: [1, 2, 5] }
418
+ *
419
+ * @example
420
+ * // Exclude page ranges (front matter pages 1-10)
421
+ * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] }
422
+ *
423
+ * @example
424
+ * // Mix single pages and ranges
425
+ * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] }
426
+ */
427
+ exclude?: PageRange[];
428
+ /**
429
+ * Skip this breakpoint if the segment content matches this pattern.
430
+ *
431
+ * Supports token expansion (e.g., `{{kitab}}`). When the segment's
432
+ * remaining content matches this regex, the breakpoint pattern is
433
+ * skipped and the next breakpoint in the array is tried.
434
+ *
435
+ * Useful for excluding title pages or front matter without needing
436
+ * to specify explicit page ranges.
437
+ *
438
+ * @example
439
+ * // Skip punctuation breakpoint for short content (likely titles)
440
+ * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
441
+ *
442
+ * @example
443
+ * // Skip for content containing "kitab" (book) marker
444
+ * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
445
+ */
446
+ skipWhen?: string;
447
+ };
160
448
  /**
161
- * Options for template expansion
449
+ * A breakpoint can be a simple string pattern or an object with constraints.
450
+ *
451
+ * String breakpoints apply to all pages. Object breakpoints can specify
452
+ * `min`/`max` to limit which pages they apply to.
453
+ *
454
+ * @example
455
+ * // String (applies everywhere)
456
+ * '{{tarqim}}\\s*'
457
+ *
458
+ * @example
459
+ * // Object with constraints (only from page 10+)
460
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
162
461
  */
163
- interface ExpandOptions {
164
- /** Custom token map to use instead of default TOKENS */
165
- tokens?: TokenMap;
166
- }
462
+ type Breakpoint = string | BreakpointRule;
167
463
  /**
168
- * Expands a template string into a regex pattern using named capture groups.
169
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
464
+ * Logger interface for custom logging implementations.
170
465
  *
171
- * The content group uses [\s\S]*? (non-greedy) to match across newlines but stop at next marker.
466
+ * All methods are optional - only implement the verbosity levels you need.
467
+ * When no logger is provided, no logging overhead is incurred.
172
468
  *
173
- * @param template - Template string with {token} placeholders
174
- * @param options - Optional configuration
175
- * @returns Regex pattern string with named groups
469
+ * Compatible with the Logger interface from ffmpeg-simplified and similar libraries.
176
470
  *
177
471
  * @example
178
- * expandTemplate('{num} {dash}')
179
- * // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
180
- */
181
- declare function expandTemplate(template: string, options?: ExpandOptions): string;
472
+ * // Simple console logger
473
+ * const logger: Logger = {
474
+ * debug: console.debug,
475
+ * info: console.info,
476
+ * warn: console.warn,
477
+ * error: console.error,
478
+ * };
479
+ *
480
+ * @example
481
+ * // Production logger (only warnings and errors)
482
+ * const prodLogger: Logger = {
483
+ * warn: (msg, ...args) => myLoggingService.warn(msg, args),
484
+ * error: (msg, ...args) => myLoggingService.error(msg, args),
485
+ * };
486
+ */
487
+ interface Logger {
488
+ /** Log a debug message (verbose debugging output) */
489
+ debug?: (message: string, ...args: unknown[]) => void;
490
+ /** Log an error message (critical failures) */
491
+ error?: (message: string, ...args: unknown[]) => void;
492
+ /** Log an informational message (key progress points) */
493
+ info?: (message: string, ...args: unknown[]) => void;
494
+ /** Log a trace message (extremely verbose, per-iteration details) */
495
+ trace?: (message: string, ...args: unknown[]) => void;
496
+ /** Log a warning message (potential issues) */
497
+ warn?: (message: string, ...args: unknown[]) => void;
498
+ }
182
499
  /**
183
- * Create a custom token map by extending the base tokens.
500
+ * Segmentation options controlling how pages are split.
184
501
  *
185
- * @param customTokens - Custom token definitions
186
- * @returns Combined token map
502
+ * @example
503
+ * // Basic structural rules only
504
+ * const options: SegmentationOptions = {
505
+ * rules: [
506
+ * { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
507
+ * { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
508
+ * ]
509
+ * };
187
510
  *
188
511
  * @example
189
- * const myTokens = createTokenMap({
190
- * verse: '\\[[\\u0660-\\u0669]+\\]',
191
- * tafsir: 'تفسير'
192
- * });
193
- */
194
- declare function createTokenMap(customTokens: Record<string, string>): TokenMap;
512
+ * // With breakpoints for oversized segments
513
+ * const options: SegmentationOptions = {
514
+ * rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
515
+ * maxPages: 2,
516
+ * breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
517
+ * prefer: 'longer'
518
+ * };
519
+ *
520
+ * @example
521
+ * // With custom logger for debugging
522
+ * const options: SegmentationOptions = {
523
+ * rules: [...],
524
+ * logger: {
525
+ * debug: console.debug,
526
+ * info: console.info,
527
+ * warn: console.warn,
528
+ * }
529
+ * };
530
+ */
531
+ type SegmentationOptions = {
532
+ /**
533
+ * Rules applied in order to find split points.
534
+ *
535
+ * All rules are evaluated against the content, and their matches
536
+ * are combined to determine final split points. The first matching
537
+ * rule's metadata is used for each segment.
538
+ */
539
+ rules?: SplitRule[];
540
+ /**
541
+ * Maximum pages per segment before breakpoints are applied.
542
+ *
543
+ * When a segment spans more pages than this limit, the `breakpoints`
544
+ * patterns are tried (in order) to find a suitable break point within
545
+ * the allowed window.
546
+ *
547
+ * Structural markers (from rules) always take precedence - segments
548
+ * are only broken within their rule-defined boundaries, never across them.
549
+ *
550
+ * @example
551
+ * // Break segments that exceed 2 pages
552
+ * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
553
+ */
554
+ maxPages?: number;
555
+ /**
556
+ * Patterns tried in order to break oversized segments.
557
+ *
558
+ * Each pattern is tried until one matches within the allowed page window.
559
+ * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
560
+ * matches the page boundary (always succeeds as ultimate fallback).
561
+ *
562
+ * Patterns can be simple strings (apply everywhere) or objects with
563
+ * `min`/`max` constraints to limit which pages they apply to.
564
+ *
565
+ * Patterns are checked in order - put preferred break styles first:
566
+ * - `{{tarqim}}\\s*` - Break at sentence-ending punctuation
567
+ * - `\\n` - Break at line breaks (useful for OCR content)
568
+ * - `''` - Break at page boundary (always works)
569
+ *
570
+ * Only applied to segments exceeding `maxPages`.
571
+ *
572
+ * @example
573
+ * // Simple patterns (backward compatible)
574
+ * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
575
+ *
576
+ * @example
577
+ * // Object patterns with page constraints
578
+ * breakpoints: [
579
+ * { pattern: '{{tarqim}}\\s*', min: 10 }, // Only from page 10+
580
+ * '' // Fallback for pages 1-9
581
+ * ]
582
+ */
583
+ breakpoints?: Breakpoint[];
584
+ /**
585
+ * When multiple matches exist for a breakpoint pattern, select:
586
+ * - `'longer'` - Last match in window (prefers longer segments)
587
+ * - `'shorter'` - First match in window (prefers shorter segments)
588
+ *
589
+ * @default 'longer'
590
+ */
591
+ prefer?: 'longer' | 'shorter';
592
+ /**
593
+ * Optional logger for debugging segmentation.
594
+ *
595
+ * Provide a logger to receive detailed information about the segmentation
596
+ * process. Useful for debugging pattern matching, page tracking, and
597
+ * breakpoint processing issues.
598
+ *
599
+ * When not provided, no logging overhead is incurred (methods are not called).
600
+ *
601
+ * Verbosity levels:
602
+ * - `trace`: Per-iteration details (very verbose)
603
+ * - `debug`: Detailed operation information
604
+ * - `info`: Key progress points
605
+ * - `warn`: Potential issues
606
+ * - `error`: Critical failures
607
+ *
608
+ * @example
609
+ * // Console logger for development
610
+ * logger: {
611
+ * debug: console.debug,
612
+ * info: console.info,
613
+ * warn: console.warn,
614
+ * }
615
+ *
616
+ * @example
617
+ * // Custom logger integration
618
+ * logger: {
619
+ * debug: (msg, ...args) => winston.debug(msg, { meta: args }),
620
+ * error: (msg, ...args) => winston.error(msg, { meta: args }),
621
+ * }
622
+ */
623
+ logger?: Logger;
624
+ };
195
625
  /**
196
- * Validates a template string.
626
+ * Output segment produced by `segmentPages()`.
197
627
  *
198
- * @param template - Template to validate
199
- * @param tokens - Token map to validate against
200
- * @returns Validation result with errors if invalid
628
+ * Each segment contains extracted content, page references, and
629
+ * optional metadata from the matched rule and captured groups.
201
630
  *
202
631
  * @example
203
- * validateTemplate('{num} {dash}')
204
- * // Returns: { valid: true }
632
+ * // Simple segment on a single page
633
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
205
634
  *
206
- * validateTemplate('{invalid}')
207
- * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
635
+ * @example
636
+ * // Segment spanning pages 5-7 with captured hadith number
637
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
208
638
  */
209
- declare function validateTemplate(template: string, tokens?: TokenMap): ValidationResult;
639
+ type Segment = {
640
+ /**
641
+ * Segment content with:
642
+ * - Leading/trailing whitespace trimmed
643
+ * - Page breaks converted to spaces (for multi-page segments)
644
+ * - Markers stripped (for `lineStartsAfter` patterns)
645
+ */
646
+ content: string;
647
+ /**
648
+ * Starting page ID (from `Page.id`).
649
+ */
650
+ from: number;
651
+ /**
652
+ * Ending page ID if segment spans multiple pages.
653
+ *
654
+ * Only present when the segment content extends across page boundaries.
655
+ * When `undefined`, the segment is contained within a single page.
656
+ */
657
+ to?: number;
658
+ /**
659
+ * Combined metadata from:
660
+ * 1. Rule's `meta` property (static metadata)
661
+ * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
662
+ *
663
+ * Named captures override static metadata with the same key.
664
+ */
665
+ meta?: Record<string, unknown>;
666
+ };
210
667
  //#endregion
211
- //#region src/markers/type-generators.d.ts
668
+ //#region src/segmentation/segmenter.d.ts
212
669
  /**
213
- * Generates a regular expression for pattern-type markers.
670
+ * Segments pages of content based on pattern-matching rules.
214
671
  *
215
- * Supports two modes:
216
- * 1. Template-based: Uses the `template` field with token expansion
217
- * 2. Pattern-based: Uses the raw `pattern` field as-is
672
+ * This is the main entry point for the segmentation engine. It takes an array
673
+ * of pages and applies the provided rules to identify split points, producing
674
+ * an array of segments with content, page references, and metadata.
218
675
  *
219
- * @param config - Marker configuration with either `template` or `pattern` field
220
- * @returns A compiled RegExp object for matching the pattern
221
- * @throws {Error} When neither `template` nor `pattern` is provided
676
+ * @param pages - Array of pages with id and content
677
+ * @param options - Segmentation options including splitting rules
678
+ * @returns Array of segments with content, from/to page references, and optional metadata
222
679
  *
223
680
  * @example
224
- * // Using template
225
- * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
681
+ * // Split markdown by headers
682
+ * const segments = segmentPages(pages, {
683
+ * rules: [
684
+ * { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
685
+ * ]
686
+ * });
226
687
  *
227
688
  * @example
228
- * // Using raw pattern
229
- * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
689
+ * // Split Arabic hadith text with number extraction
690
+ * const segments = segmentPages(pages, {
691
+ * rules: [
692
+ * {
693
+ * lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
694
+ * split: 'at',
695
+ * fuzzy: true,
696
+ * meta: { type: 'hadith' }
697
+ * }
698
+ * ]
699
+ * });
230
700
  *
231
701
  * @example
232
- * // Using custom tokens
233
- * const regex = generatePatternRegex({
234
- * type: 'pattern',
235
- * template: '{verse}',
236
- * tokens: { verse: '\\[[0-9]+\\]' }
702
+ * // Multiple rules with page constraints
703
+ * const segments = segmentPages(pages, {
704
+ * rules: [
705
+ * { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
706
+ * { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
707
+ * { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
708
+ * ]
237
709
  * });
238
710
  */
239
- declare function generatePatternRegex(config: MarkerConfig): RegExp;
711
+ declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
712
+ //#endregion
713
+ //#region src/segmentation/textUtils.d.ts
240
714
  /**
241
- * Generates a regular expression for 'bab' (chapter) markers.
715
+ * Strip all HTML tags from content, keeping only text.
242
716
  *
243
- * Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.
244
- * The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.
717
+ * @param html - HTML content
718
+ * @returns Plain text content
719
+ */
720
+ declare const stripHtmlTags: (html: string) => string;
721
+ /**
722
+ * Normalizes line endings to Unix-style (`\n`).
245
723
  *
246
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
724
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
725
+ * for consistent pattern matching across platforms.
247
726
  *
248
- * @example
249
- * const regex = generateBabRegex();
250
- * const match = regex.exec('باب الصلاة');
251
- * // match.groups.marker -> 'باب'
252
- * // match.groups.content -> ' الصلاة'
727
+ * @param content - Raw content with potentially mixed line endings
728
+ * @returns Content with all line endings normalized to `\n`
253
729
  */
254
- declare function generateBabRegex(): RegExp;
730
+ declare const normalizeLineEndings: (content: string) => string;
731
+ //#endregion
732
+ //#region src/segmentation/tokens.d.ts
255
733
  /**
256
- * Generates a regular expression for hadith chain (isnad) markers.
734
+ * Token-based template system for Arabic text pattern matching.
257
735
  *
258
- * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
259
- * Uses default phrases from presets or custom phrases from config.
260
- * All phrases are made diacritic-insensitive.
736
+ * This module provides a human-readable way to define regex patterns using
737
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
738
+ * named capture groups for extracting matched values into metadata.
261
739
  *
262
- * @param config - Marker configuration with optional `phrases` array
263
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
740
+ * @module tokens
264
741
  *
265
742
  * @example
266
- * // Using default phrases
267
- * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
268
- * const match = regex.exec('حَدَّثَنَا أبو بكر');
743
+ * // Simple token expansion
744
+ * expandTokens('{{raqms}} {{dash}}')
745
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
269
746
  *
270
747
  * @example
271
- * // Using custom phrases
272
- * const regex = generateHadithChainRegex({
273
- * type: 'hadith-chain',
274
- * phrases: ['قَالَ', 'رَوَى']
275
- * });
748
+ * // Named capture groups
749
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
750
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
276
751
  */
277
- declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
278
752
  /**
279
- * Generates a regular expression for basmala markers.
753
+ * Token definitions mapping human-readable token names to regex patterns.
754
+ *
755
+ * Tokens are used in template strings with double-brace syntax:
756
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
757
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
758
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
280
759
  *
281
- * Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):
282
- * - بسم الله (without diacritics)
283
- * - بِسْمِ اللَّهِ (with diacritics)
284
- * - Special patterns like [بسم, [تم
760
+ * @remarks
761
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
762
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
763
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
285
764
  *
286
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
765
+ * @example
766
+ * // Using tokens in a split rule
767
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
287
768
  *
288
769
  * @example
289
- * const regex = generateBasmalaRegex();
290
- * const match = regex.exec('بسم الله الرحمن الرحيم');
291
- * // match.groups.marker -> 'بسم الله'
770
+ * // Using tokens with named captures
771
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
772
+ *
773
+ * @example
774
+ * // Using the numbered convenience token
775
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
292
776
  */
293
- declare function generateBasmalaRegex(): RegExp;
777
+ declare const TOKEN_PATTERNS: Record<string, string>;
294
778
  /**
295
- * Generates a regular expression for custom phrase markers.
779
+ * Checks if a query string contains template tokens.
296
780
  *
297
- * Similar to hadith-chain markers but requires explicit phrase list.
298
- * All phrases are made diacritic-insensitive.
781
+ * Performs a quick test for `{{token}}` patterns without actually
782
+ * expanding them. Useful for determining whether to apply token
783
+ * expansion to a string.
299
784
  *
300
- * @param config - Marker configuration with required `phrases` array
301
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
302
- * @throws {Error} When `phrases` is undefined or empty
785
+ * @param query - String to check for tokens
786
+ * @returns `true` if the string contains at least one `{{token}}` pattern
303
787
  *
304
788
  * @example
305
- * const regex = generatePhraseRegex({
306
- * type: 'phrase',
307
- * phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
308
- * });
789
+ * containsTokens('{{raqms}} {{dash}}') // → true
790
+ * containsTokens('plain text') // → false
791
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
792
+ */
793
+ declare const containsTokens: (query: string) => boolean;
794
+ /**
795
+ * Result from expanding tokens with capture information.
796
+ *
797
+ * Contains the expanded pattern string along with metadata about
798
+ * any named capture groups that were created.
309
799
  */
310
- declare function generatePhraseRegex(config: MarkerConfig): RegExp;
800
+ type ExpandResult = {
801
+ /**
802
+ * The expanded regex pattern string with all tokens replaced.
803
+ *
804
+ * Named captures use the `(?<name>pattern)` syntax.
805
+ */
806
+ pattern: string;
807
+ /**
808
+ * Names of captured groups extracted from `{{token:name}}` syntax.
809
+ *
810
+ * Empty array if no named captures were found.
811
+ */
812
+ captureNames: string[];
813
+ /**
814
+ * Whether the pattern has any named capturing groups.
815
+ *
816
+ * Equivalent to `captureNames.length > 0`.
817
+ */
818
+ hasCaptures: boolean;
819
+ };
311
820
  /**
312
- * Generates a regular expression for square bracket markers.
821
+ * Expands template tokens with support for named captures.
313
822
  *
314
- * Matches verse or hadith reference numbers in square brackets:
315
- * - [٦٥] - Simple bracket
316
- * - [٦٥] - With bullet prefix
317
- * - ° [٦٥] - With degree prefix
823
+ * This is the primary token expansion function that handles all token syntax:
824
+ * - `{{token}}` - Expands to the token's pattern (no capture group)
825
+ * - `{{token:name}}` - Expands to `(?<name>pattern)` (named capture)
826
+ * - `{{:name}}` - Expands to `(?<name>.+)` (capture anything)
318
827
  *
319
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
828
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
829
+ *
830
+ * @param query - The template string containing tokens
831
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
832
+ * Applied to both token patterns and plain Arabic text between tokens.
833
+ * Typically `makeDiacriticInsensitive` from the fuzzy module.
834
+ * @returns Object with expanded pattern, capture names, and capture flag
320
835
  *
321
836
  * @example
322
- * const regex = generateSquareBracketRegex();
323
- * const match = regex.exec('[٦٥] نص الحديث');
324
- * // match.groups.content -> ' نص الحديث'
325
- */
326
- declare function generateSquareBracketRegex(): RegExp;
327
- /**
328
- * Generates a regular expression for number-letter-separator markers.
837
+ * // Simple token expansion
838
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
839
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
329
840
  *
330
- * Matches patterns like:
331
- * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
332
- * - 5 ب. (Latin number, Arabic letter, dot)
841
+ * @example
842
+ * // Named capture
843
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
844
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
333
845
  *
334
- * @param config - Configuration with required `numbering` and `separator` fields
335
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
846
+ * @example
847
+ * // Capture-only token
848
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
849
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
336
850
  *
337
851
  * @example
338
- * const regex = generateNumLetterRegex({
339
- * numbering: 'arabic-indic',
340
- * separator: 'dash'
341
- * });
342
- * const match = regex.exec('٥ أ - نص');
852
+ * // With fuzzy transform
853
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
854
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
343
855
  */
344
- declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
856
+ declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
345
857
  /**
346
- * Generates a regular expression for number-parenthetical-separator markers.
858
+ * Expands template tokens in a query string to their regex equivalents.
347
859
  *
348
- * Matches patterns like:
349
- * - ٥ (أ) - (number, parenthetical content, separator)
350
- * - 5 (٦) - (number with parenthetical number)
860
+ * This is the simple version without capture support. It returns only the
861
+ * expanded pattern string, not capture metadata.
351
862
  *
352
- * @param config - Configuration with required `numbering` and `separator` fields
353
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
863
+ * Unknown tokens are left as-is, allowing for partial templates.
864
+ *
865
+ * @param query - Template string containing `{{token}}` placeholders
866
+ * @returns Expanded regex pattern string
354
867
  *
355
868
  * @example
356
- * const regex = generateNumParenRegex({
357
- * numbering: 'arabic-indic',
358
- * separator: 'dash'
359
- * });
360
- * const match = regex.exec('٥ (أ) - نص');
869
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
870
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
871
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
872
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
873
+ *
874
+ * @see expandTokensWithCaptures for full capture group support
361
875
  */
362
- declare function generateNumParenRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
876
+ declare const expandTokens: (query: string) => string;
363
877
  /**
364
- * Generates a regular expression for number-slash-number markers.
878
+ * Converts a template string to a compiled RegExp.
365
879
  *
366
- * Matches patterns like:
367
- * - ٥/٦ - (number slash number, separator)
368
- * - ٥ - (single number, separator)
880
+ * Expands all tokens and attempts to compile the result as a RegExp
881
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
369
882
  *
370
- * The second number after the slash is optional.
883
+ * @remarks
884
+ * This function dynamically compiles regular expressions from template strings.
885
+ * If templates may come from untrusted sources, be aware of potential ReDoS
886
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
887
+ * Consider validating pattern complexity or applying execution timeouts when
888
+ * running user-submitted patterns.
371
889
  *
372
- * @param config - Configuration with required `numbering` and `separator` fields
373
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
890
+ * @param template - Template string containing `{{token}}` placeholders
891
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
374
892
  *
375
893
  * @example
376
- * const regex = generateNumSlashRegex({
377
- * numbering: 'arabic-indic',
378
- * separator: 'dash'
379
- * });
380
- * const match1 = regex.exec('٥/٦ - نص');
381
- * const match2 = regex.exec('٥ - نص'); // Also matches
894
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
895
+ * templateToRegex('{{raqms}}+') // → null (expands to '[٠-٩]++'; a stacked `++` quantifier is not valid JavaScript regex syntax)
896
+ * templateToRegex('(((') // → null (invalid regex)
382
897
  */
383
- declare function generateNumSlashRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
898
+ declare const templateToRegex: (template: string) => RegExp | null;
384
899
  /**
385
- * Generates a regular expression for numbered markers with optional format template.
900
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
386
901
  *
387
- * Supports two modes:
388
- * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
389
- * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
902
+ * Useful for documentation, validation, or building user interfaces
903
+ * that show available tokens.
390
904
  *
391
- * When using default pattern:
392
- * - Separator 'none' generates pattern without separator
393
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
394
- *
395
- * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
396
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
905
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
397
906
  *
398
907
  * @example
399
- * // Using format template
400
- * const regex = generateNumberedRegex({
401
- * numbering: 'arabic-indic',
402
- * separator: 'dash',
403
- * format: '{bullet}+ {num} {dash}'
404
- * });
908
+ * getAvailableTokens()
909
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
910
+ */
911
+ declare const getAvailableTokens: () => string[];
912
+ /**
913
+ * Gets the regex pattern for a specific token name.
405
914
  *
406
- * @example
407
- * // Using default pattern
408
- * const regex = generateNumberedRegex({
409
- * numbering: 'arabic-indic',
410
- * separator: 'dash'
411
- * });
412
- * const match = regex.exec('٥ - نص');
915
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
916
+ * without any expansion or capture group wrapping.
917
+ *
918
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
919
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
413
920
  *
414
921
  * @example
415
- * // With 'none' separator
416
- * const regex = generateNumberedRegex({
417
- * numbering: 'latin',
418
- * separator: 'none'
419
- * });
420
- * const match = regex.exec('5 text');
922
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
923
+ * getTokenPattern('dash') // → '[-–—ـ]'
924
+ * getTokenPattern('unknown') // → undefined
421
925
  */
422
- declare function generateNumberedRegex(config: Pick<MarkerConfig, 'numbering' | 'separator' | 'format' | 'tokens'>): RegExp;
926
+ declare const getTokenPattern: (tokenName: string) => string | undefined;
927
+ //#endregion
928
+ //#region src/pattern-detection.d.ts
423
929
  /**
424
- * Generates a regular expression for bullet-point markers.
930
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
931
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
425
932
  *
426
- * Matches common bullet characters:
427
- * - • (bullet)
428
- * - * (asterisk)
429
- * - ° (degree)
430
- * - - (dash)
933
+ * @module pattern-detection
934
+ */
935
+ /**
936
+ * Result of detecting a token pattern in text
937
+ */
938
+ type DetectedPattern = {
939
+ /** Token name from TOKEN_PATTERNS (e.g., 'raqms', 'dash') */
940
+ token: string;
941
+ /** The matched text */
942
+ match: string;
943
+ /** Start index in the original text */
944
+ index: number;
945
+ /** End index (exclusive) */
946
+ endIndex: number;
947
+ };
948
+ /**
949
+ * Analyzes text and returns all detected token patterns with their positions.
950
+ * Patterns are detected in priority order to avoid partial matches.
431
951
  *
432
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
952
+ * @param text - The text to analyze for token patterns
953
+ * @returns Array of detected patterns sorted by position
433
954
  *
434
955
  * @example
435
- * const regex = generateBulletRegex();
436
- * const match = regex.exec('• نقطة');
437
- * // match.groups.content -> 'نقطة'
956
+ * detectTokenPatterns("٣٤ - حدثنا")
957
+ * // Returns: [
958
+ * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
959
+ * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
960
+ * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
961
+ * // ]
438
962
  */
439
- declare function generateBulletRegex(): RegExp;
963
+ declare const detectTokenPatterns: (text: string) => DetectedPattern[];
440
964
  /**
441
- * Generates a regular expression for Markdown-style heading markers.
442
- *
443
- * Matches heading levels using hash symbols:
444
- * - # Heading 1
445
- * - ## Heading 2
446
- * - ### Heading 3
447
- * - etc.
965
+ * Generates a template pattern from text using detected tokens.
966
+ * Replaces matched portions with {{token}} syntax.
448
967
  *
449
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
968
+ * @param text - Original text
969
+ * @param detected - Array of detected patterns from detectTokenPatterns
970
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
450
971
  *
451
972
  * @example
452
- * const regex = generateHeadingRegex();
453
- * const match = regex.exec('## عنوان فرعي');
454
- * // match.groups.marker -> '## '
455
- * // match.groups.content -> 'عنوان فرعي'
973
+ * const detected = detectTokenPatterns("٣٤ - ");
974
+ * generateTemplateFromText("٣٤ - ", detected);
975
+ * // Returns: "{{raqms}} {{dash}} "
976
+ */
977
+ declare const generateTemplateFromText: (text: string, detected: DetectedPattern[]) => string;
978
+ /**
979
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
980
+ *
981
+ * @param detected - Array of detected patterns
982
+ * @returns Suggested pattern type and whether to use fuzzy matching
983
+ */
984
+ declare const suggestPatternConfig: (detected: DetectedPattern[]) => {
985
+ patternType: "lineStartsWith" | "lineStartsAfter";
986
+ fuzzy: boolean;
987
+ metaType?: string;
988
+ };
989
+ /**
990
+ * Analyzes text and generates a complete suggested rule configuration.
991
+ *
992
+ * @param text - Highlighted text from the page
993
+ * @returns Suggested rule configuration or null if no patterns detected
456
994
  */
457
- declare function generateHeadingRegex(): RegExp;
995
+ declare const analyzeTextForRule: (text: string) => {
996
+ template: string;
997
+ patternType: "lineStartsWith" | "lineStartsAfter";
998
+ fuzzy: boolean;
999
+ metaType?: string;
1000
+ detected: DetectedPattern[];
1001
+ } | null;
458
1002
  //#endregion
459
- export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, type MarkerConfig, type MarkerType, NUMBERING_PATTERNS, type NumberingStyle, SEPARATOR_PATTERNS, type SeparatorStyle, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
1003
+ export { type Breakpoint, type BreakpointRule, type DetectedPattern, type ExpandResult, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
460
1004
  //# sourceMappingURL=index.d.mts.map