flappa-doormal 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +407 -205
- package/dist/index.d.mts +722 -332
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1363 -396
- package/dist/index.mjs.map +1 -1
- package/package.json +11 -9
package/dist/index.d.mts
CHANGED
@@ -1,460 +1,850 @@
-//#region src/
+//#region src/segmentation/fuzzy.d.ts
 /**
- *
-
-
-
- *
+ * Fuzzy matching utilities for Arabic text.
+ *
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
+ * This allows matching text regardless of:
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
+ *
+ * @module fuzzy
+ *
+ * @example
+ * // Make a pattern diacritic-insensitive
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
  */
-type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';
 /**
- *
+ * Escapes a string for safe inclusion in a regular expression.
+ *
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('[test]') // → '\\[test\\]'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
  */
-
+declare const escapeRegex: (s: string) => string;
 /**
- *
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-   * For 'hadith-chain', defaults to common narrator patterns if not provided.
-   */
-  phrases?: string[];
-  /**
-   * Optional: Only apply this marker after a specific page number.
-   * Useful for books with different formatting in front matter vs main content.
-   */
-  minPage?: number;
-  /**
-   * Optional: Arbitrary metadata to attach to entries matched by this marker.
-   * This allows for agnostic handling of entry properties.
-   * Example: { type: 0, category: 'hadith' }
-   */
-  metadata?: Record<string, any>;
-};
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
+ *
+ * Transforms input text into a regex pattern that matches the text regardless
+ * of diacritical marks (harakat) and character variations. Each character in
+ * the input is:
+ * 1. Expanded to its equivalence class (if applicable)
+ * 2. Followed by an optional diacritics matcher
+ *
+ * This allows matching:
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
+ * - `الإيمان` with `الايمان` (alef variants)
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
+ *
+ * @param text - Input Arabic text to make diacritic-insensitive
+ * @returns Regex pattern string that matches the text with or without diacritics
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * // Each char gets equivalence class + optional diacritics
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('باب');
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
+ * new RegExp(pattern, 'u').test('باب') // → true
+ *
+ * @example
+ * // Using with split rules
+ * {
+ *   lineStartsWith: ['باب'],
+ *   split: 'at',
+ *   fuzzy: true // Applies makeDiacriticInsensitive internally
+ * }
+ */
+declare const makeDiacriticInsensitive: (text: string) => string;
 //#endregion
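The fuzzy helpers above are what the `fuzzy: true` flag on split rules applies internally. A minimal usage sketch, assuming the declarations are imported from the package's main entry (`flappa-doormal`):

```ts
import { escapeRegex, makeDiacriticInsensitive } from 'flappa-doormal';

// Escape literal text before embedding it in a larger pattern.
const literal = escapeRegex('(٥)'); // '\\(٥\\)'

// Build a diacritic-insensitive pattern for the chapter marker 'باب'.
const pattern = makeDiacriticInsensitive('باب');
const re = new RegExp(pattern, 'u');

re.test('باب');   // true
re.test('بَابٌ'); // true — diacritics are ignored
```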
-//#region src/
+//#region src/segmentation/types.d.ts
 /**
- *
-
-
-
- *
-
-
-
- *
+ * Literal regex pattern rule - no token expansion is applied.
+ *
+ * Use this when you need full control over the regex pattern.
+ * If the regex contains capturing groups, the captured content
+ * will be used as the segment content.
+ *
+ * @example
+ * // Match Arabic-Indic numbers followed by a dash
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
+ *
+ * @example
+ * // Capture group - content after the marker becomes segment content
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
  */
-
+type RegexPattern = {
+  /** Raw regex pattern string (no token expansion) */
+  regex: string;
+};
 /**
- *
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
+ *
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
+ *
+ * @example
+ * // Using tokens for Arabic-Indic digits
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
+ *
+ * @example
+ * // Named capture to extract hadith number into metadata
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
+ *
+ * @see TOKEN_PATTERNS for available tokens
  */
-
+type TemplatePattern = {
+  /** Template string with `{{token}}` or `{{token:name}}` placeholders */
+  template: string;
+};
 /**
- *
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
+ *
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker
+ * is **included** in the segment content.
+ *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * // Split at chapter headings (marker included in content)
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
+ *
+ * @example
+ * // Split at Arabic book/chapter markers with fuzzy matching
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
  */
-
-
-
+type LineStartsWithPattern = {
+  /** Array of patterns that mark line beginnings (marker included in content) */
+  lineStartsWith: string[];
+};
 /**
- *
- *
- *
- *
- *
+ * Line-start-after pattern rule - matches lines starting with patterns,
+ * but **excludes** the marker from the segment content.
+ *
+ * Behaves like `lineStartsWith` but strips the marker from the output.
+ * The segment content starts after the marker and extends to the next split point
+ * (not just the end of the matching line).
  *
- *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
  *
- * @
- *
+ * @example
+ * // Split at numbered hadiths, capturing content without the number prefix
+ * // Content extends to next split, not just end of that line
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
  *
  * @example
- *
- *
- * match.groups.full // "٥ - نص"
- * match.groups.marker // "٥ -"
- * match.groups.content // "نص"
+ * // Extract hadith number to metadata while stripping the prefix
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
  */
-
-
-
+type LineStartsAfterPattern = {
+  /** Array of patterns that mark line beginnings (marker excluded from content) */
+  lineStartsAfter: string[];
+};
 /**
- *
- *
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
+ *
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`.
+ *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * // Split at lines ending with Arabic sentence-ending punctuation
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
  */
+type LineEndsWithPattern = {
+  /** Array of patterns that mark line endings */
+  lineEndsWith: string[];
+};
 /**
- *
- *
+ * Union of all pattern types for split rules.
+ *
+ * Each rule must have exactly ONE pattern type:
+ * - `regex` - Raw regex pattern (no token expansion)
+ * - `template` - Pattern with `{{token}}` expansion
+ * - `lineStartsWith` - Match line beginnings (marker included)
+ * - `lineStartsAfter` - Match line beginnings (marker excluded)
+ * - `lineEndsWith` - Match line endings
  */
-
+type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
 /**
- *
- *
+ * Configuration for how and where to split content when a pattern matches.
+ *
+ * Controls the split position relative to matches, which occurrences to
+ * split on, page span limits, and fuzzy matching for Arabic text.
  */
-
-
-
+type SplitBehavior = {
+  /**
+   * Where to split relative to the match.
+   * - `'at'`: New segment starts at the match position
+   * - `'after'`: New segment starts after the match ends
+   */
+  split: 'at' | 'after';
+  /**
+   * Which occurrence(s) to split on.
+   * - `'all'`: Split at every match (default)
+   * - `'first'`: Only split at the first match
+   * - `'last'`: Only split at the last match
+   *
+   * When `maxSpan` is set, occurrence filtering is applied per sliding
+   * window rather than globally. With `'last'`, the algorithm prefers
+   * longer segments by looking as far ahead as allowed before selecting
+   * the last match in the window.
+   *
+   * @default 'all'
+   */
+  occurrence?: 'first' | 'last' | 'all';
+  /**
+   * Maximum page ID difference allowed when looking ahead for split points.
+   *
+   * Uses a sliding window algorithm that prefers longer segments:
+   * 1. Start from the first page of the current segment
+   * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
+   * 3. Apply occurrence filter (e.g., 'last') to select a match
+   * 4. Next window starts from the page after the match
+   *
+   * Examples:
+   * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
+   * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
+   * - `undefined` = no limit (entire content treated as one group)
+   *
+   * Note: With non-consecutive page IDs, the algorithm uses actual ID
+   * difference, not array index. Pages 1 and 5 have a difference of 4.
+   *
+   * @example
+   * // Split at last period, looking up to 1 page ahead
+   * // Pages 1,2: split at page 2's last period
+   * // Page 3: split at page 3's last period
+   * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
+   */
+  maxSpan?: number;
+  /**
+   * Enable diacritic-insensitive matching for Arabic text.
+   *
+   * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
+   * `lineStartsAfter` are transformed to match text regardless of:
+   * - Diacritics (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، etc.
+   * - Character equivalences: ا/آ/أ/إ, ة/ه, ى/ي
+   *
+   * **Note**: Does NOT apply to `regex` or `template` patterns.
+   * For templates, apply fuzzy manually using `makeDiacriticInsensitive()`.
+   *
+   * @default false
+   */
+  fuzzy?: boolean;
+};
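The `occurrence`/`maxSpan` interaction documented above is easiest to see on a concrete rule. A sketch with illustrative page contents, following the `lineEndsWith` example in the JSDoc:

```ts
import { segmentPages, type Page } from 'flappa-doormal';

const pages: Page[] = [
  { id: 1, content: 'First sentence. Second sentence.' },
  { id: 2, content: 'Third sentence. Fourth sentence.' },
  { id: 3, content: 'Fifth sentence.' },
];

// Split after the last period in each sliding window:
// maxSpan: 1 means a window covers the current page plus one page ahead,
// so pages 1–2 split at page 2's last period, then page 3 is handled alone.
const segments = segmentPages(pages, {
  rules: [{ lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }],
});
```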
 /**
- *
- *
+ * A single page ID or a range of page IDs.
+ *
+ * - `number`: A single page ID
+ * - `[number, number]`: A range from first to second (inclusive)
+ *
+ * @example
+ * 5          // Single page 5
+ * [10, 20]   // Pages 10 through 20 (inclusive)
  */
+type PageRange = number | [number, number];
 /**
- *
- *
+ * Optional constraints and metadata for a split rule.
+ *
+ * Use constraints to limit which pages a rule applies to, and
+ * metadata to attach arbitrary data to resulting segments.
  */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+type RuleConstraints = {
+  /**
+   * Minimum page ID for this rule to apply.
+   *
+   * Matches on pages with `id < min` are ignored.
+   *
+   * @example
+   * // Only apply rule starting from page 10
+   * { min: 10, lineStartsWith: ['##'], split: 'before' }
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this rule to apply.
+   *
+   * Matches on pages with `id > max` are ignored.
+   *
+   * @example
+   * // Only apply rule up to page 100
+   * { max: 100, lineStartsWith: ['##'], split: 'before' }
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this rule.
+   *
+   * Use this to skip the rule for specific pages without needing
+   * to repeat the rule with different min/max values.
+   *
+   * @example
+   * // Exclude specific pages
+   * { exclude: [1, 2, 5] }
+   *
+   * @example
+   * // Exclude page ranges
+   * { exclude: [[1, 10], [50, 100]] }
+   *
+   * @example
+   * // Mix single pages and ranges
+   * { exclude: [1, [5, 10], 50] }
+   */
+  exclude?: PageRange[];
+  /**
+   * Arbitrary metadata attached to segments matching this rule.
+   *
+   * This metadata is merged with any named captures from the pattern.
+   * Named captures (e.g., `{{raqms:num}}`) take precedence over
+   * static metadata with the same key.
+   *
+   * @example
+   * // Tag segments as chapters
+   * { lineStartsWith: ['{{bab}}'], split: 'before', meta: { type: 'chapter' } }
+   */
+  meta?: Record<string, unknown>;
+  /**
+   * Fallback behavior when no matches are found within a maxSpan boundary.
+   * - 'page': Create split points at page boundaries
+   * - undefined: No fallback (current behavior)
+   */
+  fallback?: 'page';
 };
-type TokenMap = Record<string, string>;
-//#endregion
-//#region src/markers/template-parser.d.ts
 /**
- *
+ * A complete split rule combining pattern, behavior, and constraints.
+ *
+ * Each rule must specify:
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
+ *   `lineStartsAfter`, or `lineEndsWith`
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
+ * - **Constraints** (optional): `min`, `max`, `meta`
+ *
+ * @example
+ * // Basic rule: split at markdown headers
+ * const rule: SplitRule = {
+ *   lineStartsWith: ['## ', '### '],
+ *   split: 'at',
+ *   meta: { type: 'section' }
+ * };
+ *
+ * @example
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
+ * const rule: SplitRule = {
+ *   lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *   split: 'at',
+ *   fuzzy: true,
+ *   min: 5,
+ *   max: 500,
+ *   meta: { type: 'hadith' }
+ * };
  */
-
-  valid: boolean;
-  errors?: string[];
-}
+type SplitRule = PatternType & SplitBehavior & RuleConstraints;
 /**
- *
+ * Input page structure for segmentation.
+ *
+ * Each page represents a logical unit of content (e.g., a book page,
+ * a document section) that can be tracked across segment boundaries.
+ *
+ * @example
+ * const pages: Page[] = [
+ *   { id: 1, content: '## Chapter 1\nFirst paragraph...' },
+ *   { id: 2, content: 'Continued text...\n## Chapter 2' },
+ * ];
  */
-
-/**
-
-
+type Page = {
+  /**
+   * Unique page/entry ID used for:
+   * - `maxSpan` grouping (segments spanning multiple pages)
+   * - `min`/`max` constraint filtering
+   * - `from`/`to` tracking in output segments
+   */
+  id: number;
+  /**
+   * Raw page content (may contain HTML).
+   *
+   * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
+   * Use a utility to convert html to markdown or `stripHtmlTags()` to preprocess HTML.
+   */
+  content: string;
+};
 /**
- *
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
+ * A breakpoint pattern with optional page constraints.
  *
- *
+ * Use this to control which pages a breakpoint pattern applies to.
+ * Patterns outside the specified range are skipped, allowing
+ * the next breakpoint pattern (or fallback) to be tried.
  *
- * @
- *
- *
+ * @example
+ * // Only apply punctuation-based breaking from page 10 onwards
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
 *
  * @example
- *
- *
+ * // Apply to specific page range (pages 10-50)
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
  */
-
+type BreakpointRule = {
+  /**
+   * Regex pattern for breaking (supports token expansion).
+   * Empty string `''` means fall back to page boundary.
+   */
+  pattern: string;
+  /**
+   * Minimum page ID for this breakpoint to apply.
+   * Segments starting before this page skip this pattern.
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this breakpoint to apply.
+   * Segments starting after this page skip this pattern.
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this breakpoint.
+   *
+   * Use this to skip the breakpoint for specific pages without needing
+   * to repeat the breakpoint with different min/max values.
+   *
+   * @example
+   * // Exclude specific pages
+   * { pattern: '\\.\\s*', exclude: [1, 2, 5] }
+   *
+   * @example
+   * // Exclude page ranges (front matter pages 1-10)
+   * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] }
+   *
+   * @example
+   * // Mix single pages and ranges
+   * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] }
+   */
+  exclude?: PageRange[];
+  /**
+   * Skip this breakpoint if the segment content matches this pattern.
+   *
+   * Supports token expansion (e.g., `{{kitab}}`). When the segment's
+   * remaining content matches this regex, the breakpoint pattern is
+   * skipped and the next breakpoint in the array is tried.
+   *
+   * Useful for excluding title pages or front matter without needing
+   * to specify explicit page ranges.
+   *
+   * @example
+   * // Skip punctuation breakpoint for short content (likely titles)
+   * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
+   *
+   * @example
+   * // Skip for content containing "kitab" (book) marker
+   * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
+   */
+  skipWhen?: string;
+};
 /**
- *
+ * A breakpoint can be a simple string pattern or an object with constraints.
  *
- *
- *
+ * String breakpoints apply to all pages. Object breakpoints can specify
+ * `min`/`max` to limit which pages they apply to.
  *
  * @example
- *
- *
- *
- *
+ * // String (applies everywhere)
+ * '{{tarqim}}\\s*'
+ *
+ * @example
+ * // Object with constraints (only from page 10+)
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
  */
-
+type Breakpoint = string | BreakpointRule;
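Putting the pattern, behavior, and constraint pieces together: a sketch of a single `SplitRule` that strips a numbered marker, captures the number into metadata, and is limited by page constraints (the page numbers are illustrative):

```ts
import type { SplitRule } from 'flappa-doormal';

const hadithRule: SplitRule = {
  // Pattern: marker excluded from content, number captured as `num`
  lineStartsAfter: ['{{raqms:num}} {{dash}} '],
  // Behavior: new segment starts at the match; Arabic matching is diacritic-insensitive
  split: 'at',
  fuzzy: true,
  // Constraints: skip the front matter and one excluded range
  min: 5,
  exclude: [[200, 210]],
  // Static metadata; the named capture `num` is merged into each segment's meta
  meta: { type: 'hadith' },
};
```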
 /**
- *
+ * Segmentation options controlling how pages are split.
  *
- * @
- *
- *
+ * @example
+ * // Basic structural rules only
+ * const options: SegmentationOptions = {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
+ *     { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
+ *   ]
+ * };
  *
  * @example
- *
- *
+ * // With breakpoints for oversized segments
+ * const options: SegmentationOptions = {
+ *   rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
+ *   maxPages: 2,
+ *   breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
+ *   prefer: 'longer'
+ * };
+ */
+type SegmentationOptions = {
+  /**
+   * Rules applied in order to find split points.
+   *
+   * All rules are evaluated against the content, and their matches
+   * are combined to determine final split points. The first matching
+   * rule's metadata is used for each segment.
+   */
+  rules?: SplitRule[];
+  /**
+   * Maximum pages per segment before breakpoints are applied.
+   *
+   * When a segment spans more pages than this limit, the `breakpoints`
+   * patterns are tried (in order) to find a suitable break point within
+   * the allowed window.
+   *
+   * Structural markers (from rules) always take precedence - segments
+   * are only broken within their rule-defined boundaries, never across them.
+   *
+   * @example
+   * // Break segments that exceed 2 pages
+   * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
+   */
+  maxPages?: number;
+  /**
+   * Patterns tried in order to break oversized segments.
+   *
+   * Each pattern is tried until one matches within the allowed page window.
+   * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
+   * matches the page boundary (always succeeds as ultimate fallback).
+   *
+   * Patterns can be simple strings (apply everywhere) or objects with
+   * `min`/`max` constraints to limit which pages they apply to.
+   *
+   * Patterns are checked in order - put preferred break styles first:
+   * - `{{tarqim}}\\s*` - Break at sentence-ending punctuation
+   * - `\\n` - Break at line breaks (useful for OCR content)
+   * - `''` - Break at page boundary (always works)
+   *
+   * Only applied to segments exceeding `maxPages`.
+   *
+   * @example
+   * // Simple patterns (backward compatible)
+   * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
+   *
+   * @example
+   * // Object patterns with page constraints
+   * breakpoints: [
+   *   { pattern: '{{tarqim}}\\s*', min: 10 }, // Only from page 10+
+   *   '' // Fallback for pages 1-9
+   * ]
+   */
+  breakpoints?: Breakpoint[];
+  /**
+   * When multiple matches exist for a breakpoint pattern, select:
+   * - `'longer'` - Last match in window (prefers longer segments)
+   * - `'shorter'` - First match in window (prefers shorter segments)
+   *
+   * @default 'longer'
+   */
+  prefer?: 'longer' | 'shorter';
+};
+/**
+ * Output segment produced by `segmentPages()`.
+ *
+ * Each segment contains extracted content, page references, and
+ * optional metadata from the matched rule and captured groups.
  *
- *
- * //
+ * @example
+ * // Simple segment on a single page
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
+ *
+ * @example
+ * // Segment spanning pages 5-7 with captured hadith number
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
  */
-
+type Segment = {
+  /**
+   * Segment content with:
+   * - Leading/trailing whitespace trimmed
+   * - Page breaks converted to spaces (for multi-page segments)
+   * - Markers stripped (for `lineStartsAfter` patterns)
+   */
+  content: string;
+  /**
+   * Starting page ID (from `Page.id`).
+   */
+  from: number;
+  /**
+   * Ending page ID if segment spans multiple pages.
+   *
+   * Only present when the segment content extends across page boundaries.
+   * When `undefined`, the segment is contained within a single page.
+   */
+  to?: number;
+  /**
+   * Combined metadata from:
+   * 1. Rule's `meta` property (static metadata)
+   * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
+   *
+   * Named captures override static metadata with the same key.
+   */
+  meta?: Record<string, unknown>;
+};
 //#endregion
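How the options compose in practice: a sketch combining a structural rule with `maxPages` and `breakpoints`, including one constrained `BreakpointRule` (the tokens and page numbers are illustrative):

```ts
import { segmentPages, type Page, type SegmentationOptions } from 'flappa-doormal';

const pages: Page[] = [{ id: 1, content: 'باب الإيمان\n...' }];

const options: SegmentationOptions = {
  rules: [{ lineStartsWith: ['{{bab}}'], split: 'at', fuzzy: true, meta: { type: 'chapter' } }],
  // Break any chapter spanning more than 2 pages...
  maxPages: 2,
  breakpoints: [
    // ...preferably at sentence-ending punctuation, skipping short
    // title-like content, and only from page 10 onwards
    { pattern: '{{tarqim}}\\s*', min: 10, skipWhen: '^.{1,20}$' },
    '', // ultimate fallback: break at the page boundary
  ],
  prefer: 'longer',
};

const segments = segmentPages(pages, options);
```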
-//#region src/
+//#region src/segmentation/segmenter.d.ts
 /**
- *
+ * Segments pages of content based on pattern-matching rules.
  *
- *
- *
- *
+ * This is the main entry point for the segmentation engine. It takes an array
+ * of pages and applies the provided rules to identify split points, producing
+ * an array of segments with content, page references, and metadata.
  *
- * @param
- * @
- * @
+ * @param pages - Array of pages with id and content
+ * @param options - Segmentation options including splitting rules
+ * @returns Array of segments with content, from/to page references, and optional metadata
  *
  * @example
- * //
- * const
+ * // Split markdown by headers
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
+ *   ]
+ * });
 *
  * @example
- * //
- * const
+ * // Split Arabic hadith text with number extraction
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     {
+ *       lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *       split: 'at',
+ *       fuzzy: true,
+ *       meta: { type: 'hadith' }
+ *     }
+ *   ]
+ * });
 *
  * @example
- * //
- * const
- *
- *
- *
+ * // Multiple rules with page constraints
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
+ *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
+ *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
+ *   ]
  * });
  */
-declare
+declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
+//#endregion
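A minimal end-to-end run of `segmentPages`, following the first JSDoc example above; the exact segment contents depend on the implementation, but the output shape follows the `Segment` type:

```ts
import { segmentPages } from 'flappa-doormal';

const segments = segmentPages(
  [
    { id: 1, content: '## Intro\nSome text.' },
    { id: 2, content: 'More text.\n## Next chapter\nBody.' },
  ],
  { rules: [{ lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }] },
);

for (const s of segments) {
  // Each segment reports its starting page, optional ending page, and merged metadata.
  console.log(s.from, s.to, s.meta, s.content.slice(0, 40));
}
```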
+//#region src/segmentation/textUtils.d.ts
 /**
- *
+ * Strip all HTML tags from content, keeping only text.
  *
- *
- *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+declare const stripHtmlTags: (html: string) => string;
+/**
+ * Normalizes line endings to Unix-style (`\n`).
 *
- *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
 *
- * @
- *
- * const match = regex.exec('باب الصلاة');
- * // match.groups.marker -> 'باب'
- * // match.groups.content -> ' الصلاة'
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
  */
-declare
+declare const normalizeLineEndings: (content: string) => string;
+//#endregion
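The two text utilities above are intended as preprocessing steps for HTML-ish sources before segmentation; a sketch:

```ts
import { normalizeLineEndings, segmentPages, stripHtmlTags } from 'flappa-doormal';

const rawPages = [{ id: 1, content: '<p>١ - حديث أول.</p>\r\n<p>٢ - حديث ثان.</p>' }];

// Strip tags and normalize line endings before handing pages to the segmenter.
// (Per the Page docs, line endings are also normalized internally, so the
// explicit call here is belt-and-braces.)
const pages = rawPages.map((p) => ({
  id: p.id,
  content: normalizeLineEndings(stripHtmlTags(p.content)),
}));

const segments = segmentPages(pages, {
  rules: [{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }],
});
```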
+//#region src/segmentation/tokens.d.ts
 /**
- *
+ * Token-based template system for Arabic text pattern matching.
 *
- *
- *
- *
+ * This module provides a human-readable way to define regex patterns using
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
+ * named capture groups for extracting matched values into metadata.
 *
- * @
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
+ * @module tokens
 *
  * @example
- * //
- *
- *
+ * // Simple token expansion
+ * expandTokens('{{raqms}} {{dash}}')
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
 *
  * @example
- * //
- *
- *
- *   phrases: ['قَالَ', 'رَوَى']
- * });
+ * // Named capture groups
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
  */
-declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
 /**
- *
+ * Token definitions mapping human-readable token names to regex patterns.
 *
- *
- * -
- * -
- * -
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
 *
- * @
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
  * @example
- *
- *
- * // match.groups.marker -> 'بسم الله'
- */
-declare function generateBasmalaRegex(): RegExp;
-/**
- * Generates a regular expression for custom phrase markers.
- *
- * Similar to hadith-chain markers but requires explicit phrase list.
- * All phrases are made diacritic-insensitive.
+ * // Using tokens in a split rule
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
 *
- * @
- *
- *
+ * @example
+ * // Using tokens with named captures
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
 *
  * @example
- *
- *
- *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
- * });
+ * // Using the numbered convenience token
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
  */
-declare
+declare const TOKEN_PATTERNS: Record<string, string>;
 /**
- *
+ * Checks if a query string contains template tokens.
 *
- *
- *
- *
- * - ° [٦٥] - With degree prefix
+ * Performs a quick test for `{{token}}` patterns without actually
+ * expanding them. Useful for determining whether to apply token
+ * expansion to a string.
 *
- * @
+ * @param query - String to check for tokens
+ * @returns `true` if the string contains at least one `{{token}}` pattern
 *
  * @example
- *
- *
- * //
+ * containsTokens('{{raqms}} {{dash}}') // → true
+ * containsTokens('plain text') // → false
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
+ */
+declare const containsTokens: (query: string) => boolean;
+/**
+ * Result from expanding tokens with capture information.
+ *
+ * Contains the expanded pattern string along with metadata about
+ * any named capture groups that were created.
  */
-
+type ExpandResult = {
+  /**
+   * The expanded regex pattern string with all tokens replaced.
+   *
+   * Named captures use the `(?<name>pattern)` syntax.
+   */
+  pattern: string;
+  /**
+   * Names of captured groups extracted from `{{token:name}}` syntax.
+   *
+   * Empty array if no named captures were found.
+   */
+  captureNames: string[];
+  /**
+   * Whether the pattern has any named capturing groups.
+   *
+   * Equivalent to `captureNames.length > 0`.
+   */
+  hasCaptures: boolean;
+};
 /**
- *
+ * Expands template tokens with support for named captures.
+ *
+ * This is the primary token expansion function that handles all token syntax:
+ * - `{{token}}` → Expands to the token's pattern (no capture group)
+ * - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
+ * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
 *
- *
- * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
- * - 5 ب. (Latin number, Arabic letter, dot)
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
 *
- * @param
- * @
+ * @param query - The template string containing tokens
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+ *   Applied to both token patterns and plain Arabic text between tokens.
+ *   Typically `makeDiacriticInsensitive` from the fuzzy module.
+ * @returns Object with expanded pattern, capture names, and capture flag
 *
  * @example
- *
- *
- *
- * });
- * const match = regex.exec('٥ أ - نص');
- */
-declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
-/**
- * Generates a regular expression for number-parenthetical-separator markers.
+ * // Simple token expansion
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
 *
- *
- *
- *
+ * @example
+ * // Named capture
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
 *
- * @
- *
+ * @example
+ * // Capture-only token
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
 *
  * @example
- *
- *
- *
- * });
- * const match = regex.exec('٥ (أ) - نص');
+ * // With fuzzy transform
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
  */
-declare
+declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
 /**
- *
+ * Expands template tokens in a query string to their regex equivalents.
 *
- *
- *
- * - ٥ - (single number, separator)
+ * This is the simple version without capture support. It returns only the
+ * expanded pattern string, not capture metadata.
 *
- *
+ * Unknown tokens are left as-is, allowing for partial templates.
 *
- * @param
- * @returns
+ * @param query - Template string containing `{{token}}` placeholders
+ * @returns Expanded regex pattern string
 *
  * @example
- *
- *
- *
- * })
- *
- *
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
+ *
+ * @see expandTokensWithCaptures for full capture group support
  */
-declare
+declare const expandTokens: (query: string) => string;
 /**
- *
+ * Converts a template string to a compiled RegExp.
 *
- *
- *
- * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
+ * Expands all tokens and attempts to compile the result as a RegExp
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
 *
- *
- *
- *
+ * @remarks
+ * This function dynamically compiles regular expressions from template strings.
+ * If templates may come from untrusted sources, be aware of potential ReDoS
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+ * Consider validating pattern complexity or applying execution timeouts when
+ * running user-submitted patterns.
 *
- * @param
- * @returns
+ * @param template - Template string containing `{{token}}` placeholders
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
 *
  * @example
- * //
- *
- *
- *   separator: 'dash',
- *   format: '{bullet}+ {num} {dash}'
- * });
- *
- * @example
- * // Using default pattern
- * const regex = generateNumberedRegex({
- *   numbering: 'arabic-indic',
- *   separator: 'dash'
- * });
- * const match = regex.exec('٥ - نص');
- *
- * @example
- * // With 'none' separator
- * const regex = generateNumberedRegex({
- *   numbering: 'latin',
- *   separator: 'none'
- * });
- * const match = regex.exec('5 text');
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
+ * templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
+ * templateToRegex('(((') // → null (invalid regex)
  */
-declare
+declare const templateToRegex: (template: string) => RegExp | null;
 /**
- *
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
 *
- *
- *
- * - * (asterisk)
- * - ° (degree)
- * - - (dash)
+ * Useful for documentation, validation, or building user interfaces
+ * that show available tokens.
 *
- * @returns
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
 *
  * @example
- *
- *
- * // match.groups.content -> 'نقطة'
+ * getAvailableTokens()
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
  */
-declare
+declare const getAvailableTokens: () => string[];
 /**
- *
+ * Gets the regex pattern for a specific token name.
 *
- *
- *
- * - ## Heading 2
- * - ### Heading 3
- * - etc.
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+ * without any expansion or capture group wrapping.
 *
- * @
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
 *
  * @example
- *
- *
- * //
- * // match.groups.content -> 'عنوان فرعي'
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash') // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
  */
-declare
+declare const getTokenPattern: (tokenName: string) => string | undefined;
 //#endregion
-export {
+export { type Breakpoint, type BreakpointRule, type ExpandResult, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
 //# sourceMappingURL=index.d.mts.map
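For completeness, the token helpers exported above can also be used directly when building patterns outside of split rules. A sketch; the exact expansion of each token is defined by `TOKEN_PATTERNS`:

```ts
import {
  containsTokens,
  expandTokens,
  expandTokensWithCaptures,
  getAvailableTokens,
  getTokenPattern,
  templateToRegex,
} from 'flappa-doormal';

getAvailableTokens();            // e.g. ['bab', 'basmala', 'bullet', 'dash', ...]
getTokenPattern('dash');         // raw pattern string behind {{dash}}
containsTokens('{{raqms}} -');   // true

// Expand a template to a plain pattern string, or keep the capture metadata.
const plain = expandTokens('^{{raqms}} {{dash}} ');
const { pattern, captureNames } = expandTokensWithCaptures('^{{raqms:num}} {{dash}} ');

// Or compile directly; null is returned if the expanded pattern is not a valid regex.
const re = templateToRegex('^{{raqms}} {{dash}} ');
re?.test('٥ - نص'); // true: Arabic-Indic number, dash, text
```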