flappa-doormal 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +325 -0
- package/README.md +477 -199
- package/dist/index.d.mts +871 -327
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1611 -393
- package/dist/index.mjs.map +1 -1
- package/package.json +13 -10
package/dist/index.d.mts
CHANGED
@@ -1,460 +1,1004 @@
-//#region src/
+//#region src/segmentation/fuzzy.d.ts
+/**
+ * Fuzzy matching utilities for Arabic text.
+ *
+ * Provides diacritic-insensitive and character-equivalence matching, so text
+ * matches regardless of:
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
+ *
+ * @module fuzzy
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
+ */
+/**
+ * Escapes a string for safe inclusion in a regular expression.
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
+ */
+declare const escapeRegex: (s: string) => string;
+/**
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
+ *
+ * Each character in the input is expanded to its equivalence class (if any)
+ * and followed by an optional diacritics matcher, so the result matches:
+ * - `حدثنا` with `حَدَّثَنَا` (full diacritics)
+ * - `الإيمان` with `الايمان` (alef variants)
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
+ *
+ * @param text - Input Arabic text to make diacritic-insensitive
+ * @returns Regex pattern string that matches the text with or without diacritics
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('باب');
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
+ *
+ * @example
+ * // Applied internally by split rules with `fuzzy: true`
+ * { lineStartsWith: ['باب'], split: 'at', fuzzy: true }
+ */
+declare const makeDiacriticInsensitive: (text: string) => string;
+//#endregion
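A minimal usage sketch for the fuzzy helpers above; the `flappa-doormal` import specifier is assumed from the package name.

    import { escapeRegex, makeDiacriticInsensitive } from 'flappa-doormal';

    // Diacritic-insensitive matcher for a chapter marker
    const babPattern = makeDiacriticInsensitive('باب');
    new RegExp(`^${babPattern}`, 'u').test('بَابُ الصَّلَاةِ'); // true

    // Embed literal user text safely inside a larger pattern
    const literal = new RegExp(`^${escapeRegex('(1.2)')} `, 'u');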
+//#region src/segmentation/types.d.ts
+/**
+ * Literal regex pattern rule - no token expansion is applied.
+ *
+ * Use this when you need full control over the regex pattern. If the regex
+ * contains capturing groups, the captured content is used as the segment content.
+ *
+ * @example
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
+ * @example
+ * // Capture group - content after the marker becomes segment content
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
+ */
+type RegexPattern = {
+  /** Raw regex pattern string (no token expansion) */
+  regex: string;
+};
+/**
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
+ *
+ * @example
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
+ * @example
+ * // Named capture to extract the hadith number into metadata
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
+ *
+ * @see TOKEN_PATTERNS for available tokens
+ */
+type TemplatePattern = {
+  /** Template string with `{{token}}` or `{{token:name}}` placeholders */
+  template: string;
+};
+/**
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
+ *
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker is
+ * **included** in the segment content. Token expansion is applied to each
+ * pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
+ * @example
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ */
+type LineStartsWithPattern = {
+  /** Array of patterns that mark line beginnings (marker included in content) */
+  lineStartsWith: string[];
+};
+/**
+ * Line-start-after pattern rule - matches lines starting with patterns,
+ * but **excludes** the marker from the segment content.
+ *
+ * Behaves like `lineStartsWith` but strips the marker from the output. The
+ * segment content starts after the marker and extends to the next split point
+ * (not just the end of the matching line). Token expansion is applied to each
+ * pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
+ * @example
+ * // Extract the hadith number to metadata while stripping the prefix
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
+ */
+type LineStartsAfterPattern = {
+  /** Array of patterns that mark line beginnings (marker excluded from content) */
+  lineStartsAfter: string[];
+};
+/**
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
+ *
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`. Token expansion is applied
+ * to each pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
+ */
+type LineEndsWithPattern = {
+  /** Array of patterns that mark line endings */
+  lineEndsWith: string[];
+};
+/**
+ * Union of all pattern types for split rules. Each rule must have exactly ONE
+ * pattern type: `regex`, `template`, `lineStartsWith`, `lineStartsAfter`, or
+ * `lineEndsWith`.
+ */
+type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
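A hedged sketch showing one rule of each pattern shape in the union above; token names such as `{{raqms}}` and `{{dash}}` are taken from the documented examples, and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    const rules: SplitRule[] = [
        { regex: '^[٠-٩]+ - (.*)', split: 'at' }, // literal regex; the capture becomes segment content
        { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }, // token template with named capture
        { lineStartsWith: ['## ', '### '], split: 'at' }, // marker kept in content
        { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }, // marker stripped from content
        { lineEndsWith: ['۔', '؟', '!'], split: 'after' }, // split after matching line endings
    ];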
-/** The type of marker to look for */
-type: MarkerType;
-/** For numbered markers, the digit style */
-numbering?: NumberingStyle;
-/** The separator that follows the marker */
-separator?: SeparatorStyle | string;
+/**
+ * Configuration for how and where to split content when a pattern matches.
+ *
+ * Controls the split position relative to matches, which occurrences to
+ * split on, page span limits, and fuzzy matching for Arabic text.
+ */
+type SplitBehavior = {
+  /**
+   * Where to split relative to the match.
+   * - `'at'`: New segment starts at the match position
+   * - `'after'`: New segment starts after the match ends
+   */
+  split: 'at' | 'after';
+  /**
+   * Which occurrence(s) to split on: `'all'` (every match, the default),
+   * `'first'`, or `'last'`.
+   *
+   * When `maxSpan` is set, occurrence filtering is applied per sliding window
+   * rather than globally. With `'last'`, the algorithm prefers longer segments
+   * by looking as far ahead as allowed before selecting the last match in the window.
+   *
+   * @default 'all'
+   */
+  occurrence?: 'first' | 'last' | 'all';
+  /**
+   * Maximum page ID difference allowed when looking ahead for split points.
+   *
+   * Uses a sliding window that prefers longer segments:
+   * 1. Start from the first page of the current segment
+   * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
+   * 3. Apply the occurrence filter (e.g., 'last') to select a match
+   * 4. The next window starts from the page after the match
+   *
+   * `maxSpan: 1` looks 1 page ahead (segments span at most 2 pages);
+   * `undefined` means no limit (the entire content is treated as one group).
+   * With non-consecutive page IDs, the actual ID difference is used, not the
+   * array index: pages 1 and 5 have a difference of 4.
+   *
+   * @example
+   * // Split at the last period, looking up to 1 page ahead
+   * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
+   */
+  maxSpan?: number;
+  /**
+   * Enable diacritic-insensitive matching for Arabic text.
+   *
+   * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
+   * `lineStartsAfter` match regardless of diacritics (harakat/tashkeel) and
+   * character equivalences (ا/آ/أ/إ, ة/ه, ى/ي).
+   *
+   * **Note**: Does NOT apply to `regex` or `template` patterns. For templates,
+   * apply fuzzy matching manually using `makeDiacriticInsensitive()`.
+   *
+   * @default false
+   */
+  fuzzy?: boolean;
+};
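A short sketch combining the behavior fields above (occurrence, maxSpan, fuzzy); the values mirror the documented examples, and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    // Prefer longer segments: take the last period within a one-page look-ahead window
    const bySentence: SplitRule = { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 };

    // Diacritic-insensitive chapter marker
    const byBab: SplitRule = { lineStartsWith: ['باب'], split: 'at', fuzzy: true };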
+/**
+ * A single page ID or a range of page IDs.
+ * - `number`: A single page ID
+ * - `[number, number]`: A range from first to second (inclusive)
+ *
+ * @example
+ * 5         // Single page 5
+ * [10, 20]  // Pages 10 through 20 (inclusive)
+ */
+type PageRange = number | [number, number];
+/**
+ * Optional constraints and metadata for a split rule.
+ *
+ * Use constraints to limit which pages a rule applies to, and metadata to
+ * attach arbitrary data to resulting segments.
+ */
+type RuleConstraints = {
+  /**
+   * Minimum page ID for this rule to apply. Matches on pages with `id < min` are ignored.
+   *
+   * @example
+   * // Only apply the rule starting from page 10
+   * { min: 10, lineStartsWith: ['##'], split: 'at' }
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this rule to apply. Matches on pages with `id > max` are ignored.
+   *
+   * @example
+   * // Only apply the rule up to page 100
+   * { max: 100, lineStartsWith: ['##'], split: 'at' }
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this rule, so the rule does
+   * not have to be repeated with different min/max values.
+   *
+   * @example
+   * { exclude: [1, 2, 5] }            // specific pages
+   * @example
+   * { exclude: [[1, 10], [50, 100]] } // page ranges
+   * @example
+   * { exclude: [1, [5, 10], 50] }     // mix of single pages and ranges
+   */
+  exclude?: PageRange[];
+  /**
+   * Arbitrary metadata attached to segments matching this rule.
+   *
+   * This metadata is merged with any named captures from the pattern. Named
+   * captures (e.g., `{{raqms:num}}`) take precedence over static metadata with
+   * the same key.
+   *
+   * @example
+   * { lineStartsWith: ['{{bab}}'], split: 'at', meta: { type: 'chapter' } }
+   */
+  meta?: Record<string, unknown>;
+  /**
+   * Fallback behavior when no matches are found within a maxSpan boundary.
+   * - `'page'`: Create split points at page boundaries
+   * - `undefined`: No fallback (the default behavior)
+   */
+  fallback?: 'page';
+};
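A sketch of a rule restricted by the constraints above; the page numbers are illustrative and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    // Apply the heading rule only on pages 10-100, skipping a front-matter range
    const constrained: SplitRule = {
        lineStartsWith: ['## '],
        split: 'at',
        min: 10,
        max: 100,
        exclude: [1, [2, 5]],
        meta: { type: 'section' },
    };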
-//#endregion
-//#region src/markers/defaults.d.ts
-declare const DEFAULT_NUMBERING: NumberingStyle;
-/** Default separator style for markers */
-declare const DEFAULT_SEPARATOR: SeparatorStyle;
-/** Default separator pattern (used when separator is a custom string) */
-declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";
-/** Numbering patterns mapped by style */
-declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;
-/** Separator patterns mapped by style */
-declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
-//#endregion
-//#region src/markers/generator.d.ts
-/**
- * Generates a regex pattern from a marker configuration.
- * Always returns a regex with three named capture groups:
- * - full: Complete match including marker
- * - marker: Just the marker part (for metadata/indexing)
- * - content: Clean content without marker (for LLM processing)
-/**
- * Default phrase lists for preset marker types.
- * Export these so users can extend them.
- */
-/**
- * Common hadith narrator phrases (diacritic-insensitive)
- * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
+/**
+ * A complete split rule combining pattern, behavior, and constraints.
+ *
+ * Each rule must specify:
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
+ *   `lineStartsAfter`, or `lineEndsWith`
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
+ * - **Constraints** (optional): `min`, `max`, `meta`
+ *
+ * @example
+ * // Basic rule: split at markdown headers
+ * const rule: SplitRule = {
+ *   lineStartsWith: ['## ', '### '],
+ *   split: 'at',
+ *   meta: { type: 'section' }
+ * };
+ *
+ * @example
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
+ * const rule: SplitRule = {
+ *   lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *   split: 'at',
+ *   fuzzy: true,
+ *   min: 5,
+ *   max: 500,
+ *   meta: { type: 'hadith' }
+ * };
+ */
+type SplitRule = PatternType & SplitBehavior & RuleConstraints;
+/**
+ * Input page structure for segmentation.
+ *
+ * Each page represents a logical unit of content (e.g., a book page, a
+ * document section) that can be tracked across segment boundaries.
+ *
+ * @example
+ * const pages: Page[] = [
+ *   { id: 1, content: '## Chapter 1\nFirst paragraph...' },
+ *   { id: 2, content: 'Continued text...\n## Chapter 2' },
+ * ];
+ */
-  readonly content: "(.*)";
-  readonly dash: "[-–—ـ]";
-  readonly dot: "\\.";
-  readonly latin: "\\d+";
-  readonly letter: "[أ-ي]";
-  readonly num: "[\\u0660-\\u0669]+";
-  readonly paren: "\\)";
-  readonly s: "\\s?";
-  readonly slash: "/";
-  readonly space: "\\s+";
+type Page = {
+  /**
+   * Unique page/entry ID used for:
+   * - `maxSpan` grouping (segments spanning multiple pages)
+   * - `min`/`max` constraint filtering
+   * - `from`/`to` tracking in output segments
+   */
+  id: number;
+  /**
+   * Raw page content (may contain HTML).
+   *
+   * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
+   * Use an HTML-to-Markdown converter or `stripHtmlTags()` to preprocess HTML.
+   */
+  content: string;
+};
-type TokenMap = Record<string, string>;
-//#endregion
-//#region src/markers/template-parser.d.ts
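A small sketch of the Page input shape; ids need not be consecutive, and the id difference (not the array index) is what maxSpan and min/max compare against. The import specifier is assumed from the package name.

    import type { Page } from 'flappa-doormal';

    const pages: Page[] = [
        { id: 1, content: '## Chapter 1\nFirst paragraph...' },
        { id: 2, content: 'Continued text...\n## Chapter 2' },
        { id: 5, content: 'A later page; ids 2 and 5 differ by 3 for maxSpan purposes' },
    ];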
+/**
+ * A breakpoint pattern with optional page constraints.
+ *
+ * Use this to control which pages a breakpoint pattern applies to. Patterns
+ * outside the specified range are skipped, allowing the next breakpoint
+ * pattern (or fallback) to be tried.
+ *
+ * @example
+ * // Only apply punctuation-based breaking from page 10 onwards
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
+ * @example
+ * // Apply to a specific page range (pages 10-50)
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
+ */
+type BreakpointRule = {
+  /**
+   * Regex pattern for breaking (supports token expansion).
+   * Empty string `''` means fall back to the page boundary.
+   */
+  pattern: string;
+  /**
+   * Minimum page ID for this breakpoint to apply.
+   * Segments starting before this page skip this pattern.
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this breakpoint to apply.
+   * Segments starting after this page skip this pattern.
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this breakpoint, so the
+   * breakpoint does not have to be repeated with different min/max values.
+   *
+   * @example
+   * { pattern: '\\.\\s*', exclude: [1, 2, 5] }        // specific pages
+   * @example
+   * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] } // front matter, pages 1-10
+   * @example
+   * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] } // mix of single pages and ranges
+   */
+  exclude?: PageRange[];
+  /**
+   * Skip this breakpoint if the segment content matches this pattern.
+   *
+   * Supports token expansion (e.g., `{{kitab}}`). When the segment's remaining
+   * content matches this regex, the breakpoint pattern is skipped and the next
+   * breakpoint in the array is tried. Useful for excluding title pages or
+   * front matter without specifying explicit page ranges.
+   *
+   * @example
+   * // Skip the punctuation breakpoint for short content (likely titles)
+   * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
+   * @example
+   * // Skip for content containing a "kitab" (book) marker
+   * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
+   */
+  skipWhen?: string;
+};
-  /** Custom token map to use instead of default TOKENS */
-  tokens?: TokenMap;
-}
+/**
+ * A breakpoint can be a simple string pattern or an object with constraints.
+ *
+ * String breakpoints apply to all pages. Object breakpoints can specify
+ * `min`/`max` to limit which pages they apply to.
+ *
+ * @example
+ * '{{tarqim}}\\s*'                       // string (applies everywhere)
+ * @example
+ * { pattern: '{{tarqim}}\\s*', min: 10 } // object with constraints (page 10+)
+ */
+type Breakpoint = string | BreakpointRule;
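A hedged sketch of a breakpoint list mixing string and object forms; `{{tarqim}}` and the skipWhen/exclude values come from the documented examples, and the import specifier is assumed from the package name.

    import type { Breakpoint } from 'flappa-doormal';

    // Tried in order for oversized segments; '' falls back to the page boundary
    const breakpoints: Breakpoint[] = [
        { pattern: '{{tarqim}}\\s*', min: 10, skipWhen: '^.{1,20}$' }, // punctuation, but not on short title-like content
        { pattern: '\\n', exclude: [[1, 10]] }, // line breaks, skipping front matter
        '', // page boundary as the last resort
    ];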
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
- * @param options - Optional configuration
- * @returns Regex pattern string with named groups
+/**
+ * Logger interface for custom logging implementations.
+ *
+ * All methods are optional - only implement the verbosity levels you need.
+ * When no logger is provided, no logging overhead is incurred.
+ *
+ * Compatible with the Logger interface from ffmpeg-simplified and similar libraries.
+ *
+ * @example
+ * // Simple console logger
+ * const logger: Logger = { debug: console.debug, info: console.info, warn: console.warn, error: console.error };
+ *
+ * @example
+ * // Production logger (only warnings and errors)
+ * const prodLogger: Logger = {
+ *   warn: (msg, ...args) => myLoggingService.warn(msg, args),
+ *   error: (msg, ...args) => myLoggingService.error(msg, args),
+ * };
+ */
+interface Logger {
+  /** Log a debug message (verbose debugging output) */
+  debug?: (message: string, ...args: unknown[]) => void;
+  /** Log an error message (critical failures) */
+  error?: (message: string, ...args: unknown[]) => void;
+  /** Log an informational message (key progress points) */
+  info?: (message: string, ...args: unknown[]) => void;
+  /** Log a trace message (extremely verbose, per-iteration details) */
+  trace?: (message: string, ...args: unknown[]) => void;
+  /** Log a warning message (potential issues) */
+  warn?: (message: string, ...args: unknown[]) => void;
+}
+/**
+ * Segmentation options controlling how pages are split.
+ *
+ * @example
+ * // Basic structural rules only
+ * const options: SegmentationOptions = {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
+ *     { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
+ *   ]
+ * };
+ *
+ * @example
+ * // With breakpoints for oversized segments
+ * const options: SegmentationOptions = {
+ *   rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
+ *   maxPages: 2,
+ *   breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
+ *   prefer: 'longer'
+ * };
+ */
+type SegmentationOptions = {
+  /**
+   * Rules applied in order to find split points.
+   *
+   * All rules are evaluated against the content, and their matches are
+   * combined to determine the final split points. The first matching rule's
+   * metadata is used for each segment.
+   */
+  rules?: SplitRule[];
+  /**
+   * Maximum pages per segment before breakpoints are applied.
+   *
+   * When a segment spans more pages than this limit, the `breakpoints`
+   * patterns are tried (in order) to find a suitable break point within the
+   * allowed window. Structural markers (from rules) always take precedence -
+   * segments are only broken within their rule-defined boundaries, never across them.
+   *
+   * @example
+   * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
+   */
+  maxPages?: number;
+  /**
+   * Patterns tried in order to break oversized segments.
+   *
+   * Each pattern is tried until one matches within the allowed page window.
+   * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
+   * matches the page boundary (always succeeds as the ultimate fallback).
+   * Patterns can be simple strings (apply everywhere) or objects with
+   * `min`/`max` constraints. Put preferred break styles first:
+   * - `{{tarqim}}\\s*` - break at sentence-ending punctuation
+   * - `\\n` - break at line breaks (useful for OCR content)
+   * - `''` - break at the page boundary (always works)
+   *
+   * Only applied to segments exceeding `maxPages`.
+   *
+   * @example
+   * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
+   * @example
+   * breakpoints: [
+   *   { pattern: '{{tarqim}}\\s*', min: 10 }, // only from page 10+
+   *   ''                                      // fallback for pages 1-9
+   * ]
+   */
+  breakpoints?: Breakpoint[];
+  /**
+   * When multiple matches exist for a breakpoint pattern, select:
+   * - `'longer'` - last match in the window (prefers longer segments)
+   * - `'shorter'` - first match in the window (prefers shorter segments)
+   *
+   * @default 'longer'
+   */
+  prefer?: 'longer' | 'shorter';
+  /**
+   * Optional logger for debugging segmentation.
+   *
+   * Provide a logger to receive detailed information about pattern matching,
+   * page tracking, and breakpoint processing. When not provided, no logging
+   * overhead is incurred (the methods are never called). Verbosity levels:
+   * `trace` (per-iteration details), `debug`, `info`, `warn`, `error`.
+   *
+   * @example
+   * logger: { debug: console.debug, info: console.info, warn: console.warn }
+   * @example
+   * logger: {
+   *   debug: (msg, ...args) => winston.debug(msg, { meta: args }),
+   *   error: (msg, ...args) => winston.error(msg, { meta: args }),
+   * }
+   */
+  logger?: Logger;
+};
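A combined options sketch assembled from the documented examples above; the import specifier is assumed from the package name.

    import type { SegmentationOptions } from 'flappa-doormal';

    const options: SegmentationOptions = {
        rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
        maxPages: 2,
        breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
        prefer: 'longer',
        logger: { warn: console.warn, error: console.error },
    };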
- * @returns Validation result with errors if invalid
+/**
+ * Output segment produced by `segmentPages()`.
+ *
+ * Each segment contains extracted content, page references, and optional
+ * metadata from the matched rule and captured groups.
+ *
+ * @example
+ * // Simple segment on a single page
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
+ *
+ * @example
+ * // Segment spanning pages 5-7 with a captured hadith number
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
+ */
+type Segment = {
+  /**
+   * Segment content with:
+   * - Leading/trailing whitespace trimmed
+   * - Page breaks converted to spaces (for multi-page segments)
+   * - Markers stripped (for `lineStartsAfter` patterns)
+   */
+  content: string;
+  /** Starting page ID (from `Page.id`). */
+  from: number;
+  /**
+   * Ending page ID if the segment spans multiple pages. When `undefined`,
+   * the segment is contained within a single page.
+   */
+  to?: number;
+  /**
+   * Combined metadata from:
+   * 1. The rule's `meta` property (static metadata)
+   * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
+   *
+   * Named captures override static metadata with the same key.
+   */
+  meta?: Record<string, unknown>;
+};
+//#endregion
-//#region src/
+//#region src/segmentation/segmenter.d.ts
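A tiny sketch of consuming Segment values; note that `to` is only set when a segment crosses a page boundary. The import specifier is assumed from the package name.

    import type { Segment } from 'flappa-doormal';

    const label = (s: Segment): string =>
        `${String(s.meta?.type ?? 'segment')} (pages ${s.from}${s.to !== undefined ? `-${s.to}` : ''})`;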
+/**
+ * Segments pages of content based on pattern-matching rules.
+ *
+ * This is the main entry point for the segmentation engine. It takes an array
+ * of pages and applies the provided rules to identify split points, producing
+ * an array of segments with content, page references, and metadata.
+ *
+ * @param pages - Array of pages with id and content
+ * @param options - Segmentation options including splitting rules
+ * @returns Array of segments with content, from/to page references, and optional metadata
+ *
+ * @example
+ * // Split markdown by headers
+ * const segments = segmentPages(pages, {
+ *   rules: [{ lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }]
+ * });
+ *
+ * @example
+ * // Split Arabic hadith text with number extraction
+ * const segments = segmentPages(pages, {
+ *   rules: [{
+ *     lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *     split: 'at',
+ *     fuzzy: true,
+ *     meta: { type: 'hadith' }
+ *   }]
+ * });
+ *
+ * @example
+ * // Multiple rules with page constraints
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
+ *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
+ *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
+ *   ]
+ * });
+ */
+declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
+//#endregion
+//#region src/segmentation/textUtils.d.ts
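An end-to-end sketch of the main entry point, assembled from the documented examples; the Arabic sample text and the import specifier are illustrative assumptions.

    import { segmentPages, type Page } from 'flappa-doormal';

    const pages: Page[] = [
        { id: 1, content: '١ - حدثنا فلان...\n٢ - حدثنا آخر...' },
        { id: 2, content: '٣ - حدثنا ثالث...' },
    ];

    const segments = segmentPages(pages, {
        rules: [{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at', fuzzy: true, meta: { type: 'hadith' } }],
    });
    // Each segment carries content, from/to page ids, and meta including the captured `num`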
+/**
+ * Strip all HTML tags from content, keeping only text.
+ *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+declare const stripHtmlTags: (html: string) => string;
+/**
+ * Normalizes line endings to Unix-style (`\n`).
+ *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
+ *
- * const match = regex.exec('باب الصلاة');
- * // match.groups.marker -> 'باب'
- * // match.groups.content -> ' الصلاة'
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
+ */
+declare const normalizeLineEndings: (content: string) => string;
+//#endregion
+//#region src/segmentation/tokens.d.ts
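A small preprocessing sketch using the text utilities above before segmenting; the raw HTML sample is illustrative and the import specifier is assumed from the package name.

    import { normalizeLineEndings, segmentPages, stripHtmlTags, type Page } from 'flappa-doormal';

    const raw = [{ id: 1, content: '<p>## Intro</p>\r\nBody text' }];
    const pages: Page[] = raw.map((p) => ({ id: p.id, content: normalizeLineEndings(stripHtmlTags(p.content)) }));
    const segments = segmentPages(pages, { rules: [{ lineStartsWith: ['## '], split: 'at' }] });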
+/**
+ * Token-based template system for Arabic text pattern matching.
+ *
+ * This module provides a human-readable way to define regex patterns using
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
+ * named capture groups for extracting matched values into metadata.
+ *
+ * @module tokens
+ *
+ * @example
+ * expandTokens('{{raqms}} {{dash}}')
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+ */
-declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
-declare function generateSquareBracketRegex(): RegExp;
- * Generates a regular expression for number-letter-separator markers.
+/**
+ * Token definitions mapping human-readable token names to regex patterns.
+ *
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
+ *
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
+ *
+ * @example
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ * @example
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+ * @example
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
+ */
+declare const TOKEN_PATTERNS: Record<string, string>;
+/**
+ * Checks if a query string contains template tokens.
+ *
+ * Performs a quick test for `{{token}}` patterns without actually expanding
+ * them. Useful for determining whether to apply token expansion to a string.
+ *
+ * @param query - String to check for tokens
+ * @returns `true` if the string contains at least one `{{token}}` pattern
+ *
+ * @example
+ * containsTokens('{{raqms}} {{dash}}') // → true
+ * containsTokens('plain text')         // → false
+ * containsTokens('[٠-٩]+ - ')          // → false (raw regex, no tokens)
+ */
+declare const containsTokens: (query: string) => boolean;
+/**
+ * Result from expanding tokens with capture information.
+ */
+type ExpandResult = {
+  /** The expanded regex pattern string with all tokens replaced. Named captures use `(?<name>pattern)` syntax. */
+  pattern: string;
+  /** Names of capture groups extracted from `{{token:name}}` syntax. Empty array if none. */
+  captureNames: string[];
+  /** Whether the pattern has any named capturing groups. Equivalent to `captureNames.length > 0`. */
+  hasCaptures: boolean;
+};
+/**
+ * Expands template tokens with support for named captures.
+ *
+ * This is the primary token expansion function and handles all token syntax:
+ * - `{{token}}` → the token's pattern (no capture group)
+ * - `{{token:name}}` → `(?<name>pattern)` (named capture)
+ * - `{{:name}}` → `(?<name>.+)` (capture anything)
+ *
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
+ *
+ * @param query - The template string containing tokens
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+ *   Applied to both token patterns and plain Arabic text between tokens.
+ *   Typically `makeDiacriticInsensitive` from the fuzzy module.
+ * @returns Object with the expanded pattern, capture names, and capture flag
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
+ *
+ * @example
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
+ */
+declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
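A sketch of manual token expansion with captures, mirroring the documented examples; the sample text and import specifier are assumptions.

    import { containsTokens, expandTokensWithCaptures } from 'flappa-doormal';

    const template = '{{raqms:num}} {{dash}} {{:content}}';
    if (containsTokens(template)) {
        const { pattern, captureNames } = expandTokensWithCaptures(template);
        const match = new RegExp(pattern, 'u').exec('٤٢ - حدثنا يحيى');
        // captureNames → ['num', 'content']; match?.groups?.num → '٤٢'
    }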
+/**
+ * Expands template tokens in a query string to their regex equivalents.
+ *
+ * This is the simple version without capture support. It returns only the
+ * expanded pattern string, not capture metadata. Unknown tokens are left
+ * as-is, allowing for partial templates.
+ *
+ * @param query - Template string containing `{{token}}` placeholders
+ * @returns Expanded regex pattern string
+ *
+ * @example
+ * expandTokens('، {{raqms}}')      // → '، [\\u0660-\\u0669]+'
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+ * expandTokens('{{unknown}}')      // → '{{unknown}}' (left as-is)
+ *
+ * @see expandTokensWithCaptures for full capture group support
+ */
+declare const expandTokens: (query: string) => string;
- * - ٥ - (single number, separator)
- * - 5 (٦) - (number with parenthetical number)
- * - Separator 'none' generates pattern without separator
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
- * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
+/**
+ * Converts a template string to a compiled RegExp.
+ *
+ * Expands all tokens and attempts to compile the result as a RegExp with the
+ * Unicode flag. Returns `null` if the resulting pattern is invalid.
+ *
+ * @remarks
+ * This function dynamically compiles regular expressions from template strings.
+ * If templates may come from untrusted sources, be aware of potential ReDoS
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+ * Consider validating pattern complexity or applying execution timeouts when
+ * running user-submitted patterns.
+ *
+ * @param template - Template string containing `{{token}}` placeholders
+ * @returns Compiled RegExp with the 'u' flag, or `null` if invalid
+ *
+ * @example
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
+ * templateToRegex('{{raqms}}+')  // → /[٠-٩]++/u if the engine accepts it, otherwise null
+ * templateToRegex('(((')         // → null (invalid regex)
+ */
+declare const templateToRegex: (template: string) => RegExp | null;
+/**
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
+ *
+ * Useful for documentation, validation, or building user interfaces that show
+ * available tokens.
+ *
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
+ *
+ * @example
+ * getAvailableTokens()
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
+ */
+declare const getAvailableTokens: () => string[];
+/**
+ * Gets the regex pattern for a specific token name.
+ *
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`, without any
+ * expansion or capture group wrapping.
+ *
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if the token doesn't exist
+ *
+ * @example
+ * getTokenPattern('raqms')   // → '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash')    // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
+ */
+declare const getTokenPattern: (tokenName: string) => string | undefined;
+//#endregion
+//#region src/pattern-detection.d.ts
+/**
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
+ *
+ * @module pattern-detection
+ */
+/**
+ * Result of detecting a token pattern in text
+ */
+type DetectedPattern = {
+  /** Token name from TOKEN_PATTERNS (e.g., 'raqms', 'dash') */
+  token: string;
+  /** The matched text */
+  match: string;
+  /** Start index in the original text */
+  index: number;
+  /** End index (exclusive) */
+  endIndex: number;
+};
+/**
+ * Analyzes text and returns all detected token patterns with their positions.
+ * Patterns are detected in priority order to avoid partial matches.
+ *
+ * @param text - The text to analyze for token patterns
+ * @returns Array of detected patterns sorted by position
+ *
+ * @example
+ * detectTokenPatterns("٣٤ - حدثنا")
+ * // Returns: [
+ * //   { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
+ * //   { token: 'dash', match: '-', index: 3, endIndex: 4 },
+ * //   { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
+ * // ]
+ */
+declare const detectTokenPatterns: (text: string) => DetectedPattern[];
- * Matches heading levels using hash symbols:
- * - # Heading 1
- * - ## Heading 2
- * - ### Heading 3
+/**
+ * Generates a template pattern from text using detected tokens.
+ * Replaces matched portions with {{token}} syntax.
+ *
+ * @param text - Original text
+ * @param detected - Array of detected patterns from detectTokenPatterns
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
+ *
+ * @example
+ * const detected = detectTokenPatterns("٣٤ - ");
+ * generateTemplateFromText("٣٤ - ", detected);
+ * // Returns: "{{raqms}} {{dash}} "
+ */
+declare const generateTemplateFromText: (text: string, detected: DetectedPattern[]) => string;
+/**
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
+ *
+ * @param detected - Array of detected patterns
+ * @returns Suggested pattern type and whether to use fuzzy matching
+ */
+declare const suggestPatternConfig: (detected: DetectedPattern[]) => {
+  patternType: "lineStartsWith" | "lineStartsAfter";
+  fuzzy: boolean;
+  metaType?: string;
+};
+/**
+ * Analyzes text and generates a complete suggested rule configuration.
+ *
+ * @param text - Highlighted text from the page
+ * @returns Suggested rule configuration or null if no patterns detected
+ */
+declare const analyzeTextForRule: (text: string) => {
+  template: string;
+  patternType: "lineStartsWith" | "lineStartsAfter";
+  fuzzy: boolean;
+  metaType?: string;
+  detected: DetectedPattern[];
+} | null;
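A sketch of the auto-detection flow; the exact suggested template depends on the detection internals, so the commented output is indicative only, and the import specifier is assumed from the package name.

    import { analyzeTextForRule, detectTokenPatterns, generateTemplateFromText } from 'flappa-doormal';

    const text = '٣٤ - حدثنا';
    const detected = detectTokenPatterns(text);
    const template = generateTemplateFromText(text, detected); // e.g. '{{raqms}} {{dash}} {{naql}}'
    const suggestion = analyzeTextForRule(text); // { template, patternType, fuzzy, metaType?, detected } or null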
 //#endregion
-export {
+export { type Breakpoint, type BreakpointRule, type DetectedPattern, type ExpandResult, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.d.mts.map