flappa-doormal 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +325 -0
- package/README.md +477 -199
- package/dist/index.d.mts +871 -327
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1611 -393
- package/dist/index.mjs.map +1 -1
- package/package.json +13 -10
package/README.md
CHANGED
|
@@ -12,346 +12,624 @@
|
|
|
12
12
|
[](https://codecov.io/gh/ragaeeb/flappa-doormal)
|
|
13
13
|
[](https://badge.fury.io/js/flappa-doormal)
|
|
14
14
|
|
|
15
|
-
**Arabic text
|
|
15
|
+
**Declarative Arabic text segmentation library** - Split pages of content into logical segments using human-readable patterns.
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
## Why This Library?
|
|
18
|
+
|
|
19
|
+
### The Problem
|
|
20
|
+
|
|
21
|
+
Working with Arabic hadith and Islamic text collections requires splitting continuous text into segments (individual hadiths, chapters, verses). This traditionally means:
|
|
22
|
+
|
|
23
|
+
- Writing complex Unicode regex patterns: `^[\u0660-\u0669]+\s*[-–—ـ]\s*`
|
|
24
|
+
- Handling diacritic variations: `حَدَّثَنَا` vs `حدثنا`
|
|
25
|
+
- Managing multi-page spans and page boundary tracking
|
|
26
|
+
- Manually extracting hadith numbers, volume/page references
|
|
27
|
+
|
|
28
|
+
### What Exists
|
|
29
|
+
|
|
30
|
+
- **General regex libraries**: Don't understand Arabic text nuances
|
|
31
|
+
- **NLP tokenizers**: Overkill for pattern-based segmentation
|
|
32
|
+
- **Manual regex**: Error-prone, hard to maintain, no metadata extraction
|
|
33
|
+
|
|
34
|
+
### The Solution
|
|
35
|
+
|
|
36
|
+
**flappa-doormal** provides:
|
|
37
|
+
|
|
38
|
+
✅ **Readable templates**: `{{raqms}} {{dash}}` instead of cryptic regex
|
|
39
|
+
✅ **Named captures**: `{{raqms:hadithNum}}` auto-extracts to `meta.hadithNum`
|
|
40
|
+
✅ **Fuzzy matching**: Ignore diacritics with `fuzzy: true`
|
|
41
|
+
✅ **Page tracking**: Know which page each segment came from
|
|
42
|
+
✅ **Declarative rules**: Describe *what* to match, not *how*
|
|
18
43
|
|
|
19
44
|
## Installation
|
|
20
45
|
|
|
21
46
|
```bash
|
|
47
|
+
npm install flappa-doormal
|
|
48
|
+
# or
|
|
22
49
|
bun add flappa-doormal
|
|
23
|
-
#
|
|
24
|
-
|
|
50
|
+
# or
|
|
51
|
+
yarn add flappa-doormal
|
|
25
52
|
```
|
|
26
53
|
|
|
27
54
|
## Quick Start
|
|
28
55
|
|
|
29
56
|
```typescript
|
|
30
|
-
import {
|
|
31
|
-
|
|
32
|
-
//
|
|
33
|
-
const
|
|
34
|
-
|
|
57
|
+
import { segmentPages } from 'flappa-doormal';
|
|
58
|
+
|
|
59
|
+
// Your pages from a hadith book
|
|
60
|
+
const pages = [
|
|
61
|
+
{ id: 1, content: '٦٦٩٦ - حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ...' },
|
|
62
|
+
{ id: 1, content: '٦٦٩٧ - أَخْبَرَنَا عُمَرُ قَالَ...' },
|
|
63
|
+
{ id: 2, content: '٦٦٩٨ - حَدَّثَنِي مُحَمَّدٌ...' },
|
|
64
|
+
];
|
|
65
|
+
|
|
66
|
+
const segments = segmentPages(pages, {
|
|
67
|
+
rules: [{
|
|
68
|
+
lineStartsAfter: ['{{raqms:num}} {{dash}} '],
|
|
69
|
+
split: 'at',
|
|
70
|
+
}]
|
|
35
71
|
});
|
|
36
72
|
|
|
37
|
-
|
|
38
|
-
//
|
|
73
|
+
// Result:
|
|
74
|
+
// [
|
|
75
|
+
// { content: 'حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ...', from: 1, meta: { num: '٦٦٩٦' } },
|
|
76
|
+
// { content: 'أَخْبَرَنَا عُمَرُ قَالَ...', from: 1, meta: { num: '٦٦٩٧' } },
|
|
77
|
+
// { content: 'حَدَّثَنِي مُحَمَّدٌ...', from: 2, meta: { num: '٦٦٩٨' } }
|
|
78
|
+
// ]
|
|
39
79
|
```
|
|
40
80
|
|
|
41
81
|
## Features
|
|
42
82
|
|
|
43
|
-
|
|
44
|
-
✅ **Template System** - Use `{num}`, `{dash}`, `{bullet}` instead of regex
|
|
45
|
-
✅ **Type-Safe** - Full TypeScript support
|
|
46
|
-
✅ **Composable** - Mix and match tokens with quantifiers
|
|
47
|
-
✅ **Diacritic-Insensitive** - Handles Arabic text variations
|
|
83
|
+
### 1. Template Tokens
|
|
48
84
|
|
|
49
|
-
|
|
85
|
+
Replace regex with readable tokens:
|
|
50
86
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
{
|
|
54
|
-
{
|
|
55
|
-
{
|
|
56
|
-
{
|
|
57
|
-
{
|
|
58
|
-
{
|
|
59
|
-
|
|
87
|
+
| Token | Matches | Regex Equivalent |
|
|
88
|
+
|-------|---------|------------------|
|
|
89
|
+
| `{{raqms}}` | Arabic-Indic digits | `[\u0660-\u0669]+` |
|
|
90
|
+
| `{{raqm}}` | Single Arabic digit | `[\u0660-\u0669]` |
|
|
91
|
+
| `{{dash}}` | Dash variants | `[-–—ـ]` |
|
|
92
|
+
| `{{harf}}` | Arabic letter | `[أ-ي]` |
|
|
93
|
+
| `{{numbered}}` | Hadith numbering `٢٢ - ` | `{{raqms}} {{dash}} ` |
|
|
94
|
+
| `{{fasl}}` | Section markers | `فصل\|مسألة` |
|
|
95
|
+
| `{{tarqim}}` | Punctuation marks | `[.!?؟؛]` |
|
|
96
|
+
| `{{bullet}}` | Bullet points | `[•*°]` |
|
|
97
|
+
| `{{naql}}` | Narrator phrases | `حدثنا\|أخبرنا\|...` |
|
|
98
|
+
| `{{kitab}}` | "كتاب" (book) | `كتاب` |
|
|
99
|
+
| `{{bab}}` | "باب" (chapter) | `باب` |
|
|
100
|
+
| `{{basmalah}}` | "بسم الله" | `بسم الله` |
|
|
101
|
+
|
|
102
|
+
### 2. Named Capture Groups
|
|
103
|
+
|
|
104
|
+
Extract metadata automatically with the `{{token:name}}` syntax:
|
|
60
105
|
|
|
61
|
-
### Numbered Variants
|
|
62
106
|
```typescript
|
|
63
|
-
|
|
64
|
-
{
|
|
65
|
-
|
|
107
|
+
// Capture hadith number
|
|
108
|
+
{ template: '^{{raqms:hadithNum}} {{dash}} ' }
|
|
109
|
+
// Result: meta.hadithNum = '٦٦٩٦'
|
|
110
|
+
|
|
111
|
+
// Capture volume and page
|
|
112
|
+
{ template: '^{{raqms:vol}}/{{raqms:page}} {{dash}} ' }
|
|
113
|
+
// Result: meta.vol = '٣', meta.page = '٤٥٦'
|
|
114
|
+
|
|
115
|
+
// Capture rest of content
|
|
116
|
+
{ template: '^{{raqms:num}} {{dash}} {{:text}}' }
|
|
117
|
+
// Result: meta.num = '٦٦٩٦', meta.text = 'حَدَّثَنَا أَبُو بَكْرٍ'
|
|
66
118
|
```
|
|
67
119
|
|
|
68
|
-
###
|
|
120
|
+
### 3. Fuzzy Matching (Diacritic-Insensitive)
|
|
121
|
+
|
|
122
|
+
Match Arabic text regardless of harakat:
|
|
69
123
|
|
|
70
|
-
**Using templates (recommended):**
|
|
71
124
|
```typescript
|
|
72
|
-
{
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
125
|
+
const rules = [{
|
|
126
|
+
fuzzy: true,
|
|
127
|
+
lineStartsAfter: ['{{kitab:book}} '],
|
|
128
|
+
split: 'at',
|
|
129
|
+
}];
|
|
130
|
+
|
|
131
|
+
// Matches both:
|
|
132
|
+
// - 'كِتَابُ الصلاة' (with diacritics)
|
|
133
|
+
// - 'كتاب الصيام' (without diacritics)
|
|
76
134
|
```
|
|
77
135
|
|
|
78
|
-
|
|
136
|
+
### 4. Pattern Types
|
|
137
|
+
|
|
138
|
+
| Type | Marker in content? | Use case |
|
|
139
|
+
|------|-------------------|----------|
|
|
140
|
+
| `lineStartsWith` | ✅ Included | Keep marker, segment at boundary |
|
|
141
|
+
| `lineStartsAfter` | ❌ Excluded | Strip marker, capture only content |
|
|
142
|
+
| `lineEndsWith` | ✅ Included | Match patterns at end of line |
|
|
143
|
+
| `template` | Depends | Custom pattern with full control |
|
|
144
|
+
| `regex` | Depends | Raw regex for complex cases |
|
|
145
|
+
|
|
146
|
+
### 5. Page Constraints
|
|
147
|
+
|
|
148
|
+
Limit rules to specific page ranges:
|
|
149
|
+
|
|
79
150
|
```typescript
|
|
80
151
|
{
|
|
81
|
-
|
|
82
|
-
|
|
152
|
+
lineStartsWith: ['## '],
|
|
153
|
+
split: 'at',
|
|
154
|
+
min: 10, // Only pages 10+
|
|
155
|
+
max: 100, // Only pages up to 100
|
|
83
156
|
}
|
|
84
157
|
```
|
|
85
158
|
|
|
86
|
-
|
|
159
|
+
### 6. Occurrence Filtering
|
|
160
|
+
|
|
161
|
+
Control which matches to use:
|
|
162
|
+
|
|
87
163
|
```typescript
|
|
88
164
|
{
|
|
89
|
-
|
|
90
|
-
|
|
165
|
+
lineEndsWith: ['\\.'],
|
|
166
|
+
split: 'after',
|
|
167
|
+
occurrence: 'last', // Only split at LAST period on page
|
|
168
|
+
maxSpan: 1, // Apply per-page
|
|
91
169
|
}
|
|
92
170
|
```
|
|
93
171
|
|
|
94
|
-
##
|
|
172
|
+
## Use Cases
|
|
95
173
|
|
|
96
|
-
###
|
|
97
|
-
|
|
174
|
+
### Simple Hadith Segmentation
|
|
175
|
+
|
|
176
|
+
Use `{{numbered}}` for the common "number - content" format:
|
|
98
177
|
|
|
99
178
|
```typescript
|
|
100
|
-
{
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
179
|
+
const segments = segmentPages(pages, {
|
|
180
|
+
rules: [{
|
|
181
|
+
lineStartsAfter: ['{{numbered}}'],
|
|
182
|
+
split: 'at',
|
|
183
|
+
meta: { type: 'hadith' }
|
|
184
|
+
}]
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
// Matches: ٢٢ - حدثنا, ٦٦٩٦ – أخبرنا, etc.
|
|
188
|
+
// Content starts AFTER the number and dash
|
|
104
189
|
```
|
|
105
190
|
|
|
106
|
-
### Number
|
|
107
|
-
|
|
191
|
+
### Hadith Segmentation with Number Extraction
|
|
192
|
+
|
|
193
|
+
For capturing the hadith number, use explicit capture syntax:
|
|
108
194
|
|
|
109
195
|
```typescript
|
|
110
|
-
{
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
196
|
+
const segments = segmentPages(pages, {
|
|
197
|
+
rules: [{
|
|
198
|
+
lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
|
|
199
|
+
split: 'at',
|
|
200
|
+
meta: { type: 'hadith' }
|
|
201
|
+
}]
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
// Each segment has:
|
|
205
|
+
// - content: The hadith text (without number prefix)
|
|
206
|
+
// - from/to: Page range
|
|
207
|
+
// - meta: { type: 'hadith', hadithNum: '٦٦٩٦' }
|
|
114
208
|
```
|
|
115
209
|
|
|
116
|
-
###
|
|
117
|
-
Match patterns like: `١٠٢٦٦ / ١ - "وَإِذَا`
|
|
210
|
+
### Volume/Page Reference Extraction
|
|
118
211
|
|
|
119
212
|
```typescript
|
|
120
|
-
{
|
|
121
|
-
|
|
122
|
-
}
|
|
213
|
+
const segments = segmentPages(pages, {
|
|
214
|
+
rules: [{
|
|
215
|
+
lineStartsAfter: ['{{raqms:vol}}/{{raqms:page}} {{dash}} '],
|
|
216
|
+
split: 'at'
|
|
217
|
+
}]
|
|
218
|
+
});
|
|
219
|
+
|
|
220
|
+
// meta: { vol: '٣', page: '٤٥٦' }
|
|
123
221
|
```
|
|
124
222
|
|
|
125
|
-
###
|
|
126
|
-
Match patterns like: `. . . . . . . . . .`
|
|
223
|
+
### Chapter Detection with Fuzzy Matching
|
|
127
224
|
|
|
128
225
|
```typescript
|
|
129
|
-
{
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
}
|
|
226
|
+
const segments = segmentPages(pages, {
|
|
227
|
+
rules: [{
|
|
228
|
+
fuzzy: true,
|
|
229
|
+
lineStartsAfter: ['{{kitab:book}} '],
|
|
230
|
+
split: 'at',
|
|
231
|
+
meta: { type: 'chapter' }
|
|
232
|
+
}]
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
// Matches "كِتَابُ" or "كتاب" regardless of diacritics
|
|
133
236
|
```
|
|
134
237
|
|
|
135
|
-
###
|
|
136
|
-
Match patterns like: `*. . . / ٨٦ - حَدَّثَنَا`
|
|
238
|
+
### Naql (Transmission) Phrase Detection
|
|
137
239
|
|
|
138
|
-
**Option 1: Capture from asterisk**
|
|
139
240
|
```typescript
|
|
140
|
-
{
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
241
|
+
const segments = segmentPages(pages, {
|
|
242
|
+
rules: [{
|
|
243
|
+
fuzzy: true,
|
|
244
|
+
lineStartsWith: ['{{naql:phrase}}'],
|
|
245
|
+
split: 'at'
|
|
246
|
+
}]
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
// meta.phrase captures which narrator phrase was matched:
|
|
250
|
+
// 'حدثنا', 'أخبرنا', 'حدثني', etc.
|
|
145
251
|
```
|
|
146
252
|
|
|
147
|
-
|
|
253
|
+
### Mixed Captured and Non-Captured Tokens
|
|
254
|
+
|
|
148
255
|
```typescript
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
}
|
|
256
|
+
// Only capture the number, not the letter
|
|
257
|
+
const segments = segmentPages(pages, {
|
|
258
|
+
rules: [{
|
|
259
|
+
lineStartsWith: ['{{raqms:num}} {{harf}} {{dash}} '],
|
|
260
|
+
split: 'at'
|
|
261
|
+
}]
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
// Input: '٥ أ - البند الأول'
|
|
265
|
+
// meta: { num: '٥' } // harf not captured (no :name suffix)
|
|
153
266
|
```
|
|
154
267
|
|
|
155
|
-
|
|
268
|
+
### Sentence-Based Splitting (Last Period Per Page)
|
|
156
269
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
| `{space}` | Required space | `\\s+` |
|
|
270
|
+
```typescript
|
|
271
|
+
const segments = segmentPages(pages, {
|
|
272
|
+
rules: [{
|
|
273
|
+
lineEndsWith: ['\\.'],
|
|
274
|
+
split: 'after',
|
|
275
|
+
occurrence: 'last',
|
|
276
|
+
maxSpan: 1
|
|
277
|
+
}]
|
|
278
|
+
});
|
|
279
|
+
```
|
|
168
280
|
|
|
169
|
-
|
|
281
|
+
### Page Fallback for Unmatched Content
|
|
170
282
|
|
|
171
|
-
|
|
283
|
+
When using `maxSpan` to group matches per page, use `fallback: 'page'` to prevent unmatched pages from merging with adjacent segments:
|
|
172
284
|
|
|
173
|
-
### Before (Regex)
|
|
174
285
|
```typescript
|
|
175
|
-
const
|
|
286
|
+
const segments = segmentPages(pages, {
|
|
287
|
+
rules: [{
|
|
288
|
+
template: '{{tarqim}}', // Match punctuation marks
|
|
289
|
+
split: 'after',
|
|
290
|
+
occurrence: 'last',
|
|
291
|
+
maxSpan: 1,
|
|
292
|
+
fallback: 'page' // If no punctuation found, segment the page anyway
|
|
293
|
+
}]
|
|
294
|
+
});
|
|
176
295
|
```
|
|
177
296
|
|
|
178
|
-
|
|
297
|
+
- **Without `fallback`**: Pages without matches merge into the next segment.
|
|
298
|
+
- **With `fallback: 'page'`**: Each page becomes its own segment even without matches.
|
|
299
|
+
|
|
300
|
+
> **Future extensions**: The `fallback` option may later support additional values such as `'skip'` (omit unmatched content) or `'line'` (split at line breaks).
|
|
301
|
+
|
|
302
|
+
### Multiple Rules with Priority
|
|
303
|
+
|
|
179
304
|
```typescript
|
|
180
|
-
{
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
}
|
|
305
|
+
const segments = segmentPages(pages, {
|
|
306
|
+
rules: [
|
|
307
|
+
// First: Chapter headers (highest priority)
|
|
308
|
+
{ fuzzy: true, lineStartsAfter: ['{{kitab:book}} '], split: 'at', meta: { type: 'chapter' } },
|
|
309
|
+
// Second: Sub-chapters
|
|
310
|
+
{ fuzzy: true, lineStartsAfter: ['{{bab:section}} '], split: 'at', meta: { type: 'section' } },
|
|
311
|
+
// Third: Individual hadiths
|
|
312
|
+
{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at', meta: { type: 'hadith' } },
|
|
313
|
+
]
|
|
314
|
+
});
|
|
184
315
|
```
|
|
185
316
|
|
|
186
|
-
|
|
317
|
+
## API Reference
|
|
187
318
|
|
|
188
|
-
|
|
319
|
+
### `segmentPages(pages, options)`
|
|
189
320
|
|
|
190
|
-
|
|
321
|
+
Main segmentation function.
|
|
191
322
|
|
|
192
323
|
```typescript
|
|
193
|
-
import {
|
|
324
|
+
import { segmentPages, type Page, type SegmentationOptions, type Segment } from 'flappa-doormal';
|
|
325
|
+
|
|
326
|
+
const pages: Page[] = [
|
|
327
|
+
{ id: 1, content: 'First page content...' },
|
|
328
|
+
{ id: 2, content: 'Second page content...' },
|
|
329
|
+
];
|
|
194
330
|
|
|
195
|
-
const
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
removeMarker: true, // Remove marker from capture (default: true)
|
|
331
|
+
const options: SegmentationOptions = {
|
|
332
|
+
rules: [
|
|
333
|
+
{ lineStartsWith: ['## '], split: 'at' }
|
|
334
|
+
]
|
|
200
335
|
};
|
|
201
336
|
|
|
202
|
-
const
|
|
337
|
+
const segments: Segment[] = segmentPages(pages, options);
|
|
203
338
|
```
|
|
204
339
|
|
|
205
|
-
### `
|
|
340
|
+
### `stripHtmlTags(html)`
|
|
341
|
+
|
|
342
|
+
Remove all HTML tags from content, keeping only text.
|
|
206
343
|
|
|
207
344
|
```typescript
|
|
208
|
-
import {
|
|
345
|
+
import { stripHtmlTags } from 'flappa-doormal';
|
|
346
|
+
|
|
347
|
+
const text = stripHtmlTags('<p>Hello <b>World</b></p>');
|
|
348
|
+
// Returns: 'Hello World'
|
|
349
|
+
```
|
|
209
350
|
|
|
210
|
-
|
|
211
|
-
// Returns: '^[\\u0660-\\u0669]+ [-–—ـ](.*)'
|
|
351
|
+
For more sophisticated HTML-to-Markdown conversion (such as turning `<span data-type="title">` into `## ` headers), implement your own function. For example:
|
|
212
352
|
|
|
213
|
-
|
|
214
|
-
|
|
353
|
+
```typescript
|
|
354
|
+
const htmlToMarkdown = (html: string): string => {
|
|
355
|
+
return html
|
|
356
|
+
// Convert title spans to markdown headers
|
|
357
|
+
.replace(/<span[^>]*data-type=["']title["'][^>]*>(.*?)<\/span>/gi, '## $1')
|
|
358
|
+
// Strip narrator links but keep text
|
|
359
|
+
.replace(/<a[^>]*href=["']inr:\/\/[^"']*["'][^>]*>(.*?)<\/a>/gi, '$1')
|
|
360
|
+
// Strip all remaining HTML tags
|
|
361
|
+
.replace(/<[^>]*>/g, '');
|
|
362
|
+
};
|
|
215
363
|
```
|
|
216
364
|
|
|
217
|
-
### `
|
|
365
|
+
### `expandTokens(template)`
|
|
366
|
+
|
|
367
|
+
Expand template tokens to regex pattern.
|
|
218
368
|
|
|
219
369
|
```typescript
|
|
220
|
-
import {
|
|
370
|
+
import { expandTokens } from 'flappa-doormal';
|
|
221
371
|
|
|
222
|
-
const
|
|
223
|
-
// Returns:
|
|
372
|
+
const pattern = expandTokens('{{raqms}} {{dash}}');
|
|
373
|
+
// Returns: '[\u0660-\u0669]+ [-–—ـ]'
|
|
224
374
|
```
|
|
225
375
|
|
|
226
|
-
|
|
376
|
+
### `makeDiacriticInsensitive(text)`
|
|
377
|
+
|
|
378
|
+
Make Arabic text diacritic-insensitive for fuzzy matching.
|
|
227
379
|
|
|
228
380
|
```typescript
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
format?: string; // Template for numbered markers
|
|
234
|
-
template?: string; // Template for pattern markers
|
|
235
|
-
pattern?: string; // Raw regex (when templates aren't enough)
|
|
236
|
-
tokens?: Record<string, string>; // Custom token definitions
|
|
237
|
-
phrases?: string[]; // For 'phrase' and 'hadith-chain' types
|
|
238
|
-
removeMarker?: boolean; // Default: true for numbered/bullet
|
|
239
|
-
};
|
|
381
|
+
import { makeDiacriticInsensitive } from 'flappa-doormal';
|
|
382
|
+
|
|
383
|
+
const pattern = makeDiacriticInsensitive('حدثنا');
|
|
384
|
+
// Returns regex pattern matching 'حَدَّثَنَا', 'حدثنا', etc.
|
|
240
385
|
```
|
|
241
386
|
|
|
242
|
-
|
|
387
|
+
### `TOKEN_PATTERNS`
|
|
243
388
|
|
|
244
|
-
|
|
389
|
+
Access available token definitions.
|
|
245
390
|
|
|
246
391
|
```typescript
|
|
247
|
-
import {
|
|
392
|
+
import { TOKEN_PATTERNS } from 'flappa-doormal';
|
|
393
|
+
|
|
394
|
+
console.log(TOKEN_PATTERNS.naql);
|
|
395
|
+
// 'حدثنا|أخبرنا|حدثني|وحدثنا|أنبأنا|سمعت'
|
|
396
|
+
```
|
|
248
397
|
|
|
249
|
-
|
|
250
|
-
const myPhrases = [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي', 'سَمِعْتُ'];
|
|
398
|
+
### Pattern Detection
|
|
251
399
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
}
|
|
400
|
+
Auto-detect tokens in Arabic text for building rules:
|
|
401
|
+
|
|
402
|
+
```typescript
|
|
403
|
+
import { detectTokenPatterns, analyzeTextForRule } from 'flappa-doormal';
|
|
404
|
+
|
|
405
|
+
// Detect individual tokens
|
|
406
|
+
const tokens = detectTokenPatterns('٣٤ - حدثنا');
|
|
407
|
+
// [
|
|
408
|
+
// { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
|
|
409
|
+
// { token: 'dash', match: '-', index: 3, endIndex: 4 },
|
|
410
|
+
// { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
|
|
411
|
+
// ]
|
|
412
|
+
|
|
413
|
+
// Get complete rule suggestion
|
|
414
|
+
const rule = analyzeTextForRule('٣٤ - ');
|
|
415
|
+
// {
|
|
416
|
+
// template: '{{raqms}} {{dash}} ',
|
|
417
|
+
// patternType: 'lineStartsAfter',
|
|
418
|
+
// fuzzy: false,
|
|
419
|
+
// metaType: 'hadith',
|
|
420
|
+
// detected: [...]
|
|
421
|
+
// }
|
|
256
422
|
```
|
|
257
423
|
|
|
258
|
-
|
|
424
|
+
## Types
|
|
425
|
+
|
|
426
|
+
### `SplitRule`
|
|
259
427
|
|
|
260
428
|
```typescript
|
|
261
|
-
|
|
429
|
+
type SplitRule = {
|
|
430
|
+
// Pattern (choose one)
|
|
431
|
+
lineStartsWith?: string[];
|
|
432
|
+
lineStartsAfter?: string[];
|
|
433
|
+
lineEndsWith?: string[];
|
|
434
|
+
template?: string;
|
|
435
|
+
regex?: string;
|
|
436
|
+
|
|
437
|
+
// Split behavior
|
|
438
|
+
split: 'at' | 'after';
|
|
439
|
+
occurrence?: 'first' | 'last' | 'all';
|
|
440
|
+
maxSpan?: number;
|
|
441
|
+
fuzzy?: boolean;
|
|
442
|
+
fallback?: 'page'; // NEW: Page-boundary fallback
|
|
443
|
+
|
|
444
|
+
// Constraints
|
|
445
|
+
min?: number;
|
|
446
|
+
max?: number;
|
|
447
|
+
meta?: Record<string, unknown>;
|
|
448
|
+
};
|
|
449
|
+
```
|
|
262
450
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
451
|
+
### `Segment`
|
|
452
|
+
|
|
453
|
+
```typescript
|
|
454
|
+
type Segment = {
|
|
455
|
+
content: string;
|
|
456
|
+
from: number;
|
|
457
|
+
to?: number;
|
|
458
|
+
meta?: Record<string, unknown>;
|
|
459
|
+
};
|
|
268
460
|
```
|
|
269
461
|
|
|
270
|
-
###
|
|
462
|
+
### `Logger`
|
|
463
|
+
|
|
464
|
+
Optional logging interface for debugging segmentation:
|
|
271
465
|
|
|
272
466
|
```typescript
|
|
273
|
-
|
|
467
|
+
interface Logger {
|
|
468
|
+
trace?: (message: string, ...args: unknown[]) => void; // Per-iteration details
|
|
469
|
+
debug?: (message: string, ...args: unknown[]) => void; // Detailed operations
|
|
470
|
+
info?: (message: string, ...args: unknown[]) => void; // Key progress points
|
|
471
|
+
warn?: (message: string, ...args: unknown[]) => void; // Potential issues
|
|
472
|
+
error?: (message: string, ...args: unknown[]) => void; // Critical failures
|
|
473
|
+
}
|
|
474
|
+
```
|
|
274
475
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
476
|
+
## Debugging
|
|
477
|
+
|
|
478
|
+
### Using the Logger
|
|
479
|
+
|
|
480
|
+
Pass a `logger` option to receive detailed information about the segmentation process:
|
|
481
|
+
|
|
482
|
+
```typescript
|
|
483
|
+
// Console logger for development
|
|
484
|
+
const segments = segmentPages(pages, {
|
|
485
|
+
rules: [...],
|
|
486
|
+
logger: {
|
|
487
|
+
debug: console.debug,
|
|
488
|
+
info: console.info,
|
|
489
|
+
warn: console.warn,
|
|
490
|
+
}
|
|
278
491
|
});
|
|
279
492
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
493
|
+
// Production logger (only errors)
|
|
494
|
+
const segments = segmentPages(pages, {
|
|
495
|
+
rules: [...],
|
|
496
|
+
logger: {
|
|
497
|
+
error: (msg, ...args) => myLoggingService.error(msg, ...args),
|
|
498
|
+
}
|
|
283
499
|
});
|
|
284
500
|
```
|
|
285
501
|
|
|
286
|
-
|
|
502
|
+
**Verbosity levels:**
|
|
503
|
+
- `trace` - Per-iteration loop details (very verbose)
|
|
504
|
+
- `debug` - Segment processing, pattern matching
|
|
505
|
+
- `info` - Start/completion of breakpoint processing
|
|
506
|
+
- `warn` - Safety checks triggered
|
|
507
|
+
- `error` - Infinite loop detection
|
|
287
508
|
|
|
288
|
-
|
|
289
|
-
- `DEFAULT_HADITH_PHRASES` - Default narrator phrases
|
|
290
|
-
- `DEFAULT_BASMALA_PATTERNS` - Default basmala patterns
|
|
291
|
-
- `TOKENS` - Token definitions
|
|
509
|
+
When no logger is provided, no logging overhead is incurred.
|
|
292
510
|
|
|
293
|
-
**Functions:**
|
|
294
|
-
- `generateRegexFromMarker()` - Main function
|
|
295
|
-
- `generate{Type}Regex()` - 12 type-specific generators
|
|
296
|
-
- `expandTemplate()` - Template expansion
|
|
297
|
-
- `validateTemplate()` - Template validation
|
|
298
|
-
- `createTokenMap()` - Custom token maps
|
|
299
511
|
|
|
300
|
-
##
|
|
512
|
+
## Usage with Next.js / Node.js
|
|
301
513
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
514
|
+
```typescript
|
|
515
|
+
// app/api/segment/route.ts (Next.js App Router)
|
|
516
|
+
import { segmentPages } from 'flappa-doormal';
|
|
517
|
+
import { NextResponse } from 'next/server';
|
|
518
|
+
|
|
519
|
+
export async function POST(request: Request) {
|
|
520
|
+
const { pages, rules } = await request.json();
|
|
521
|
+
|
|
522
|
+
const segments = segmentPages(pages, { rules });
|
|
523
|
+
|
|
524
|
+
return NextResponse.json({ segments });
|
|
525
|
+
}
|
|
526
|
+
```
|
|
307
527
|
|
|
308
|
-
|
|
309
|
-
|
|
528
|
+
```typescript
|
|
529
|
+
// Node.js script
|
|
530
|
+
import { segmentPages, stripHtmlTags } from 'flappa-doormal';
|
|
531
|
+
|
|
532
|
+
const pages = rawPages.map((p, i) => ({
|
|
533
|
+
id: i + 1,
|
|
534
|
+
content: stripHtmlTags(p.html)
|
|
535
|
+
}));
|
|
536
|
+
|
|
537
|
+
const segments = segmentPages(pages, {
|
|
538
|
+
rules: [{
|
|
539
|
+
lineStartsAfter: ['{{raqms:num}} {{dash}} '],
|
|
540
|
+
split: 'at'
|
|
541
|
+
}]
|
|
542
|
+
});
|
|
310
543
|
|
|
311
|
-
|
|
312
|
-
bun test --coverage
|
|
544
|
+
console.log(`Found ${segments.length} segments`);
|
|
313
545
|
```
|
|
314
546
|
|
|
315
|
-
**Test Coverage**: 100% coverage for `type-generators.ts` with 54+ test cases covering:
|
|
316
|
-
- All 12 marker type generators
|
|
317
|
-
- Edge cases (empty phrases, diacritic variations, custom separators)
|
|
318
|
-
- Error handling (missing required fields)
|
|
319
|
-
- Various numbering styles and separators
|
|
320
|
-
|
|
321
547
|
## Development
|
|
322
548
|
|
|
323
549
|
```bash
|
|
324
550
|
# Install dependencies
|
|
325
551
|
bun install
|
|
326
552
|
|
|
327
|
-
# Run tests
|
|
553
|
+
# Run tests (251 tests)
|
|
328
554
|
bun test
|
|
329
555
|
|
|
330
|
-
# Build
|
|
556
|
+
# Build
|
|
331
557
|
bun run build
|
|
332
558
|
|
|
333
|
-
#
|
|
334
|
-
|
|
559
|
+
# Run performance test (generates 50K pages, measures segmentation speed/memory)
|
|
560
|
+
bun run perf
|
|
335
561
|
|
|
336
|
-
# Lint
|
|
562
|
+
# Lint
|
|
337
563
|
bunx biome lint .
|
|
564
|
+
|
|
565
|
+
# Format
|
|
566
|
+
bunx biome format --write .
|
|
567
|
+
```
|
|
568
|
+
|
|
569
|
+
## Design Decisions
|
|
570
|
+
|
|
571
|
+
### Double-Brace Syntax `{{token}}`
|
|
572
|
+
|
|
573
|
+
Single braces conflict with regex quantifiers `{n,m}`. Double braces are visually distinct and match common template syntax (Handlebars, Mustache).
|
|
574
|
+
|
|
575
|
+
### `lineStartsAfter` vs `lineStartsWith`
|
|
576
|
+
|
|
577
|
+
- `lineStartsWith`: Keep marker in content (for detection only)
|
|
578
|
+
- `lineStartsAfter`: Strip marker, capture only content (for clean extraction)
|
|
579
|
+
|
|
580
|
+
### Fuzzy Applied at Token Level
|
|
581
|
+
|
|
582
|
+
Fuzzy transforms are applied to raw Arabic text *before* wrapping in regex groups. This prevents corruption of regex metacharacters like `(`, `)`, `|`.
|
|
583
|
+
|
|
584
|
+
### Extracted Utilities
|
|
585
|
+
|
|
586
|
+
Complex logic was extracted into `match-utils.ts` so it can be tested independently, which also lowered the main function's complexity score from 37 to 10.
|
|
587
|
+
|
|
588
|
+
## Performance Notes
|
|
589
|
+
|
|
590
|
+
### Memory Requirements
|
|
591
|
+
|
|
592
|
+
The library concatenates all pages into a single string for pattern matching across page boundaries. Memory usage scales linearly with total content size:
|
|
593
|
+
|
|
594
|
+
| Pages | Avg Page Size | Approximate Memory |
|
|
595
|
+
|-------|---------------|-------------------|
|
|
596
|
+
| 1,000 | 5 KB | ~5 MB |
|
|
597
|
+
| 6,000 | 5 KB | ~30 MB |
|
|
598
|
+
| 40,000 | 5 KB | ~200 MB |
|
|
599
|
+
|
|
600
|
+
For typical book processing (up to 6,000 pages), memory usage is well within Node.js defaults. For very large books (40,000+ pages), ensure adequate heap size.
|
|
601
|
+
|
|
602
|
+
### `maxSpan` Sliding Window Behavior
|
|
603
|
+
|
|
604
|
+
The `maxSpan` option uses a **sliding window algorithm** based on page ID difference:
|
|
605
|
+
|
|
606
|
+
```typescript
|
|
607
|
+
// maxSpan = maximum page ID difference when looking ahead for split points
|
|
608
|
+
// Algorithm prefers LONGER segments by looking as far ahead as allowed
|
|
609
|
+
|
|
610
|
+
// Pages [1, 2, 3, 4] with maxSpan: 1, occurrence: 'last'
|
|
611
|
+
// Window from page 1: pages 1-2 (diff <= 1), splits at page 2's last match
|
|
612
|
+
// Window from page 3: pages 3-4 (diff <= 1), splits at page 4's last match
|
|
613
|
+
// Result: 2 segments spanning pages 1-2 and 3-4
|
|
614
|
+
|
|
615
|
+
// Pages [1, 5, 10] with maxSpan: 1, occurrence: 'last'
|
|
616
|
+
// Window from page 1: only page 1 (5-1=4 > 1), splits at page 1
|
|
617
|
+
// Window from page 5: only page 5 (10-5=5 > 1), splits at page 5
|
|
618
|
+
// Window from page 10: only page 10, splits at page 10
|
|
619
|
+
// Result: 3 segments (pages too far apart to merge)
|
|
338
620
|
```
|
|
339
621
|
|
|
622
|
+
This is intentional for books where page IDs represent actual page numbers. With `occurrence: 'last'`, the algorithm finds the last match within the lookahead window, creating longer segments where possible.
|
|
623
|
+
|
|
340
624
|
## For AI Agents
|
|
341
625
|
|
|
342
|
-
See [AGENTS.md](./AGENTS.md) for
|
|
343
|
-
-
|
|
344
|
-
- Adding new
|
|
345
|
-
-
|
|
346
|
-
-
|
|
347
|
-
- Extension points
|
|
626
|
+
See [AGENTS.md](./AGENTS.md) for:
|
|
627
|
+
- Architecture details and design patterns
|
|
628
|
+
- Adding new tokens and pattern types
|
|
629
|
+
- Algorithm explanations
|
|
630
|
+
- Lessons learned during development
|
|
348
631
|
|
|
349
632
|
## License
|
|
350
633
|
|
|
351
634
|
MIT
|
|
352
635
|
|
|
353
|
-
## Related
|
|
354
|
-
|
|
355
|
-
- [bitaboom](https://github.com/ragaeeb/bitaboom) - Arabic text utilities
|
|
356
|
-
- [baburchi](https://github.com/ragaeeb/baburchi) - Text sanitization
|
|
357
|
-
- [shamela](https://github.com/ragaeeb/shamela) - Shamela library utilities
|