flappa-doormal 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +407 -205
- package/dist/index.d.mts +722 -332
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1363 -396
- package/dist/index.mjs.map +1 -1
- package/package.json +11 -9
package/README.md
CHANGED
|
@@ -12,311 +12,461 @@
|
|
|
12
12
|
[](https://codecov.io/gh/ragaeeb/flappa-doormal)
|
|
13
13
|
[](https://badge.fury.io/js/flappa-doormal)
|
|
14
14
|
|
|
15
|
-
**Arabic text
|
|
15
|
+
**Declarative Arabic text segmentation library** - Split pages of content into logical segments using human-readable patterns.
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
## Why This Library?
|
|
18
|
+
|
|
19
|
+
### The Problem
|
|
20
|
+
|
|
21
|
+
Working with Arabic hadith and Islamic text collections requires splitting continuous text into segments (individual hadiths, chapters, verses). This traditionally means:
|
|
22
|
+
|
|
23
|
+
- Writing complex Unicode regex patterns: `^[\u0660-\u0669]+\s*[-–—ـ]\s*`
|
|
24
|
+
- Handling diacritic variations: `حَدَّثَنَا` vs `حدثنا`
|
|
25
|
+
- Managing multi-page spans and page boundary tracking
|
|
26
|
+
- Manually extracting hadith numbers, volume/page references
|
|
27
|
+
|
|
28
|
+
### What Exists
|
|
29
|
+
|
|
30
|
+
- **General regex libraries**: Don't understand Arabic text nuances
|
|
31
|
+
- **NLP tokenizers**: Overkill for pattern-based segmentation
|
|
32
|
+
- **Manual regex**: Error-prone, hard to maintain, no metadata extraction
|
|
33
|
+
|
|
34
|
+
### The Solution
|
|
35
|
+
|
|
36
|
+
**flappa-doormal** provides:
|
|
37
|
+
|
|
38
|
+
✅ **Readable templates**: `{{raqms}} {{dash}}` instead of cryptic regex
|
|
39
|
+
✅ **Named captures**: `{{raqms:hadithNum}}` auto-extracts to `meta.hadithNum`
|
|
40
|
+
✅ **Fuzzy matching**: Ignore diacritics with `fuzzy: true`
|
|
41
|
+
✅ **Page tracking**: Know which page each segment came from
|
|
42
|
+
✅ **Declarative rules**: Describe *what* to match, not *how*
|
|
18
43
|
|
|
19
44
|
## Installation
|
|
20
45
|
|
|
21
46
|
```bash
|
|
47
|
+
npm install flappa-doormal
|
|
48
|
+
# or
|
|
22
49
|
bun add flappa-doormal
|
|
23
|
-
#
|
|
24
|
-
|
|
50
|
+
# or
|
|
51
|
+
yarn add flappa-doormal
|
|
25
52
|
```
|
|
26
53
|
|
|
27
54
|
## Quick Start
|
|
28
55
|
|
|
29
56
|
```typescript
|
|
30
|
-
import {
|
|
31
|
-
|
|
32
|
-
//
|
|
33
|
-
const
|
|
34
|
-
|
|
57
|
+
import { segmentPages } from 'flappa-doormal';
|
|
58
|
+
|
|
59
|
+
// Your pages from a hadith book
|
|
60
|
+
const pages = [
|
|
61
|
+
{ id: 1, content: '٦٦٩٦ - حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ...' },
|
|
62
|
+
{ id: 1, content: '٦٦٩٧ - أَخْبَرَنَا عُمَرُ قَالَ...' },
|
|
63
|
+
{ id: 2, content: '٦٦٩٨ - حَدَّثَنِي مُحَمَّدٌ...' },
|
|
64
|
+
];
|
|
65
|
+
|
|
66
|
+
const segments = segmentPages(pages, {
|
|
67
|
+
rules: [{
|
|
68
|
+
lineStartsAfter: ['{{raqms:num}} {{dash}} '],
|
|
69
|
+
split: 'at',
|
|
70
|
+
}]
|
|
35
71
|
});
|
|
36
72
|
|
|
37
|
-
|
|
38
|
-
//
|
|
73
|
+
// Result:
|
|
74
|
+
// [
|
|
75
|
+
// { content: 'حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ...', from: 1, meta: { num: '٦٦٩٦' } },
|
|
76
|
+
// { content: 'أَخْبَرَنَا عُمَرُ قَالَ...', from: 1, meta: { num: '٦٦٩٧' } },
|
|
77
|
+
// { content: 'حَدَّثَنِي مُحَمَّدٌ...', from: 2, meta: { num: '٦٦٩٨' } }
|
|
78
|
+
// ]
|
|
39
79
|
```
|
|
40
80
|
|
|
41
81
|
## Features
|
|
42
82
|
|
|
43
|
-
|
|
44
|
-
✅ **Template System** - Use `{num}`, `{dash}`, `{bullet}` instead of regex
|
|
45
|
-
✅ **Type-Safe** - Full TypeScript support
|
|
46
|
-
✅ **Composable** - Mix and match tokens with quantifiers
|
|
47
|
-
✅ **Diacritic-Insensitive** - Handles Arabic text variations
|
|
83
|
+
### 1. Template Tokens
|
|
48
84
|
|
|
49
|
-
|
|
85
|
+
Replace regex with readable tokens:
|
|
50
86
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
{
|
|
54
|
-
{
|
|
55
|
-
{
|
|
56
|
-
{
|
|
57
|
-
{
|
|
58
|
-
{
|
|
59
|
-
|
|
87
|
+
| Token | Matches | Regex Equivalent |
|
|
88
|
+
|-------|---------|------------------|
|
|
89
|
+
| `{{raqms}}` | Arabic-Indic digits | `[\\u0660-\\u0669]+` |
|
|
90
|
+
| `{{raqm}}` | Single Arabic digit | `[\\u0660-\\u0669]` |
|
|
91
|
+
| `{{dash}}` | Dash variants | `[-–—ـ]` |
|
|
92
|
+
| `{{harf}}` | Arabic letter | `[أ-ي]` |
|
|
93
|
+
| `{{numbered}}` | Hadith numbering `٢٢ - ` | `{{raqms}} {{dash}} ` |
|
|
94
|
+
| `{{fasl}}` | Section markers | `فصل\|مسألة` |
|
|
95
|
+
| `{{tarqim}}` | Punctuation marks | `[.!?؟؛]` |
|
|
96
|
+
| `{{bullet}}` | Bullet points | `[•*°]` |
|
|
97
|
+
| `{{naql}}` | Narrator phrases | `حدثنا\|أخبرنا\|...` |
|
|
98
|
+
| `{{kitab}}` | "كتاب" (book) | `كتاب` |
|
|
99
|
+
| `{{bab}}` | "باب" (chapter) | `باب` |
|
|
100
|
+
| `{{basmalah}}` | "بسم الله" | `بسم الله` |
|
|
60
101
|
|
|
61
|
-
###
|
|
62
|
-
```typescript
|
|
63
|
-
{ type: 'num-letter' } // ٥ أ - (number + letter)
|
|
64
|
-
{ type: 'num-paren' } // ٥ (أ) - (number + paren)
|
|
65
|
-
{ type: 'num-slash' } // ٥/٦ - (number/number)
|
|
66
|
-
```
|
|
102
|
+
### 2. Named Capture Groups
|
|
67
103
|
|
|
68
|
-
|
|
104
|
+
Extract metadata automatically with the `{{token:name}}` syntax:
|
|
69
105
|
|
|
70
|
-
**Using templates (recommended):**
|
|
71
106
|
```typescript
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
107
|
+
// Capture hadith number
|
|
108
|
+
{ template: '^{{raqms:hadithNum}} {{dash}} ' }
|
|
109
|
+
// Result: meta.hadithNum = '٦٦٩٦'
|
|
110
|
+
|
|
111
|
+
// Capture volume and page
|
|
112
|
+
{ template: '^{{raqms:vol}}/{{raqms:page}} {{dash}} ' }
|
|
113
|
+
// Result: meta.vol = '٣', meta.page = '٤٥٦'
|
|
114
|
+
|
|
115
|
+
// Capture rest of content
|
|
116
|
+
{ template: '^{{raqms:num}} {{dash}} {{:text}}' }
|
|
117
|
+
// Result: meta.num = '٦٦٩٦', meta.text = 'حَدَّثَنَا أَبُو بَكْرٍ'
|
|
76
118
|
```
|
|
77
119
|
|
|
78
|
-
|
|
120
|
+
### 3. Fuzzy Matching (Diacritic-Insensitive)
|
|
121
|
+
|
|
122
|
+
Match Arabic text regardless of harakat:
|
|
123
|
+
|
|
79
124
|
```typescript
|
|
80
|
-
{
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
125
|
+
const rules = [{
|
|
126
|
+
fuzzy: true,
|
|
127
|
+
lineStartsAfter: ['{{kitab:book}} '],
|
|
128
|
+
split: 'at',
|
|
129
|
+
}];
|
|
130
|
+
|
|
131
|
+
// Matches both:
|
|
132
|
+
// - 'كِتَابُ الصلاة' (with diacritics)
|
|
133
|
+
// - 'كتاب الصيام' (without diacritics)
|
|
84
134
|
```
|
|
85
135
|
|
|
86
|
-
|
|
136
|
+
### 4. Pattern Types
|
|
137
|
+
|
|
138
|
+
| Type | Marker in content? | Use case |
|
|
139
|
+
|------|-------------------|----------|
|
|
140
|
+
| `lineStartsWith` | ✅ Included | Keep marker, segment at boundary |
|
|
141
|
+
| `lineStartsAfter` | ❌ Excluded | Strip marker, capture only content |
|
|
142
|
+
| `lineEndsWith` | ✅ Included | Match patterns at end of line |
|
|
143
|
+
| `template` | Depends | Custom pattern with full control |
|
|
144
|
+
| `regex` | Depends | Raw regex for complex cases |
|
|
145
|
+
|
|
146
|
+
### 5. Page Constraints
|
|
147
|
+
|
|
148
|
+
Limit rules to specific page ranges:
|
|
149
|
+
|
|
87
150
|
```typescript
|
|
88
151
|
{
|
|
89
|
-
|
|
90
|
-
|
|
152
|
+
lineStartsWith: ['## '],
|
|
153
|
+
split: 'at',
|
|
154
|
+
min: 10, // Only pages 10+
|
|
155
|
+
max: 100, // Only pages up to 100
|
|
91
156
|
}
|
|
92
157
|
```
|
|
93
158
|
|
|
94
|
-
|
|
159
|
+
### 6. Occurrence Filtering
|
|
95
160
|
|
|
96
|
-
|
|
97
|
-
Match patterns like: `٩٩٣٦، ٩٩٣٧ - حَدَّثَنَا`
|
|
161
|
+
Control which matches to use:
|
|
98
162
|
|
|
99
163
|
```typescript
|
|
100
164
|
{
|
|
101
|
-
|
|
102
|
-
|
|
165
|
+
lineEndsWith: ['\\.'],
|
|
166
|
+
split: 'after',
|
|
167
|
+
occurrence: 'last', // Only split at LAST period on page
|
|
168
|
+
maxSpan: 1, // Max page-ID difference when looking ahead for a split point
|
|
103
169
|
}
|
|
104
170
|
```
|
|
105
171
|
|
|
106
|
-
|
|
107
|
-
|
|
172
|
+
## Use Cases
|
|
173
|
+
|
|
174
|
+
### Simple Hadith Segmentation
|
|
175
|
+
|
|
176
|
+
Use `{{numbered}}` for the common "number - content" format:
|
|
108
177
|
|
|
109
178
|
```typescript
|
|
110
|
-
{
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
179
|
+
const segments = segmentPages(pages, {
|
|
180
|
+
rules: [{
|
|
181
|
+
lineStartsAfter: ['{{numbered}}'],
|
|
182
|
+
split: 'at',
|
|
183
|
+
meta: { type: 'hadith' }
|
|
184
|
+
}]
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
// Matches: ٢٢ - حدثنا, ٦٦٩٦ – أخبرنا, etc.
|
|
188
|
+
// Content starts AFTER the number and dash
|
|
114
189
|
```
|
|
115
190
|
|
|
116
|
-
###
|
|
117
|
-
|
|
191
|
+
### Hadith Segmentation with Number Extraction
|
|
192
|
+
|
|
193
|
+
For capturing the hadith number, use explicit capture syntax:
|
|
118
194
|
|
|
119
195
|
```typescript
|
|
120
|
-
{
|
|
121
|
-
|
|
122
|
-
}
|
|
196
|
+
const segments = segmentPages(pages, {
|
|
197
|
+
rules: [{
|
|
198
|
+
lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
|
|
199
|
+
split: 'at',
|
|
200
|
+
meta: { type: 'hadith' }
|
|
201
|
+
}]
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
// Each segment has:
|
|
205
|
+
// - content: The hadith text (without number prefix)
|
|
206
|
+
// - from/to: Page range
|
|
207
|
+
// - meta: { type: 'hadith', hadithNum: '٦٦٩٦' }
|
|
123
208
|
```
|
|
124
209
|
|
|
125
|
-
###
|
|
126
|
-
Match patterns like: `. . . . . . . . . .`
|
|
210
|
+
### Volume/Page Reference Extraction
|
|
127
211
|
|
|
128
212
|
```typescript
|
|
129
|
-
{
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
213
|
+
const segments = segmentPages(pages, {
|
|
214
|
+
rules: [{
|
|
215
|
+
lineStartsAfter: ['{{raqms:vol}}/{{raqms:page}} {{dash}} '],
|
|
216
|
+
split: 'at'
|
|
217
|
+
}]
|
|
218
|
+
});
|
|
219
|
+
|
|
220
|
+
// meta: { vol: '٣', page: '٤٥٦' }
|
|
133
221
|
```
|
|
134
222
|
|
|
135
|
-
###
|
|
136
|
-
Match patterns like: `*. . . / ٨٦ - حَدَّثَنَا`
|
|
223
|
+
### Chapter Detection with Fuzzy Matching
|
|
137
224
|
|
|
138
|
-
**Option 1: Capture from asterisk**
|
|
139
225
|
```typescript
|
|
140
|
-
{
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
226
|
+
const segments = segmentPages(pages, {
|
|
227
|
+
rules: [{
|
|
228
|
+
fuzzy: true,
|
|
229
|
+
lineStartsAfter: ['{{kitab:book}} '],
|
|
230
|
+
split: 'at',
|
|
231
|
+
meta: { type: 'chapter' }
|
|
232
|
+
}]
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
// Matches "كِتَابُ" or "كتاب" regardless of diacritics
|
|
145
236
|
```
|
|
146
237
|
|
|
147
|
-
|
|
238
|
+
### Naql (Transmission) Phrase Detection
|
|
239
|
+
|
|
148
240
|
```typescript
|
|
149
|
-
{
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
}
|
|
241
|
+
const segments = segmentPages(pages, {
|
|
242
|
+
rules: [{
|
|
243
|
+
fuzzy: true,
|
|
244
|
+
lineStartsWith: ['{{naql:phrase}}'],
|
|
245
|
+
split: 'at'
|
|
246
|
+
}]
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
// meta.phrase captures which narrator phrase was matched:
|
|
250
|
+
// 'حدثنا', 'أخبرنا', 'حدثني', etc.
|
|
153
251
|
```
|
|
154
252
|
|
|
155
|
-
|
|
253
|
+
### Mixed Captured and Non-Captured Tokens
|
|
156
254
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
| `{letter}` | Arabic letters | `[أ-ي]` |
|
|
166
|
-
| `{s}` | Optional space | `\\s?` |
|
|
167
|
-
| `{space}` | Required space | `\\s+` |
|
|
255
|
+
```typescript
|
|
256
|
+
// Only capture the number, not the letter
|
|
257
|
+
const segments = segmentPages(pages, {
|
|
258
|
+
rules: [{
|
|
259
|
+
lineStartsWith: ['{{raqms:num}} {{harf}} {{dash}} '],
|
|
260
|
+
split: 'at'
|
|
261
|
+
}]
|
|
262
|
+
});
|
|
168
263
|
|
|
169
|
-
|
|
264
|
+
// Input: '٥ أ - البند الأول'
|
|
265
|
+
// meta: { num: '٥' } // harf not captured (no :name suffix)
|
|
266
|
+
```
|
|
170
267
|
|
|
171
|
-
|
|
268
|
+
### Sentence-Based Splitting (Last Period Per Page)
|
|
172
269
|
|
|
173
|
-
### Before (Regex)
|
|
174
270
|
```typescript
|
|
175
|
-
const
|
|
271
|
+
const segments = segmentPages(pages, {
|
|
272
|
+
rules: [{
|
|
273
|
+
lineEndsWith: ['\\.'],
|
|
274
|
+
split: 'after',
|
|
275
|
+
occurrence: 'last',
|
|
276
|
+
maxSpan: 1
|
|
277
|
+
}]
|
|
278
|
+
});
|
|
176
279
|
```
|
|
177
280
|
|
|
178
|
-
###
|
|
281
|
+
### Page Fallback for Unmatched Content
|
|
282
|
+
|
|
283
|
+
When using `maxSpan` to group matches per page, use `fallback: 'page'` to prevent unmatched pages from merging with adjacent segments:
|
|
284
|
+
|
|
179
285
|
```typescript
|
|
180
|
-
{
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
286
|
+
const segments = segmentPages(pages, {
|
|
287
|
+
rules: [{
|
|
288
|
+
template: '{{tarqim}}', // Match punctuation marks
|
|
289
|
+
split: 'after',
|
|
290
|
+
occurrence: 'last',
|
|
291
|
+
maxSpan: 1,
|
|
292
|
+
fallback: 'page' // If no punctuation found, segment the page anyway
|
|
293
|
+
}]
|
|
294
|
+
});
|
|
184
295
|
```
|
|
185
296
|
|
|
186
|
-
**
|
|
297
|
+
**Without `fallback`**: Pages without matches merge into the next segment
|
|
298
|
+
**With `fallback: 'page'`**: Each page becomes its own segment even without matches
|
|
187
299
|
|
|
188
|
-
|
|
300
|
+
> **Future extensions**: The `fallback` option may support additional values like `'skip'` (omit unmatched content) or `'line'` (split at line breaks) in future versions.
|
|
189
301
|
|
|
190
|
-
###
|
|
302
|
+
### Multiple Rules with Priority
|
|
191
303
|
|
|
192
304
|
```typescript
|
|
193
|
-
|
|
305
|
+
const segments = segmentPages(pages, {
|
|
306
|
+
rules: [
|
|
307
|
+
// First: Chapter headers (highest priority)
|
|
308
|
+
{ fuzzy: true, lineStartsAfter: ['{{kitab:book}} '], split: 'at', meta: { type: 'chapter' } },
|
|
309
|
+
// Second: Sub-chapters
|
|
310
|
+
{ fuzzy: true, lineStartsAfter: ['{{bab:section}} '], split: 'at', meta: { type: 'section' } },
|
|
311
|
+
// Third: Individual hadiths
|
|
312
|
+
{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at', meta: { type: 'hadith' } },
|
|
313
|
+
]
|
|
314
|
+
});
|
|
315
|
+
```
|
|
194
316
|
|
|
195
|
-
|
|
196
|
-
type: 'numbered',
|
|
197
|
-
numbering: 'arabic-indic', // or 'latin', 'roman'
|
|
198
|
-
separator: 'dash', // or 'dot', 'colon', 'paren'
|
|
199
|
-
removeMarker: true, // Remove marker from capture (default: true)
|
|
200
|
-
};
|
|
317
|
+
## API Reference
|
|
201
318
|
|
|
202
|
-
|
|
203
|
-
```
|
|
319
|
+
### `segmentPages(pages, options)`
|
|
204
320
|
|
|
205
|
-
|
|
321
|
+
Main segmentation function.
|
|
206
322
|
|
|
207
323
|
```typescript
|
|
208
|
-
import {
|
|
324
|
+
import { segmentPages, type Page, type SegmentationOptions, type Segment } from 'flappa-doormal';
|
|
325
|
+
|
|
326
|
+
const pages: Page[] = [
|
|
327
|
+
{ id: 1, content: 'First page content...' },
|
|
328
|
+
{ id: 2, content: 'Second page content...' },
|
|
329
|
+
];
|
|
209
330
|
|
|
210
|
-
const
|
|
211
|
-
|
|
331
|
+
const options: SegmentationOptions = {
|
|
332
|
+
rules: [
|
|
333
|
+
{ lineStartsWith: ['## '], split: 'at' }
|
|
334
|
+
]
|
|
335
|
+
};
|
|
212
336
|
|
|
213
|
-
const
|
|
214
|
-
// Returns: '^([\\u0660-\\u0669]+ [-–—ـ].*)'
|
|
337
|
+
const segments: Segment[] = segmentPages(pages, options);
|
|
215
338
|
```
|
|
216
339
|
|
|
217
|
-
### `
|
|
340
|
+
### `stripHtmlTags(html)`
|
|
341
|
+
|
|
342
|
+
Remove all HTML tags from content, keeping only text.
|
|
218
343
|
|
|
219
344
|
```typescript
|
|
220
|
-
import {
|
|
345
|
+
import { stripHtmlTags } from 'flappa-doormal';
|
|
221
346
|
|
|
222
|
-
const
|
|
223
|
-
// Returns:
|
|
347
|
+
const text = stripHtmlTags('<p>Hello <b>World</b></p>');
|
|
348
|
+
// Returns: 'Hello World'
|
|
224
349
|
```
|
|
225
350
|
|
|
226
|
-
|
|
351
|
+
For more sophisticated HTML-to-Markdown conversion (like converting `<span data-type="title">` to `## ` headers), you can implement your own function. Here's an example:
|
|
227
352
|
|
|
228
353
|
```typescript
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
phrases?: string[]; // For 'phrase' and 'hadith-chain' types
|
|
238
|
-
removeMarker?: boolean; // Default: true for numbered/bullet
|
|
354
|
+
const htmlToMarkdown = (html: string): string => {
|
|
355
|
+
return html
|
|
356
|
+
// Convert title spans to markdown headers
|
|
357
|
+
.replace(/<span[^>]*data-type=["']title["'][^>]*>(.*?)<\/span>/gi, '## $1')
|
|
358
|
+
// Strip narrator links but keep text
|
|
359
|
+
.replace(/<a[^>]*href=["']inr:\/\/[^"']*["'][^>]*>(.*?)<\/a>/gi, '$1')
|
|
360
|
+
// Strip all remaining HTML tags
|
|
361
|
+
.replace(/<[^>]*>/g, '');
|
|
239
362
|
};
|
|
240
363
|
```
|
|
241
364
|
|
|
242
|
-
|
|
365
|
+
### `expandTokens(template)`
|
|
243
366
|
|
|
244
|
-
|
|
367
|
+
Expand template tokens to regex pattern.
|
|
245
368
|
|
|
246
369
|
```typescript
|
|
247
|
-
import {
|
|
370
|
+
import { expandTokens } from 'flappa-doormal';
|
|
248
371
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
const regex = generateRegexFromMarker({
|
|
253
|
-
type: 'hadith-chain',
|
|
254
|
-
phrases: myPhrases,
|
|
255
|
-
});
|
|
372
|
+
const pattern = expandTokens('{{raqms}} {{dash}}');
|
|
373
|
+
// Returns: '[\\u0660-\\u0669]+ [-–—ـ]'
|
|
256
374
|
```
|
|
257
375
|
|
|
258
|
-
###
|
|
376
|
+
### `makeDiacriticInsensitive(text)`
|
|
377
|
+
|
|
378
|
+
Make Arabic text diacritic-insensitive for fuzzy matching.
|
|
259
379
|
|
|
260
380
|
```typescript
|
|
261
|
-
import {
|
|
381
|
+
import { makeDiacriticInsensitive } from 'flappa-doormal';
|
|
262
382
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
{ type: 'hadith-chain', phrases: [...DEFAULT_HADITH_PHRASES, 'extra'] },
|
|
266
|
-
true // removeMarker
|
|
267
|
-
);
|
|
383
|
+
const pattern = makeDiacriticInsensitive('حدثنا');
|
|
384
|
+
// Returns regex pattern matching 'حَدَّثَنَا', 'حدثنا', etc.
|
|
268
385
|
```
|
|
269
386
|
|
|
270
|
-
###
|
|
387
|
+
### `TOKEN_PATTERNS`
|
|
271
388
|
|
|
272
|
-
|
|
273
|
-
import { createTokenMap, expandTemplate } from 'flappa-doormal';
|
|
389
|
+
Access available token definitions.
|
|
274
390
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
tafsir: 'تفسير',
|
|
278
|
-
});
|
|
391
|
+
```typescript
|
|
392
|
+
import { TOKEN_PATTERNS } from 'flappa-doormal';
|
|
279
393
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
removeMarker: true
|
|
283
|
-
});
|
|
394
|
+
console.log(TOKEN_PATTERNS.naql);
|
|
395
|
+
// 'حدثنا|أخبرنا|حدثني|وحدثنا|أنبأنا|سمعت'
|
|
284
396
|
```
|
|
285
397
|
|
|
286
|
-
##
|
|
398
|
+
## Types
|
|
287
399
|
|
|
288
|
-
|
|
289
|
-
- `DEFAULT_HADITH_PHRASES` - Default narrator phrases
|
|
290
|
-
- `DEFAULT_BASMALA_PATTERNS` - Default basmala patterns
|
|
291
|
-
- `TOKENS` - Token definitions
|
|
400
|
+
### `SplitRule`
|
|
292
401
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
402
|
+
```typescript
|
|
403
|
+
type SplitRule = {
|
|
404
|
+
// Pattern (choose one)
|
|
405
|
+
lineStartsWith?: string[];
|
|
406
|
+
lineStartsAfter?: string[];
|
|
407
|
+
lineEndsWith?: string[];
|
|
408
|
+
template?: string;
|
|
409
|
+
regex?: string;
|
|
410
|
+
|
|
411
|
+
// Split behavior
|
|
412
|
+
split: 'at' | 'after';
|
|
413
|
+
occurrence?: 'first' | 'last' | 'all';
|
|
414
|
+
maxSpan?: number;
|
|
415
|
+
fuzzy?: boolean;
|
|
416
|
+
fallback?: 'page'; // NEW: Page-boundary fallback
|
|
417
|
+
|
|
418
|
+
// Constraints
|
|
419
|
+
min?: number;
|
|
420
|
+
max?: number;
|
|
421
|
+
meta?: Record<string, unknown>;
|
|
422
|
+
};
|
|
423
|
+
```
|
|
301
424
|
|
|
302
|
-
|
|
425
|
+
### `Segment`
|
|
303
426
|
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
|
|
427
|
+
```typescript
|
|
428
|
+
type Segment = {
|
|
429
|
+
content: string;
|
|
430
|
+
from: number;
|
|
431
|
+
to?: number;
|
|
432
|
+
meta?: Record<string, unknown>;
|
|
433
|
+
};
|
|
434
|
+
```
|
|
307
435
|
|
|
308
|
-
|
|
309
|
-
bun test src/markers/type-generators.test.ts
|
|
436
|
+
## Usage with Next.js / Node.js
|
|
310
437
|
|
|
311
|
-
|
|
312
|
-
|
|
438
|
+
```typescript
|
|
439
|
+
// app/api/segment/route.ts (Next.js App Router)
|
|
440
|
+
import { segmentPages } from 'flappa-doormal';
|
|
441
|
+
import { NextResponse } from 'next/server';
|
|
442
|
+
|
|
443
|
+
export async function POST(request: Request) {
|
|
444
|
+
const { pages, rules } = await request.json();
|
|
445
|
+
|
|
446
|
+
const segments = segmentPages(pages, { rules });
|
|
447
|
+
|
|
448
|
+
return NextResponse.json({ segments });
|
|
449
|
+
}
|
|
313
450
|
```
|
|
314
451
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
452
|
+
```typescript
|
|
453
|
+
// Node.js script
|
|
454
|
+
import { segmentPages, stripHtmlTags } from 'flappa-doormal';
|
|
455
|
+
|
|
456
|
+
const pages = rawPages.map((p, i) => ({
|
|
457
|
+
id: i + 1,
|
|
458
|
+
content: stripHtmlTags(p.html)
|
|
459
|
+
}));
|
|
460
|
+
|
|
461
|
+
const segments = segmentPages(pages, {
|
|
462
|
+
rules: [{
|
|
463
|
+
lineStartsAfter: ['{{raqms:num}} {{dash}} '],
|
|
464
|
+
split: 'at'
|
|
465
|
+
}]
|
|
466
|
+
});
|
|
467
|
+
|
|
468
|
+
console.log(`Found ${segments.length} segments`);
|
|
469
|
+
```
|
|
320
470
|
|
|
321
471
|
## Development
|
|
322
472
|
|
|
@@ -324,34 +474,86 @@ bun test --coverage
|
|
|
324
474
|
# Install dependencies
|
|
325
475
|
bun install
|
|
326
476
|
|
|
327
|
-
# Run tests
|
|
477
|
+
# Run tests (222 tests)
|
|
328
478
|
bun test
|
|
329
479
|
|
|
330
|
-
# Build
|
|
480
|
+
# Build
|
|
331
481
|
bun run build
|
|
332
482
|
|
|
333
|
-
#
|
|
334
|
-
|
|
483
|
+
# Run performance test (generates 50K pages, measures segmentation speed/memory)
|
|
484
|
+
bun run perf
|
|
335
485
|
|
|
336
|
-
# Lint
|
|
486
|
+
# Lint
|
|
337
487
|
bunx biome lint .
|
|
488
|
+
|
|
489
|
+
# Format
|
|
490
|
+
bunx biome format --write .
|
|
338
491
|
```
|
|
339
492
|
|
|
493
|
+
## Design Decisions
|
|
494
|
+
|
|
495
|
+
### Double-Brace Syntax `{{token}}`
|
|
496
|
+
|
|
497
|
+
Single braces conflict with regex quantifiers `{n,m}`. Double braces are visually distinct and match common template syntax (Handlebars, Mustache).
|
|
498
|
+
|
|
499
|
+
### `lineStartsAfter` vs `lineStartsWith`
|
|
500
|
+
|
|
501
|
+
- `lineStartsWith`: Keep marker in content (for detection only)
|
|
502
|
+
- `lineStartsAfter`: Strip marker, capture only content (for clean extraction)
|
|
503
|
+
|
|
504
|
+
### Fuzzy Applied at Token Level
|
|
505
|
+
|
|
506
|
+
Fuzzy transforms are applied to raw Arabic text *before* wrapping in regex groups. This prevents corruption of regex metacharacters like `(`, `)`, `|`.
|
|
507
|
+
|
|
508
|
+
### Extracted Utilities
|
|
509
|
+
|
|
510
|
+
Complex logic was extracted into `match-utils.ts` for independent testing and reduced complexity (main function: 37 → 10).
|
|
511
|
+
|
|
512
|
+
## Performance Notes
|
|
513
|
+
|
|
514
|
+
### Memory Requirements
|
|
515
|
+
|
|
516
|
+
The library concatenates all pages into a single string for pattern matching across page boundaries. Memory usage scales linearly with total content size:
|
|
517
|
+
|
|
518
|
+
| Pages | Avg Page Size | Approximate Memory |
|
|
519
|
+
|-------|---------------|-------------------|
|
|
520
|
+
| 1,000 | 5 KB | ~5 MB |
|
|
521
|
+
| 6,000 | 5 KB | ~30 MB |
|
|
522
|
+
| 40,000 | 5 KB | ~200 MB |
|
|
523
|
+
|
|
524
|
+
For typical book processing (up to 6,000 pages), memory usage is well within Node.js defaults. For very large books (40,000+ pages), ensure adequate heap size.
|
|
525
|
+
|
|
526
|
+
### `maxSpan` Sliding Window Behavior
|
|
527
|
+
|
|
528
|
+
The `maxSpan` option uses a **sliding window algorithm** based on page ID difference:
|
|
529
|
+
|
|
530
|
+
```typescript
|
|
531
|
+
// maxSpan = maximum page ID difference when looking ahead for split points
|
|
532
|
+
// Algorithm prefers LONGER segments by looking as far ahead as allowed
|
|
533
|
+
|
|
534
|
+
// Pages [1, 2, 3, 4] with maxSpan: 1, occurrence: 'last'
|
|
535
|
+
// Window from page 1: pages 1-2 (diff <= 1), splits at page 2's last match
|
|
536
|
+
// Window from page 3: pages 3-4 (diff <= 1), splits at page 4's last match
|
|
537
|
+
// Result: 2 segments spanning pages 1-2 and 3-4
|
|
538
|
+
|
|
539
|
+
// Pages [1, 5, 10] with maxSpan: 1, occurrence: 'last'
|
|
540
|
+
// Window from page 1: only page 1 (5-1=4 > 1), splits at page 1
|
|
541
|
+
// Window from page 5: only page 5 (10-5=5 > 1), splits at page 5
|
|
542
|
+
// Window from page 10: only page 10, splits at page 10
|
|
543
|
+
// Result: 3 segments (pages too far apart to merge)
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
This is intentional for books where page IDs represent actual page numbers. With `occurrence: 'last'`, the algorithm finds the last match within the lookahead window, creating longer segments where possible.
|
|
547
|
+
|
|
340
548
|
## For AI Agents
|
|
341
549
|
|
|
342
|
-
See [AGENTS.md](./AGENTS.md) for
|
|
343
|
-
-
|
|
344
|
-
- Adding new
|
|
345
|
-
-
|
|
346
|
-
-
|
|
347
|
-
- Extension points
|
|
550
|
+
See [AGENTS.md](./AGENTS.md) for:
|
|
551
|
+
- Architecture details and design patterns
|
|
552
|
+
- Adding new tokens and pattern types
|
|
553
|
+
- Algorithm explanations
|
|
554
|
+
- Lessons learned during development
|
|
348
555
|
|
|
349
556
|
## License
|
|
350
557
|
|
|
351
558
|
MIT
|
|
352
559
|
|
|
353
|
-
## Related
|
|
354
|
-
|
|
355
|
-
- [bitaboom](https://github.com/ragaeeb/bitaboom) - Arabic text utilities
|
|
356
|
-
- [baburchi](https://github.com/ragaeeb/baburchi) - Text sanitization
|
|
357
|
-
- [shamela](https://github.com/ragaeeb/shamela) - Shamela library utilities
|