flappa-doormal 2.2.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +38 -6
- package/README.md +34 -3
- package/dist/index.d.mts +28 -0
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +473 -338
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/AGENTS.md
CHANGED
|
@@ -30,8 +30,10 @@ src/
|
|
|
30
30
|
├── pattern-detection.test.ts # Pattern detection tests (22 tests)
|
|
31
31
|
└── segmentation/
|
|
32
32
|
├── types.ts # TypeScript type definitions for rules/segments
|
|
33
|
-
├── segmenter.ts # Core segmentation engine (segmentPages
|
|
34
|
-
├── breakpoint-
|
|
33
|
+
├── segmenter.ts # Core segmentation engine (segmentPages)
|
|
34
|
+
├── breakpoint-processor.ts # Breakpoint post-processing engine (applyBreakpoints)
|
|
35
|
+
├── breakpoint-utils.ts # Breakpoint processing utilities (windowing, excludes, page joins)
|
|
36
|
+
├── rule-regex.ts # SplitRule -> compiled regex builder (buildRuleRegex, processPattern)
|
|
35
37
|
├── tokens.ts # Token definitions and expansion logic
|
|
36
38
|
├── fuzzy.ts # Diacritic-insensitive matching utilities
|
|
37
39
|
├── html.ts # HTML utilities (stripHtmlTags)
|
|
@@ -40,6 +42,8 @@ src/
|
|
|
40
42
|
├── segmenter.test.ts # Core test suite (150+ tests including breakpoints)
|
|
41
43
|
├── segmenter.bukhari.test.ts # Real-world test cases
|
|
42
44
|
├── breakpoint-utils.test.ts # Breakpoint utility tests (42 tests)
|
|
45
|
+
├── rule-regex.test.ts # Rule regex builder tests
|
|
46
|
+
├── segmenter-utils.test.ts # Segmenter helper tests
|
|
43
47
|
├── tokens.test.ts # Token expansion tests
|
|
44
48
|
├── fuzzy.test.ts # Fuzzy matching tests
|
|
45
49
|
├── textUtils.test.ts # Text utility tests
|
|
@@ -72,7 +76,16 @@ docs/
|
|
|
72
76
|
- `filterByConstraints()` - Apply min/max page filters
|
|
73
77
|
- `anyRuleAllowsId()` - Check if page passes rule constraints
|
|
74
78
|
|
|
75
|
-
4. **`
|
|
79
|
+
4. **`rule-regex.ts`** - SplitRule → compiled regex builder
|
|
80
|
+
- `buildRuleRegex()` - Compiles rule patterns (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, `template`, `regex`)
|
|
81
|
+
- `processPattern()` - Token expansion + auto-escaping + optional fuzzy application
|
|
82
|
+
- `extractNamedCaptureNames()` - Extract `(?<name>...)` groups from raw regex patterns (NEW)
|
|
83
|
+
|
|
84
|
+
5. **`breakpoint-processor.ts`** - Breakpoint post-processing engine
|
|
85
|
+
- `applyBreakpoints()` - Splits oversized structural segments using breakpoint patterns + windowing
|
|
86
|
+
- Applies `pageJoiner` normalization to breakpoint-created segments
|
|
87
|
+
|
|
88
|
+
6. **`breakpoint-utils.ts`** - Breakpoint processing utilities (NEW)
|
|
76
89
|
- `normalizeBreakpoint()` - Convert string to BreakpointRule object
|
|
77
90
|
- `isPageExcluded()` - Check if page is in exclude list
|
|
78
91
|
- `isInBreakpointRange()` - Validate page against min/max/exclude constraints
|
|
@@ -80,20 +93,23 @@ docs/
|
|
|
80
93
|
- `createSegment()` - Create segment with optional to/meta fields
|
|
81
94
|
- `expandBreakpoints()` - Expand patterns with pre-compiled regexes
|
|
82
95
|
- `findActualEndPage()` - Search backwards for ending page by content
|
|
96
|
+
- `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
|
|
97
|
+
- `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
|
|
83
98
|
- `findBreakPosition()` - Find break position using breakpoint patterns
|
|
84
99
|
- `hasExcludedPageInRange()` - Check if range contains excluded pages
|
|
85
100
|
- `findNextPagePosition()` - Find next page content position
|
|
86
101
|
- `findPatternBreakPosition()` - Find pattern match by preference
|
|
87
102
|
|
|
88
|
-
|
|
103
|
+
7. **`types.ts`** - Type definitions
|
|
89
104
|
- `Logger` interface - Optional logging for debugging
|
|
90
105
|
- `SegmentationOptions` - Options with `logger` property
|
|
106
|
+
- `pageJoiner` - Controls how page boundaries are represented in output (`space` default)
|
|
91
107
|
- Verbosity levels: `trace`, `debug`, `info`, `warn`, `error`
|
|
92
108
|
|
|
93
|
-
|
|
109
|
+
8. **`fuzzy.ts`** - Arabic text normalization
|
|
94
110
|
- `makeDiacriticInsensitive()` - Generate regex that ignores diacritics
|
|
95
111
|
|
|
96
|
-
|
|
112
|
+
9. **`pattern-detection.ts`** - Token auto-detection (NEW)
|
|
97
113
|
- `detectTokenPatterns()` - Detect tokens in text with positions
|
|
98
114
|
- `generateTemplateFromText()` - Convert text to template string
|
|
99
115
|
- `suggestPatternConfig()` - Suggest rule configuration
|
|
@@ -163,6 +179,22 @@ export const escapeTemplateBrackets = (pattern: string): string => {
|
|
|
163
179
|
- Direct `template` processing in `buildRuleRegex()`
|
|
164
180
|
- **NOT** applied to `regex` patterns (user has full control)
|
|
165
181
|
|
|
182
|
+
### Named Captures in Raw Regex Patterns (NEW)
|
|
183
|
+
|
|
184
|
+
Raw `regex` patterns now support named capture groups for metadata extraction:
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
// Named groups like (?<num>...) are automatically detected and extracted
|
|
188
|
+
{ regex: '^(?<num>[٠-٩]+)\\s+[أ-ي\\s]+:\\s*(.+)' }
|
|
189
|
+
// meta.num = matched number
|
|
190
|
+
// content = the (.+) anonymous capture group
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**How it works:**
|
|
194
|
+
1. `extractNamedCaptureNames()` parses `(?<name>...)` from regex string
|
|
195
|
+
2. Named captures go to `segment.meta`
|
|
196
|
+
3. Anonymous `(...)` captures can still be used for content extraction
|
|
197
|
+
|
|
166
198
|
### Breakpoints Post-Processing Algorithm
|
|
167
199
|
|
|
168
200
|
The `breakpoints` option provides a post-processing mechanism for limiting segment size. Unlike the deprecated `maxSpan` (which was per-rule), breakpoint processing runs AFTER all structural rules have been applied.
|
package/README.md
CHANGED
|
@@ -90,6 +90,7 @@ Replace regex with readable tokens:
|
|
|
90
90
|
| `{{raqm}}` | Single Arabic digit | `[\\u0660-\\u0669]` |
|
|
91
91
|
| `{{dash}}` | Dash variants | `[-–—ـ]` |
|
|
92
92
|
| `{{harf}}` | Arabic letter | `[أ-ي]` |
|
|
93
|
+
| `{{harfs}}` | Arabic letters with spaces | `[أ-ي](?:[أ-ي\s]*[أ-ي])?` |
|
|
93
94
|
| `{{numbered}}` | Hadith numbering `٢٢ - ` | `{{raqms}} {{dash}} ` |
|
|
94
95
|
| `{{fasl}}` | Section markers | `فصل\|مسألة` |
|
|
95
96
|
| `{{tarqim}}` | Punctuation marks | `[.!?؟؛]` |
|
|
@@ -165,6 +166,10 @@ For full regex control (character classes, capturing groups), use the `regex` pa
|
|
|
165
166
|
|
|
166
167
|
// Capturing group (test|text) matches either
|
|
167
168
|
{ regex: '^(test|text) ', split: 'at' }
|
|
169
|
+
|
|
170
|
+
// Named capture groups extract metadata from raw regex too!
|
|
171
|
+
{ regex: '^(?<num>[٠-٩]+)\\s+[أ-ي\\s]+:\\s*(.+)' }
|
|
172
|
+
// meta.num = matched number, content = captured (.+) group
|
|
168
173
|
```
|
|
169
174
|
|
|
170
175
|
### 6. Page Constraints
|
|
@@ -289,6 +294,23 @@ const segments = segmentPages(pages, {
|
|
|
289
294
|
// meta: { num: '٥' } // harf not captured (no :name suffix)
|
|
290
295
|
```
|
|
291
296
|
|
|
297
|
+
### Narrator Abbreviation Codes
|
|
298
|
+
|
|
299
|
+
Use `{{harfs}}` for matching Arabic letter abbreviations with spaces (common in narrator biography books):
|
|
300
|
+
|
|
301
|
+
```typescript
|
|
302
|
+
const segments = segmentPages(pages, {
|
|
303
|
+
rules: [{
|
|
304
|
+
lineStartsAfter: ['{{raqms:num}} {{harfs}}:'],
|
|
305
|
+
split: 'at'
|
|
306
|
+
}]
|
|
307
|
+
});
|
|
308
|
+
|
|
309
|
+
// Matches: ١١١٨ د ت سي ق: حجاج بن دينار
|
|
310
|
+
// meta: { num: '١١١٨' }
|
|
311
|
+
// content: 'حجاج بن دينار' (abbreviations stripped)
|
|
312
|
+
```
|
|
313
|
+
|
|
292
314
|
### Sentence-Based Splitting (Last Period Per Page)
|
|
293
315
|
|
|
294
316
|
```typescript
|
|
@@ -355,7 +377,11 @@ const pages: Page[] = [
|
|
|
355
377
|
const options: SegmentationOptions = {
|
|
356
378
|
rules: [
|
|
357
379
|
{ lineStartsWith: ['## '], split: 'at' }
|
|
358
|
-
]
|
|
380
|
+
],
|
|
381
|
+
// How to join content across page boundaries in OUTPUT segments:
|
|
382
|
+
// - 'space' (default): page boundaries become spaces
|
|
383
|
+
// - 'newline': preserve page boundaries as newlines
|
|
384
|
+
pageJoiner: 'space',
|
|
359
385
|
};
|
|
360
386
|
|
|
361
387
|
const segments: Segment[] = segmentPages(pages, options);
|
|
@@ -588,7 +614,7 @@ console.log(`Found ${segments.length} segments`);
|
|
|
588
614
|
# Install dependencies
|
|
589
615
|
bun install
|
|
590
616
|
|
|
591
|
-
# Run tests
|
|
617
|
+
# Run tests
|
|
592
618
|
bun test
|
|
593
619
|
|
|
594
620
|
# Build
|
|
@@ -621,7 +647,12 @@ Fuzzy transforms are applied to raw Arabic text *before* wrapping in regex group
|
|
|
621
647
|
|
|
622
648
|
### Extracted Utilities
|
|
623
649
|
|
|
624
|
-
Complex logic
|
|
650
|
+
Complex logic is intentionally split into small, independently testable modules:
|
|
651
|
+
|
|
652
|
+
- `src/segmentation/match-utils.ts`: match filtering + capture extraction
|
|
653
|
+
- `src/segmentation/rule-regex.ts`: SplitRule → compiled regex builder (`buildRuleRegex`, `processPattern`)
|
|
654
|
+
- `src/segmentation/breakpoint-utils.ts`: breakpoint windowing/exclusion helpers + page boundary join normalization
|
|
655
|
+
- `src/segmentation/breakpoint-processor.ts`: breakpoint post-processing engine (applies breakpoints after structural segmentation)
|
|
625
656
|
|
|
626
657
|
## Performance Notes
|
|
627
658
|
|
package/dist/index.d.mts
CHANGED
|
@@ -630,6 +630,19 @@ type SegmentationOptions = {
|
|
|
630
630
|
* @default 'longer'
|
|
631
631
|
*/
|
|
632
632
|
prefer?: 'longer' | 'shorter';
|
|
633
|
+
/**
|
|
634
|
+
* How to join content across page boundaries in OUTPUT segments.
|
|
635
|
+
*
|
|
636
|
+
* Internally, pages are still concatenated with `\n` for matching (multiline regex),
|
|
637
|
+
* but when a segment spans multiple pages, the inserted page-boundary separator is
|
|
638
|
+
* normalized for output.
|
|
639
|
+
*
|
|
640
|
+
* - `'space'`: Join pages with a single space (default)
|
|
641
|
+
* - `'newline'`: Preserve page boundary as a newline
|
|
642
|
+
*
|
|
643
|
+
* @default 'space'
|
|
644
|
+
*/
|
|
645
|
+
pageJoiner?: 'space' | 'newline';
|
|
633
646
|
/**
|
|
634
647
|
* Optional logger for debugging segmentation.
|
|
635
648
|
*
|
|
@@ -707,6 +720,21 @@ type Segment = {
|
|
|
707
720
|
};
|
|
708
721
|
//#endregion
|
|
709
722
|
//#region src/segmentation/segmenter.d.ts
|
|
723
|
+
|
|
724
|
+
/**
|
|
725
|
+
* Applies breakpoints to oversized segments.
|
|
726
|
+
*
|
|
727
|
+
* For each segment that spans more than maxPages, tries the breakpoint patterns
|
|
728
|
+
* in order to find a suitable split point. Structural markers (from rules) are
|
|
729
|
+
* always respected - segments are only broken within their boundaries.
|
|
730
|
+
*
|
|
731
|
+
* @param segments - Initial segments from rule processing
|
|
732
|
+
* @param pages - Original pages for page lookup
|
|
733
|
+
* @param maxPages - Maximum pages before breakpoints apply
|
|
734
|
+
* @param breakpoints - Patterns to try in order (tokens supported)
|
|
735
|
+
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
736
|
+
* @returns Processed segments with oversized ones broken up
|
|
737
|
+
*/
|
|
710
738
|
/**
|
|
711
739
|
* Segments pages of content based on pattern-matching rules.
|
|
712
740
|
*
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;ACrjBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA,KA9VK,YAAA,GA8VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AA6NA,KHvOK,eAAA,GG2OJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAmHA;AAuBA;AAqBA;AAgBA;;;;ACrjBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAwCC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;;;;AEhtBX;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;ACrjBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cHgNa,sBAAuB,iBAAiB,wBAAsB;;;;AFxU3E;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8BM;A
AiCC;AAwBH;;;;;AAoBlB,cEjJO,oBFiJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;cA7fa;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cDuGa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,2FAA0F;;;;;;;;;;;;;;;;;;;;cAmH1F;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;;AJ/fb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cIlmBN,mBJkmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIlmBgC,eJkmBhC,EAAA;AAiBnB;;;;AC/SA;;;;;;;;ACpYA;AAca,cE8GA,wBF7GyD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE6GL,eF7GK,EAAA,EAAA,GAAA,MAAA;;;;AC+CtE;AA6NA;AA2CA;AAWY,cC9LC,oBD8LW,EAAA,CAAA,QAAA,EC7LV,eD6LU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBA8FZ,GAAA,iBA9FsG;EAmH1F,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;ACrjBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
|