flappa-doormal 2.2.1 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +23 -8
- package/README.md +13 -4
- package/dist/index.d.mts +32 -5
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +470 -338
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/AGENTS.md
CHANGED
|
@@ -30,8 +30,10 @@ src/
|
|
|
30
30
|
├── pattern-detection.test.ts # Pattern detection tests (22 tests)
|
|
31
31
|
└── segmentation/
|
|
32
32
|
├── types.ts # TypeScript type definitions for rules/segments
|
|
33
|
-
├── segmenter.ts # Core segmentation engine (segmentPages
|
|
34
|
-
├── breakpoint-
|
|
33
|
+
├── segmenter.ts # Core segmentation engine (segmentPages)
|
|
34
|
+
├── breakpoint-processor.ts # Breakpoint post-processing engine (applyBreakpoints)
|
|
35
|
+
├── breakpoint-utils.ts # Breakpoint processing utilities (windowing, excludes, page joins)
|
|
36
|
+
├── rule-regex.ts # SplitRule -> compiled regex builder (buildRuleRegex, processPattern)
|
|
35
37
|
├── tokens.ts # Token definitions and expansion logic
|
|
36
38
|
├── fuzzy.ts # Diacritic-insensitive matching utilities
|
|
37
39
|
├── html.ts # HTML utilities (stripHtmlTags)
|
|
@@ -40,6 +42,8 @@ src/
|
|
|
40
42
|
├── segmenter.test.ts # Core test suite (150+ tests including breakpoints)
|
|
41
43
|
├── segmenter.bukhari.test.ts # Real-world test cases
|
|
42
44
|
├── breakpoint-utils.test.ts # Breakpoint utility tests (42 tests)
|
|
45
|
+
├── rule-regex.test.ts # Rule regex builder tests
|
|
46
|
+
├── segmenter-utils.test.ts # Segmenter helper tests
|
|
43
47
|
├── tokens.test.ts # Token expansion tests
|
|
44
48
|
├── fuzzy.test.ts # Fuzzy matching tests
|
|
45
49
|
├── textUtils.test.ts # Text utility tests
|
|
@@ -72,7 +76,15 @@ docs/
|
|
|
72
76
|
- `filterByConstraints()` - Apply min/max page filters
|
|
73
77
|
- `anyRuleAllowsId()` - Check if page passes rule constraints
|
|
74
78
|
|
|
75
|
-
4. **`
|
|
79
|
+
4. **`rule-regex.ts`** - SplitRule → compiled regex builder
|
|
80
|
+
- `buildRuleRegex()` - Compiles rule patterns (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, `template`, `regex`)
|
|
81
|
+
- `processPattern()` - Token expansion + auto-escaping + optional fuzzy application
|
|
82
|
+
|
|
83
|
+
5. **`breakpoint-processor.ts`** - Breakpoint post-processing engine
|
|
84
|
+
- `applyBreakpoints()` - Splits oversized structural segments using breakpoint patterns + windowing
|
|
85
|
+
- Applies `pageJoiner` normalization to breakpoint-created segments
|
|
86
|
+
|
|
87
|
+
6. **`breakpoint-utils.ts`** - Breakpoint processing utilities (NEW)
|
|
76
88
|
- `normalizeBreakpoint()` - Convert string to BreakpointRule object
|
|
77
89
|
- `isPageExcluded()` - Check if page is in exclude list
|
|
78
90
|
- `isInBreakpointRange()` - Validate page against min/max/exclude constraints
|
|
@@ -80,20 +92,23 @@ docs/
|
|
|
80
92
|
- `createSegment()` - Create segment with optional to/meta fields
|
|
81
93
|
- `expandBreakpoints()` - Expand patterns with pre-compiled regexes
|
|
82
94
|
- `findActualEndPage()` - Search backwards for ending page by content
|
|
95
|
+
- `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
|
|
96
|
+
- `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
|
|
83
97
|
- `findBreakPosition()` - Find break position using breakpoint patterns
|
|
84
98
|
- `hasExcludedPageInRange()` - Check if range contains excluded pages
|
|
85
99
|
- `findNextPagePosition()` - Find next page content position
|
|
86
100
|
- `findPatternBreakPosition()` - Find pattern match by preference
|
|
87
101
|
|
|
88
|
-
|
|
102
|
+
7. **`types.ts`** - Type definitions
|
|
89
103
|
- `Logger` interface - Optional logging for debugging
|
|
90
104
|
- `SegmentationOptions` - Options with `logger` property
|
|
105
|
+
- `pageJoiner` - Controls how page boundaries are represented in output (`space` default)
|
|
91
106
|
- Verbosity levels: `trace`, `debug`, `info`, `warn`, `error`
|
|
92
107
|
|
|
93
|
-
|
|
108
|
+
8. **`fuzzy.ts`** - Arabic text normalization
|
|
94
109
|
- `makeDiacriticInsensitive()` - Generate regex that ignores diacritics
|
|
95
110
|
|
|
96
|
-
|
|
111
|
+
9. **`pattern-detection.ts`** - Token auto-detection (NEW)
|
|
97
112
|
- `detectTokenPatterns()` - Detect tokens in text with positions
|
|
98
113
|
- `generateTemplateFromText()` - Convert text to template string
|
|
99
114
|
- `suggestPatternConfig()` - Suggest rule configuration
|
|
@@ -187,8 +202,8 @@ interface SegmentationOptions {
|
|
|
187
202
|
```typescript
|
|
188
203
|
segmentPages(pages, {
|
|
189
204
|
rules: [
|
|
190
|
-
{ lineStartsWith: ['{{basmalah}}'], split
|
|
191
|
-
{ lineStartsWith: ['{{bab}}'],
|
|
205
|
+
{ lineStartsWith: ['{{basmalah}}'] }, // split defaults to 'at'
|
|
206
|
+
{ lineStartsWith: ['{{bab}}'], meta: { type: 'chapter' } },
|
|
192
207
|
],
|
|
193
208
|
maxPages: 2,
|
|
194
209
|
breakpoints: ['{{tarqim}}\\s*', '\\n', ''], // Try: punctuation → newline → page boundary
|
package/README.md
CHANGED
|
@@ -355,7 +355,11 @@ const pages: Page[] = [
|
|
|
355
355
|
const options: SegmentationOptions = {
|
|
356
356
|
rules: [
|
|
357
357
|
{ lineStartsWith: ['## '], split: 'at' }
|
|
358
|
-
]
|
|
358
|
+
],
|
|
359
|
+
// How to join content across page boundaries in OUTPUT segments:
|
|
360
|
+
// - 'space' (default): page boundaries become spaces
|
|
361
|
+
// - 'newline': preserve page boundaries as newlines
|
|
362
|
+
pageJoiner: 'space',
|
|
359
363
|
};
|
|
360
364
|
|
|
361
365
|
const segments: Segment[] = segmentPages(pages, options);
|
|
@@ -510,7 +514,7 @@ type SplitRule = {
|
|
|
510
514
|
regex?: string;
|
|
511
515
|
|
|
512
516
|
// Split behavior
|
|
513
|
-
split
|
|
517
|
+
split?: 'at' | 'after'; // Default: 'at'
|
|
514
518
|
occurrence?: 'first' | 'last' | 'all';
|
|
515
519
|
maxSpan?: number;
|
|
516
520
|
fuzzy?: boolean;
|
|
@@ -588,7 +592,7 @@ console.log(`Found ${segments.length} segments`);
|
|
|
588
592
|
# Install dependencies
|
|
589
593
|
bun install
|
|
590
594
|
|
|
591
|
-
# Run tests
|
|
595
|
+
# Run tests
|
|
592
596
|
bun test
|
|
593
597
|
|
|
594
598
|
# Build
|
|
@@ -621,7 +625,12 @@ Fuzzy transforms are applied to raw Arabic text *before* wrapping in regex group
|
|
|
621
625
|
|
|
622
626
|
### Extracted Utilities
|
|
623
627
|
|
|
624
|
-
Complex logic
|
|
628
|
+
Complex logic is intentionally split into small, independently testable modules:
|
|
629
|
+
|
|
630
|
+
- `src/segmentation/match-utils.ts`: match filtering + capture extraction
|
|
631
|
+
- `src/segmentation/rule-regex.ts`: SplitRule → compiled regex builder (`buildRuleRegex`, `processPattern`)
|
|
632
|
+
- `src/segmentation/breakpoint-utils.ts`: breakpoint windowing/exclusion helpers + page boundary join normalization
|
|
633
|
+
- `src/segmentation/breakpoint-processor.ts`: breakpoint post-processing engine (applies breakpoints after structural segmentation)
|
|
625
634
|
|
|
626
635
|
## Performance Notes
|
|
627
636
|
|
package/dist/index.d.mts
CHANGED
|
@@ -229,8 +229,9 @@ type SplitBehavior = {
|
|
|
229
229
|
* Where to split relative to the match.
|
|
230
230
|
* - `'at'`: New segment starts at the match position
|
|
231
231
|
* - `'after'`: New segment starts after the match ends
|
|
232
|
+
* @default 'at'
|
|
232
233
|
*/
|
|
233
|
-
split
|
|
234
|
+
split?: 'at' | 'after';
|
|
234
235
|
/**
|
|
235
236
|
* Which occurrence(s) to split on.
|
|
236
237
|
* - `'all'`: Split at every match (default)
|
|
@@ -366,14 +367,13 @@ type RuleConstraints = {
|
|
|
366
367
|
* Each rule must specify:
|
|
367
368
|
* - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
|
|
368
369
|
* `lineStartsAfter`, or `lineEndsWith`
|
|
369
|
-
* - **Split behavior**: `split` (
|
|
370
|
+
* - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `maxSpan`, `fuzzy`
|
|
370
371
|
* - **Constraints** (optional): `min`, `max`, `meta`
|
|
371
372
|
*
|
|
372
373
|
* @example
|
|
373
|
-
* // Basic rule: split at markdown headers
|
|
374
|
+
* // Basic rule: split at markdown headers (split defaults to 'at')
|
|
374
375
|
* const rule: SplitRule = {
|
|
375
376
|
* lineStartsWith: ['## ', '### '],
|
|
376
|
-
* split: 'at',
|
|
377
377
|
* meta: { type: 'section' }
|
|
378
378
|
* };
|
|
379
379
|
*
|
|
@@ -381,7 +381,6 @@ type RuleConstraints = {
|
|
|
381
381
|
* // Advanced rule: extract hadith numbers with fuzzy Arabic matching
|
|
382
382
|
* const rule: SplitRule = {
|
|
383
383
|
* lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
|
|
384
|
-
* split: 'at',
|
|
385
384
|
* fuzzy: true,
|
|
386
385
|
* min: 5,
|
|
387
386
|
* max: 500,
|
|
@@ -631,6 +630,19 @@ type SegmentationOptions = {
|
|
|
631
630
|
* @default 'longer'
|
|
632
631
|
*/
|
|
633
632
|
prefer?: 'longer' | 'shorter';
|
|
633
|
+
/**
|
|
634
|
+
* How to join content across page boundaries in OUTPUT segments.
|
|
635
|
+
*
|
|
636
|
+
* Internally, pages are still concatenated with `\\n` for matching (multiline regex),
|
|
637
|
+
* but when a segment spans multiple pages, the inserted page-boundary separator is
|
|
638
|
+
* normalized for output.
|
|
639
|
+
*
|
|
640
|
+
* - `'space'`: Join pages with a single space (default)
|
|
641
|
+
* - `'newline'`: Preserve page boundary as a newline
|
|
642
|
+
*
|
|
643
|
+
* @default 'space'
|
|
644
|
+
*/
|
|
645
|
+
pageJoiner?: 'space' | 'newline';
|
|
634
646
|
/**
|
|
635
647
|
* Optional logger for debugging segmentation.
|
|
636
648
|
*
|
|
@@ -708,6 +720,21 @@ type Segment = {
|
|
|
708
720
|
};
|
|
709
721
|
//#endregion
|
|
710
722
|
//#region src/segmentation/segmenter.d.ts
|
|
723
|
+
|
|
724
|
+
/**
|
|
725
|
+
* Applies breakpoints to oversized segments.
|
|
726
|
+
*
|
|
727
|
+
* For each segment that spans more than maxPages, tries the breakpoint patterns
|
|
728
|
+
* in order to find a suitable split point. Structural markers (from rules) are
|
|
729
|
+
* always respected - segments are only broken within their boundaries.
|
|
730
|
+
*
|
|
731
|
+
* @param segments - Initial segments from rule processing
|
|
732
|
+
* @param pages - Original pages for page lookup
|
|
733
|
+
* @param maxPages - Maximum pages before breakpoints apply
|
|
734
|
+
* @param breakpoints - Patterns to try in order (tokens supported)
|
|
735
|
+
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
736
|
+
* @returns Processed segments with oversized ones broken up
|
|
737
|
+
*/
|
|
711
738
|
/**
|
|
712
739
|
* Segments pages of content based on pattern-matching rules.
|
|
713
740
|
*
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA,KA9VK,YAAA,GA8VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA,KH5NK,eAAA,GGgOJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAwCC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;;;;AEhtBX;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cHgNa,sBAAuB,iBAAiB,wBAAsB;;;;AFxU3E;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8BM;AAiCC;AAwBH;;;;;AAoBlB,cEjJO,oBFiJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;cAlfa;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cD4Fa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,2FAA0F;;;;;;;;;;;;;;;;;;;;cAmH1F;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;;AJpfb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cIlmBN,mBJkmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIlmBgC,eJkmBhC,EAAA;AAiBnB;;;;AC/SA;;;;;;;;ACpYA;AAca,cE8GA,wBF7GyD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE6GL,eF7GK,EAAA,EAAA,GAAA,MAAA;;;;AC+CtE;AAkNA;AA2CA;AAWY,cCnLC,oBDmLW,EAAA,CAAA,QAAA,EClLV,eDkLU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBA8FZ,GAAA,iBA9FsG;EAmH1F,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;AC1iBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
|