flappa-doormal 2.2.2 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +21 -6
- package/README.md +12 -3
- package/dist/index.d.mts +28 -0
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +443 -331
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/AGENTS.md
CHANGED
|
@@ -30,8 +30,10 @@ src/
|
|
|
30
30
|
├── pattern-detection.test.ts # Pattern detection tests (22 tests)
|
|
31
31
|
└── segmentation/
|
|
32
32
|
├── types.ts # TypeScript type definitions for rules/segments
|
|
33
|
-
├── segmenter.ts # Core segmentation engine (segmentPages
|
|
34
|
-
├── breakpoint-
|
|
33
|
+
├── segmenter.ts # Core segmentation engine (segmentPages)
|
|
34
|
+
├── breakpoint-processor.ts # Breakpoint post-processing engine (applyBreakpoints)
|
|
35
|
+
├── breakpoint-utils.ts # Breakpoint processing utilities (windowing, excludes, page joins)
|
|
36
|
+
├── rule-regex.ts # SplitRule -> compiled regex builder (buildRuleRegex, processPattern)
|
|
35
37
|
├── tokens.ts # Token definitions and expansion logic
|
|
36
38
|
├── fuzzy.ts # Diacritic-insensitive matching utilities
|
|
37
39
|
├── html.ts # HTML utilities (stripHtmlTags)
|
|
@@ -40,6 +42,8 @@ src/
|
|
|
40
42
|
├── segmenter.test.ts # Core test suite (150+ tests including breakpoints)
|
|
41
43
|
├── segmenter.bukhari.test.ts # Real-world test cases
|
|
42
44
|
├── breakpoint-utils.test.ts # Breakpoint utility tests (42 tests)
|
|
45
|
+
├── rule-regex.test.ts # Rule regex builder tests
|
|
46
|
+
├── segmenter-utils.test.ts # Segmenter helper tests
|
|
43
47
|
├── tokens.test.ts # Token expansion tests
|
|
44
48
|
├── fuzzy.test.ts # Fuzzy matching tests
|
|
45
49
|
├── textUtils.test.ts # Text utility tests
|
|
@@ -72,7 +76,15 @@ docs/
|
|
|
72
76
|
- `filterByConstraints()` - Apply min/max page filters
|
|
73
77
|
- `anyRuleAllowsId()` - Check if page passes rule constraints
|
|
74
78
|
|
|
75
|
-
4. **`
|
|
79
|
+
4. **`rule-regex.ts`** - SplitRule → compiled regex builder
|
|
80
|
+
- `buildRuleRegex()` - Compiles rule patterns (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, `template`, `regex`)
|
|
81
|
+
- `processPattern()` - Token expansion + auto-escaping + optional fuzzy application
|
|
82
|
+
|
|
83
|
+
5. **`breakpoint-processor.ts`** - Breakpoint post-processing engine
|
|
84
|
+
- `applyBreakpoints()` - Splits oversized structural segments using breakpoint patterns + windowing
|
|
85
|
+
- Applies `pageJoiner` normalization to breakpoint-created segments
|
|
86
|
+
|
|
87
|
+
6. **`breakpoint-utils.ts`** - Breakpoint processing utilities (NEW)
|
|
76
88
|
- `normalizeBreakpoint()` - Convert string to BreakpointRule object
|
|
77
89
|
- `isPageExcluded()` - Check if page is in exclude list
|
|
78
90
|
- `isInBreakpointRange()` - Validate page against min/max/exclude constraints
|
|
@@ -80,20 +92,23 @@ docs/
|
|
|
80
92
|
- `createSegment()` - Create segment with optional to/meta fields
|
|
81
93
|
- `expandBreakpoints()` - Expand patterns with pre-compiled regexes
|
|
82
94
|
- `findActualEndPage()` - Search backwards for ending page by content
|
|
95
|
+
- `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
|
|
96
|
+
- `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
|
|
83
97
|
- `findBreakPosition()` - Find break position using breakpoint patterns
|
|
84
98
|
- `hasExcludedPageInRange()` - Check if range contains excluded pages
|
|
85
99
|
- `findNextPagePosition()` - Find next page content position
|
|
86
100
|
- `findPatternBreakPosition()` - Find pattern match by preference
|
|
87
101
|
|
|
88
|
-
|
|
102
|
+
7. **`types.ts`** - Type definitions
|
|
89
103
|
- `Logger` interface - Optional logging for debugging
|
|
90
104
|
- `SegmentationOptions` - Options with `logger` property
|
|
105
|
+
- `pageJoiner` - Controls how page boundaries are represented in output (`space` default)
|
|
91
106
|
- Verbosity levels: `trace`, `debug`, `info`, `warn`, `error`
|
|
92
107
|
|
|
93
|
-
|
|
108
|
+
8. **`fuzzy.ts`** - Arabic text normalization
|
|
94
109
|
- `makeDiacriticInsensitive()` - Generate regex that ignores diacritics
|
|
95
110
|
|
|
96
|
-
|
|
111
|
+
9. **`pattern-detection.ts`** - Token auto-detection (NEW)
|
|
97
112
|
- `detectTokenPatterns()` - Detect tokens in text with positions
|
|
98
113
|
- `generateTemplateFromText()` - Convert text to template string
|
|
99
114
|
- `suggestPatternConfig()` - Suggest rule configuration
|
package/README.md
CHANGED
|
@@ -355,7 +355,11 @@ const pages: Page[] = [
|
|
|
355
355
|
const options: SegmentationOptions = {
|
|
356
356
|
rules: [
|
|
357
357
|
{ lineStartsWith: ['## '], split: 'at' }
|
|
358
|
-
]
|
|
358
|
+
],
|
|
359
|
+
// How to join content across page boundaries in OUTPUT segments:
|
|
360
|
+
// - 'space' (default): page boundaries become spaces
|
|
361
|
+
// - 'newline': preserve page boundaries as newlines
|
|
362
|
+
pageJoiner: 'space',
|
|
359
363
|
};
|
|
360
364
|
|
|
361
365
|
const segments: Segment[] = segmentPages(pages, options);
|
|
@@ -588,7 +592,7 @@ console.log(`Found ${segments.length} segments`);
|
|
|
588
592
|
# Install dependencies
|
|
589
593
|
bun install
|
|
590
594
|
|
|
591
|
-
# Run tests
|
|
595
|
+
# Run tests
|
|
592
596
|
bun test
|
|
593
597
|
|
|
594
598
|
# Build
|
|
@@ -621,7 +625,12 @@ Fuzzy transforms are applied to raw Arabic text *before* wrapping in regex group
|
|
|
621
625
|
|
|
622
626
|
### Extracted Utilities
|
|
623
627
|
|
|
624
|
-
Complex logic
|
|
628
|
+
Complex logic is intentionally split into small, independently testable modules:
|
|
629
|
+
|
|
630
|
+
- `src/segmentation/match-utils.ts`: match filtering + capture extraction
|
|
631
|
+
- `src/segmentation/rule-regex.ts`: SplitRule → compiled regex builder (`buildRuleRegex`, `processPattern`)
|
|
632
|
+
- `src/segmentation/breakpoint-utils.ts`: breakpoint windowing/exclusion helpers + page boundary join normalization
|
|
633
|
+
- `src/segmentation/breakpoint-processor.ts`: breakpoint post-processing engine (applies breakpoints after structural segmentation)
|
|
625
634
|
|
|
626
635
|
## Performance Notes
|
|
627
636
|
|
package/dist/index.d.mts
CHANGED
|
@@ -630,6 +630,19 @@ type SegmentationOptions = {
|
|
|
630
630
|
* @default 'longer'
|
|
631
631
|
*/
|
|
632
632
|
prefer?: 'longer' | 'shorter';
|
|
633
|
+
/**
|
|
634
|
+
* How to join content across page boundaries in OUTPUT segments.
|
|
635
|
+
*
|
|
636
|
+
* Internally, pages are still concatenated with `\\n` for matching (multiline regex),
|
|
637
|
+
* but when a segment spans multiple pages, the inserted page-boundary separator is
|
|
638
|
+
* normalized for output.
|
|
639
|
+
*
|
|
640
|
+
* - `'space'`: Join pages with a single space (default)
|
|
641
|
+
* - `'newline'`: Preserve page boundary as a newline
|
|
642
|
+
*
|
|
643
|
+
* @default 'space'
|
|
644
|
+
*/
|
|
645
|
+
pageJoiner?: 'space' | 'newline';
|
|
633
646
|
/**
|
|
634
647
|
* Optional logger for debugging segmentation.
|
|
635
648
|
*
|
|
@@ -707,6 +720,21 @@ type Segment = {
|
|
|
707
720
|
};
|
|
708
721
|
//#endregion
|
|
709
722
|
//#region src/segmentation/segmenter.d.ts
|
|
723
|
+
|
|
724
|
+
/**
|
|
725
|
+
* Applies breakpoints to oversized segments.
|
|
726
|
+
*
|
|
727
|
+
* For each segment that spans more than maxPages, tries the breakpoint patterns
|
|
728
|
+
* in order to find a suitable split point. Structural markers (from rules) are
|
|
729
|
+
* always respected - segments are only broken within their boundaries.
|
|
730
|
+
*
|
|
731
|
+
* @param segments - Initial segments from rule processing
|
|
732
|
+
* @param pages - Original pages for page lookup
|
|
733
|
+
* @param maxPages - Maximum pages before breakpoints apply
|
|
734
|
+
* @param breakpoints - Patterns to try in order (tokens supported)
|
|
735
|
+
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
736
|
+
* @returns Processed segments with oversized ones broken up
|
|
737
|
+
*/
|
|
710
738
|
/**
|
|
711
739
|
* Segments pages of content based on pattern-matching rules.
|
|
712
740
|
*
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA,KA9VK,YAAA,GA8VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA,KH5NK,eAAA,GGgOJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAwCC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;;;;AEhtBX;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;;AC1iBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cHgNa,sBAAuB,iBAAiB,wBAAsB;;;;AFxU3E;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8BM;AAiCC;AAwBH;;;;;AAoBlB,cEjJO,oBFiJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;AC/SA;;;;;;;;ACpYA;AAcA;;;;ACgDA;AAkNA;AA2CA;AAWA;AA2DA;AAmHA;AAuBA;AAqBA;AAgBA;;;cAlfa;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cD4Fa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,2FAA0F;;;;;;;;;;;;;;;;;;;;cAmH1F;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;;AJpfb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cIlmBN,mBJkmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIlmBgC,eJkmBhC,EAAA;AAiBnB;;;;AC/SA;;;;;;;;ACpYA;AAca,cE8GA,wBF7GyD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE6GL,eF7GK,EAAA,EAAA,GAAA,MAAA;;;;AC+CtE;AAkNA;AA2CA;AAWY,cCnLC,oBDmLW,EAAA,CAAA,QAAA,EClLV,eDkLU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBA8FZ,GAAA,iBA9FsG;EAmH1F,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;AC1iBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
|