flappa-doormal 2.2.3 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +17 -0
- package/README.md +22 -0
- package/dist/index.d.mts +1 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +425 -219
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/AGENTS.md
CHANGED
|
@@ -79,6 +79,7 @@ docs/
|
|
|
79
79
|
4. **`rule-regex.ts`** - SplitRule → compiled regex builder
|
|
80
80
|
- `buildRuleRegex()` - Compiles rule patterns (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`, `template`, `regex`)
|
|
81
81
|
- `processPattern()` - Token expansion + auto-escaping + optional fuzzy application
|
|
82
|
+
- `extractNamedCaptureNames()` - Extracts `(?<name>...)` group names from raw regex patterns (NEW)
|
|
82
83
|
|
|
83
84
|
5. **`breakpoint-processor.ts`** - Breakpoint post-processing engine
|
|
84
85
|
- `applyBreakpoints()` - Splits oversized structural segments using breakpoint patterns + windowing
|
|
@@ -178,6 +179,22 @@ export const escapeTemplateBrackets = (pattern: string): string => {
|
|
|
178
179
|
- Direct `template` processing in `buildRuleRegex()`
|
|
179
180
|
- **NOT** applied to `regex` patterns (user has full control)
|
|
180
181
|
|
|
182
|
+
### Named Captures in Raw Regex Patterns (NEW)
|
|
183
|
+
|
|
184
|
+
Raw `regex` patterns now support named capture groups for metadata extraction:
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
// Named groups like (?<num>...) are automatically detected and extracted
|
|
188
|
+
{ regex: '^(?<num>[٠-٩]+)\\s+[أ-ي\\s]+:\\s*(.+)' }
|
|
189
|
+
// meta.num = matched number
|
|
190
|
+
// content = the (.+) anonymous capture group
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**How it works:**
|
|
194
|
+
1. `extractNamedCaptureNames()` parses the `(?<name>...)` group names from the regex string
|
|
195
|
+
2. Named captures go to `segment.meta`
|
|
196
|
+
3. Anonymous `(...)` captures can still be used for content extraction
|
|
197
|
+
|
|
181
198
|
### Breakpoints Post-Processing Algorithm
|
|
182
199
|
|
|
183
200
|
The `breakpoints` option provides a post-processing mechanism for limiting segment size. Unlike the deprecated `maxSpan` (which was per-rule), breakpoints run AFTER all structural rules have been applied.
|
package/README.md
CHANGED
|
@@ -90,6 +90,7 @@ Replace regex with readable tokens:
|
|
|
90
90
|
| `{{raqm}}` | Single Arabic digit | `[\\u0660-\\u0669]` |
|
|
91
91
|
| `{{dash}}` | Dash variants | `[-–—ـ]` |
|
|
92
92
|
| `{{harf}}` | Arabic letter | `[أ-ي]` |
|
|
93
|
+
| `{{harfs}}` | Arabic letters with spaces | `[أ-ي](?:[أ-ي\s]*[أ-ي])?` |
|
|
93
94
|
| `{{numbered}}` | Hadith numbering `٢٢ - ` | `{{raqms}} {{dash}} ` |
|
|
94
95
|
| `{{fasl}}` | Section markers | `فصل\|مسألة` |
|
|
95
96
|
| `{{tarqim}}` | Punctuation marks | `[.!?؟؛]` |
|
|
@@ -165,6 +166,10 @@ For full regex control (character classes, capturing groups), use the `regex` pa
|
|
|
165
166
|
|
|
166
167
|
// Capturing group (test|text) matches either
|
|
167
168
|
{ regex: '^(test|text) ', split: 'at' }
|
|
169
|
+
|
|
170
|
+
// Named capture groups extract metadata from raw regex too!
|
|
171
|
+
{ regex: '^(?<num>[٠-٩]+)\\s+[أ-ي\\s]+:\\s*(.+)' }
|
|
172
|
+
// meta.num = matched number, content = captured (.+) group
|
|
168
173
|
```
|
|
169
174
|
|
|
170
175
|
### 6. Page Constraints
|
|
@@ -289,6 +294,23 @@ const segments = segmentPages(pages, {
|
|
|
289
294
|
// meta: { num: '٥' } // harf not captured (no :name suffix)
|
|
290
295
|
```
|
|
291
296
|
|
|
297
|
+
### Narrator Abbreviation Codes
|
|
298
|
+
|
|
299
|
+
Use `{{harfs}}` to match runs of Arabic-letter abbreviation codes separated by spaces (common in narrator biography books):
|
|
300
|
+
|
|
301
|
+
```typescript
|
|
302
|
+
const segments = segmentPages(pages, {
|
|
303
|
+
rules: [{
|
|
304
|
+
lineStartsAfter: ['{{raqms:num}} {{harfs}}:'],
|
|
305
|
+
split: 'at'
|
|
306
|
+
}]
|
|
307
|
+
});
|
|
308
|
+
|
|
309
|
+
// Matches: ١١١٨ د ت سي ق: حجاج بن دينار
|
|
310
|
+
// meta: { num: '١١١٨' }
|
|
311
|
+
// content: 'حجاج بن دينار' (abbreviations stripped)
|
|
312
|
+
```
|
|
313
|
+
|
|
292
314
|
### Sentence-Based Splitting (Last Period Per Page)
|
|
293
315
|
|
|
294
316
|
```typescript
|
package/dist/index.d.mts
CHANGED
|
@@ -967,7 +967,7 @@ type ExpandResult = {
|
|
|
967
967
|
* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
|
|
968
968
|
* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
|
|
969
969
|
*/
|
|
970
|
-
declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
|
|
970
|
+
declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string, capturePrefix?: string) => ExpandResult;
|
|
971
971
|
/**
|
|
972
972
|
* Expands template tokens in a query string to their regex equivalents.
|
|
973
973
|
*
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EY,cDzaC,WCyaqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA,KA9VK,YAAA,GA8VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA,KHvOK,eAAA,GG2OJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAwCC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;;;;AEhtBX;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;;AC3jBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cHgaa,sBAAuB,iBAAiB,wBAAsB;;;;AFxhB3E;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8B
M;AAiCC;AAwBH;;;;;AAoBlB,cEjJO,oBFiJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;AC/FA;;;;;;;;ACplBA;AAcA;;;;ACgDA;AA6NA;AA2CA;AAWA;AA2DA;AAyHA;AAuBA;AAqBA;AAgBA;;;cAngBa;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cDuGa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,mHAIV;;;;;;;;;;;;;;;;;;;;cAqHU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;;AJrgBb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AA0GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cIlmBN,mBJkmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIlmBgC,eJkmBhC,EAAA;AAiBnB;;;;AC/FA;;;;;;;;ACplBA;AAca,cE8GA,wBF7GyD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE6GL,eF7GK,EAAA,EAAA,GAAA,MAAA;;;;AC+CtE;AA6NA;AA2CA;AAWY,cC9LC,oBD8LW,EAAA,CAAA,QAAA,EC7LV,eD6LU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBAoGZ,GAAA,iBAhGE;EAqHU,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;AC3jBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
|