flappa-doormal 2.5.3 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +10 -0
- package/README.md +16 -0
- package/dist/index.d.mts +51 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +111 -11
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/AGENTS.md
CHANGED
|
@@ -203,6 +203,11 @@ The `breakpoints` option provides a post-processing mechanism for limiting segme
|
|
|
203
203
|
```typescript
|
|
204
204
|
interface SegmentationOptions {
|
|
205
205
|
rules: SplitRule[];
|
|
206
|
+
// Optional preprocessing step: regex replacements applied per-page BEFORE segmentation
|
|
207
|
+
// - default flags: 'gu' (and g+u are always enforced)
|
|
208
|
+
// - pageIds omitted: apply to all pages
|
|
209
|
+
// - pageIds: []: apply to no pages (skip)
|
|
210
|
+
replace?: Array<{ regex: string; replacement: string; flags?: string; pageIds?: number[] }>;
|
|
206
211
|
maxPages?: number; // Maximum pages a segment can span
|
|
207
212
|
breakpoints?: string[]; // Ordered array of regex patterns (supports token expansion)
|
|
208
213
|
prefer?: 'longer' | 'shorter'; // Select last or first match within window
|
|
@@ -420,6 +425,11 @@ Useful options (recent additions):
|
|
|
420
425
|
- **`normalizeArabicDiacritics`**: `true` by default so tokens match diacritized forms (e.g. `وأَخْبَرَنَا` → `{{naql}}`).
|
|
421
426
|
- **`whitespace`**: `'regex'` (default) uses `\\s*` placeholders; `'space'` uses literal spaces in returned signatures.
|
|
422
427
|
|
|
428
|
+
**Note on brackets in returned signatures**:
|
|
429
|
+
- `analyzeCommonLineStarts()` emits **template-like** signatures.
|
|
430
|
+
- It intentionally **does not escape literal `()` / `[]`** (e.g. `(ح)` stays `(ح)`), because template patterns auto-escape `()[]` later.
|
|
431
|
+
- If you reuse a signature inside a raw `regex` rule, you may need to escape literal brackets yourself.
|
|
432
|
+
|
|
423
433
|
Examples:
|
|
424
434
|
|
|
425
435
|
```typescript
|
package/README.md
CHANGED
|
@@ -383,6 +383,12 @@ Key options:
|
|
|
383
383
|
- `'regex'` (default): uses `\\s*` placeholders between tokens
|
|
384
384
|
- `'space'`: uses literal single spaces (`' '`) between tokens (useful if you don't want `\\s` to later match newlines when reusing these patterns)
|
|
385
385
|
|
|
386
|
+
**Note on brackets in returned patterns**:
|
|
387
|
+
- `analyzeCommonLineStarts()` returns **template-like signatures**, not “ready-to-run regex”.
|
|
388
|
+
- It intentionally **does not escape literal `()` / `[]`** in the returned `pattern` (e.g. `(ح)` stays `(ح)`).
|
|
389
|
+
- If you paste these signatures into `lineStartsWith` / `lineStartsAfter` / `template`, that’s fine: those template pattern types **auto-escape `()[]`** outside `{{tokens}}`.
|
|
390
|
+
- If you paste them into a raw `regex` rule, you may need to escape literal brackets yourself.
|
|
391
|
+
|
|
386
392
|
|
|
387
393
|
## Prompting LLMs / Agents to Generate Rules (Shamela books)
|
|
388
394
|
|
|
@@ -630,6 +636,16 @@ const pages: Page[] = [
|
|
|
630
636
|
];
|
|
631
637
|
|
|
632
638
|
const options: SegmentationOptions = {
|
|
639
|
+
// Optional preprocessing step: regex replacements applied per-page BEFORE segmentation.
|
|
640
|
+
// Useful for normalizing OCR/typos/spacing so rules match consistently.
|
|
641
|
+
//
|
|
642
|
+
// Notes:
|
|
643
|
+
// - `flags` defaults to 'gu'. If provided, `g` and `u` are always enforced.
|
|
644
|
+
// - `pageIds: []` means "apply to no pages" (skip that rule).
|
|
645
|
+
// - Remember JSON escaping: to match a literal '.', the regex is \. which is written as "\\." in JSON.
|
|
646
|
+
replace: [
|
|
647
|
+
{ regex: "([\\u0660-\\u0669]+)\\s*[-–—ـ]\\s*", replacement: "$1 - " }
|
|
648
|
+
],
|
|
633
649
|
rules: [
|
|
634
650
|
{ lineStartsWith: ['## '], split: 'at' }
|
|
635
651
|
],
|
package/dist/index.d.mts
CHANGED
|
@@ -558,6 +558,26 @@ interface Logger {
|
|
|
558
558
|
/** Log a warning message (potential issues) */
|
|
559
559
|
warn?: (message: string, ...args: unknown[]) => void;
|
|
560
560
|
}
|
|
561
|
+
/**
|
|
562
|
+
* - Default regex flags: `gu` (global + unicode)
|
|
563
|
+
* - If `flags` is provided, it is validated and merged with required flags:
|
|
564
|
+
* `g` and `u` are always enforced.
|
|
565
|
+
*
|
|
566
|
+
* `pageIds` controls which pages a rule applies to:
|
|
567
|
+
* - `undefined`: apply to all pages
|
|
568
|
+
* - `[]`: apply to no pages (rule is skipped)
|
|
569
|
+
* - `[id1, id2, ...]`: apply only to those pages
|
|
570
|
+
*/
|
|
571
|
+
type Replacement = {
|
|
572
|
+
/** Raw regex source string (no token expansion). Always compiled with the `g` and `u` flags. */
|
|
573
|
+
regex: string;
|
|
574
|
+
/** Replacement string (passed to `String.prototype.replace`). */
|
|
575
|
+
replacement: string;
|
|
576
|
+
/** Optional regex flags; `g` and `u` are always enforced. */
|
|
577
|
+
flags?: string;
|
|
578
|
+
/** Optional list of page IDs to apply this replacement to. Empty array means skip. */
|
|
579
|
+
pageIds?: number[];
|
|
580
|
+
};
|
|
561
581
|
/**
|
|
562
582
|
* Segmentation options controlling how pages are split.
|
|
563
583
|
*
|
|
@@ -591,6 +611,12 @@ interface Logger {
|
|
|
591
611
|
* };
|
|
592
612
|
*/
|
|
593
613
|
type SegmentationOptions = {
|
|
614
|
+
/**
|
|
615
|
+
* Optional pre-processing replacements applied to page content BEFORE segmentation.
|
|
616
|
+
*
|
|
617
|
+
* Replacements are applied per-page (not on concatenated content), in array order.
|
|
618
|
+
*/
|
|
619
|
+
replace?: Replacement[];
|
|
594
620
|
/**
|
|
595
621
|
* Rules applied in order to find split points.
|
|
596
622
|
*
|
|
@@ -799,6 +825,30 @@ type Segment = {
|
|
|
799
825
|
*/
|
|
800
826
|
declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
|
|
801
827
|
//#endregion
|
|
828
|
+
//#region src/segmentation/replace.d.ts
|
|
829
|
+
/**
|
|
830
|
+
* A single replacement rule applied by `applyReplacements()` / `SegmentationOptions.replace`.
|
|
831
|
+
*
|
|
832
|
+
* Notes:
|
|
833
|
+
* - `regex` is a raw JavaScript regex source string (no token expansion).
|
|
834
|
+
* - Default flags are `gu` (global + unicode).
|
|
835
|
+
* - If `flags` is provided, it is validated and `g` + `u` are always enforced.
|
|
836
|
+
* - If `pageIds` is omitted, the rule applies to all pages.
|
|
837
|
+
* - If `pageIds` is `[]`, the rule applies to no pages (rule is skipped).
|
|
838
|
+
*/
|
|
839
|
+
type ReplaceRule = NonNullable<SegmentationOptions['replace']>[number];
|
|
840
|
+
/**
|
|
841
|
+
* Applies ordered regex replacements to page content (per page).
|
|
842
|
+
*
|
|
843
|
+
* - Replacement rules are applied in array order.
|
|
844
|
+
* - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
|
|
845
|
+
* - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
|
|
846
|
+
*
|
|
847
|
+
* This function is intentionally **pure**:
|
|
848
|
+
* it returns a new pages array only when changes are needed, otherwise it returns the original pages.
|
|
849
|
+
*/
|
|
850
|
+
declare const applyReplacements: (pages: Page[], rules?: ReplaceRule[]) => Page[];
|
|
851
|
+
//#endregion
|
|
802
852
|
//#region src/segmentation/tokens.d.ts
|
|
803
853
|
/**
|
|
804
854
|
* Token-based template system for Arabic text pattern matching.
|
|
@@ -1213,5 +1263,5 @@ declare const analyzeTextForRule: (text: string) => {
|
|
|
1213
1263
|
detected: DetectedPattern[];
|
|
1214
1264
|
} | null;
|
|
1215
1265
|
//#endregion
|
|
1216
|
-
export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
|
|
1266
|
+
export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type ReplaceRule, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
|
|
1217
1267
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/replace.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6Ca,cJzeA,wBIyeyE,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJxkBtF;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;KFbK,eAAA,GEaoE;EAAI;;;;ACH7E;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;;;;;ACvVA,KL4EK,qBAAA,GK5EsB;EA+Ed;EAgEA,cAAA,EAAA,MAAA,EAAA;AAuBb,CAAA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;KL1FK,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBZ,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CO,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AAtXX;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAk
EA;AAEA;AA2RA;;;;;;cH4Ha,sBAAuB,iBAAiB,wBAAsB;;;AF7Z3E;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA2FzB;AAAkD;AAgIlD;AAAwB,cEzTX,iBFyTW,EAAA,CAAA,KAAA,EEzTiB,IFyTjB,EAAA,EAAA,KAAA,CAAA,EEzTiC,WFyTjC,EAAA,EAAA,GEzTiD,IFyTjD,EAAA;;;;AD9TxB;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKa,cAzhBA,sBA6hBV,EAAA,CAAA,OAAA,EAAA,MAoBF,EAAA,GAAA,MAAA;AAqBD;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;AAGyB,cD5BZ,+BC4BY,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AC1VzB;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;cF6Ka,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cAuKC,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AJpoBA,KK9DD,wBAAA,GL8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAWC;AA2DD;EAMc,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EASF;;;;AAuHZ;;;;AC5QA;;EAAqD,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAAsB;;;;;ACnd3E;AA2DA;;;;;;;mBEXqB;EDQR;AAsQb;AAsDA;AA2CA;AAWA;AAuKA;EA6Ca,UAAA,CAAA,EAAA,OAAyE,GAAA,OAAA;AAuBtF,CAAA;AAqBa,KChnBD,uBAAA,GDgnBuD;EAgBtD,IAAA,EAAA,MAAA;;;KC9nBD,sBAAA;EApEA,OAAA,EAAA,MAAA;EAkEA,KAAA,EAAA,MAAA;EAEA,QAAA,EAGE,uBAHoB,EAAA;AA2RlC,CAAA;;;;;;;cAAa,iCACF,kBACE,6BACV;;;;ALpSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KK7GO,eAAA,GL6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;AA6DkB,cKvkBL,mBLukBK,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GKvkB8B,eLukB9B,EAAA;;;AAyElB;;;;AC5QA;;;;;;;cIpUa,mDAAoD;AH/IjE;AA2DA;;;;;cG2
Ga,iCACC;;;EF/GD,QAAA,CAAA,EAAA,MAAA;AAsQb,CAAA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cEneA,kBFmesD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAgBtD,QAAA,EAAA,MAAA;;;;EClsBD,QAAA,ECsNE,eDtNF,EAAwB;AAkEpC,CAAA,GAAY,IAAA"}
|
package/dist/index.mjs
CHANGED
|
@@ -974,7 +974,33 @@ const escapeTemplateBrackets = (pattern) => {
|
|
|
974
974
|
return `\\${bracket}`;
|
|
975
975
|
});
|
|
976
976
|
};
|
|
977
|
-
const RUMUZ_ATOM = `(
|
|
977
|
+
const RUMUZ_ATOM = `(?:${[
|
|
978
|
+
"خت",
|
|
979
|
+
"خغ",
|
|
980
|
+
"بخ",
|
|
981
|
+
"عخ",
|
|
982
|
+
"مق",
|
|
983
|
+
"مت",
|
|
984
|
+
"عس",
|
|
985
|
+
"سي",
|
|
986
|
+
"سن",
|
|
987
|
+
"كن",
|
|
988
|
+
"مد",
|
|
989
|
+
"قد",
|
|
990
|
+
"خد",
|
|
991
|
+
"فد",
|
|
992
|
+
"دل",
|
|
993
|
+
"كد",
|
|
994
|
+
"غد",
|
|
995
|
+
"صد",
|
|
996
|
+
"دت",
|
|
997
|
+
"دس",
|
|
998
|
+
"تم",
|
|
999
|
+
"فق",
|
|
1000
|
+
"دق",
|
|
1001
|
+
"[خرزيمنصسدفلتقع]",
|
|
1002
|
+
"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
|
|
1003
|
+
].join("|")})`;
|
|
978
1004
|
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
|
|
979
1005
|
const BASE_TOKENS = {
|
|
980
1006
|
bab: "باب",
|
|
@@ -992,7 +1018,9 @@ const BASE_TOKENS = {
|
|
|
992
1018
|
"سمعت",
|
|
993
1019
|
"أنبأنا",
|
|
994
1020
|
"وحدثنا",
|
|
995
|
-
"أخبرنا"
|
|
1021
|
+
"أخبرنا",
|
|
1022
|
+
"وحدثني",
|
|
1023
|
+
"وحدثنيه"
|
|
996
1024
|
].join("|"),
|
|
997
1025
|
raqm: "[\\u0660-\\u0669]",
|
|
998
1026
|
raqms: "[\\u0660-\\u0669]+",
|
|
@@ -1452,6 +1480,77 @@ const buildRuleRegex = (rule, capturePrefix) => {
|
|
|
1452
1480
|
};
|
|
1453
1481
|
};
|
|
1454
1482
|
|
|
1483
|
+
//#endregion
|
|
1484
|
+
//#region src/segmentation/replace.ts
|
|
1485
|
+
/** Flags used when a replacement rule does not specify any. */
const DEFAULT_REPLACE_FLAGS = "gu";
/**
 * Every flag a replacement rule may use, in the canonical order in which
 * normalized flag strings are emitted. Hoisted so it is built only once.
 */
const REPLACE_FLAG_ORDER = ["g", "i", "m", "s", "y", "u"];
/**
 * Validates a user-supplied flag string and merges it with the required flags.
 *
 * @param flags Optional regex flags (any subset of `gimsyu`, duplicates allowed).
 * @returns A de-duplicated flag string in canonical order that always contains `g` and `u`.
 * @throws {Error} If `flags` contains a character outside `gimsyu`.
 */
const normalizeReplaceFlags = (flags) => {
	if (!flags) return DEFAULT_REPLACE_FLAGS;
	// `g` and `u` are always enforced, whatever the caller passed.
	const requested = new Set(["g", "u"]);
	for (const ch of flags) {
		if (!REPLACE_FLAG_ORDER.includes(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
		requested.add(ch);
	}
	return REPLACE_FLAG_ORDER.filter((c) => requested.has(c)).join("");
};
/**
 * Compiles replacement rules into ready-to-use regular expressions.
 *
 * Rules with `pageIds: []` are dropped up front because they can never apply.
 *
 * @param rules Replacement rules (`regex`, `replacement`, optional `flags` / `pageIds`).
 * @returns One `{ pageIdSet, re, replacement }` entry per rule that can apply;
 *          `pageIdSet` is `undefined` when the rule applies to every page.
 * @throws {Error} If a rule has an invalid flag.
 * @throws {SyntaxError} If a rule's `regex` source cannot be compiled.
 */
const compileReplaceRules = (rules) => {
	const compiled = [];
	for (const r of rules) {
		if (r.pageIds && r.pageIds.length === 0) continue;
		const flags = normalizeReplaceFlags(r.flags);
		const re = new RegExp(r.regex, flags);
		compiled.push({
			pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
			re,
			replacement: r.replacement
		});
	}
	return compiled;
};
/**
 * Applies ordered regex replacements to page content (per page).
 *
 * - Replacement rules are applied in array order.
 * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
 * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
 *
 * This function is intentionally **pure**: the input array and page objects are never mutated.
 * A new pages array is returned only when at least one page's content actually changed;
 * otherwise the original `pages` array is returned. Unchanged pages keep their original
 * object identity inside a returned new array.
 *
 * @param pages Pages to process (each with `id` and `content`).
 * @param rules Optional replacement rules.
 * @returns The original `pages` when nothing changed, otherwise a new array.
 */
const applyReplacements = (pages, rules) => {
	if (!rules || rules.length === 0 || pages.length === 0) return pages;
	const compiled = compileReplaceRules(rules);
	if (compiled.length === 0) return pages;
	// Track whether any page was rewritten so the "return the original pages" contract holds.
	let changed = false;
	const result = pages.map((p) => {
		let content = p.content;
		for (const rule of compiled) {
			if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
			content = content.replace(rule.re, rule.replacement);
		}
		if (content === p.content) return p;
		changed = true;
		return {
			...p,
			content
		};
	});
	return changed ? result : pages;
};
|
|
1553
|
+
|
|
1455
1554
|
//#endregion
|
|
1456
1555
|
//#region src/segmentation/fast-fuzzy-prefix.ts
|
|
1457
1556
|
/**
|
|
@@ -2021,12 +2120,13 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
2021
2120
|
*/
|
|
2022
2121
|
const segmentPages = (pages, options) => {
|
|
2023
2122
|
const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
|
|
2024
|
-
const
|
|
2123
|
+
const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
|
|
2124
|
+
const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
|
|
2025
2125
|
let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
|
|
2026
|
-
segments = ensureFallbackSegment(segments,
|
|
2126
|
+
segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
|
|
2027
2127
|
if (maxPages >= 0 && breakpoints.length) {
|
|
2028
2128
|
const patternProcessor = (p) => processPattern(p, false).pattern;
|
|
2029
|
-
return applyBreakpoints(segments,
|
|
2129
|
+
return applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
|
|
2030
2130
|
}
|
|
2031
2131
|
return segments;
|
|
2032
2132
|
};
|
|
@@ -2122,7 +2222,7 @@ const DEFAULT_OPTIONS = {
|
|
|
2122
2222
|
topK: 40,
|
|
2123
2223
|
whitespace: "regex"
|
|
2124
2224
|
};
|
|
2125
|
-
const
|
|
2225
|
+
/**
 * Escapes regex metacharacters in a literal fragment of a line-start signature.
 *
 * Escaped: `. * + ? ^ $ { } | \`. Parentheses and square brackets are left
 * as-is, so `(ح)` stays `(ح)` in the emitted signature.
 *
 * @param s Literal text taken from the analysed line.
 * @returns `s` with each listed metacharacter prefixed by a backslash.
 */
const escapeSignatureLiteral = (s) => {
	const escaped = s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
	return escaped;
};
|
|
2126
2226
|
const TOKEN_PRIORITY_ORDER$1 = [
|
|
2127
2227
|
"basmalah",
|
|
2128
2228
|
"kitab",
|
|
@@ -2170,7 +2270,7 @@ const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
|
|
|
2170
2270
|
if (currentPos >= s.length) break;
|
|
2171
2271
|
const m = re.exec(s.slice(currentPos));
|
|
2172
2272
|
if (!m || m.index !== 0 || !m[0]) continue;
|
|
2173
|
-
currentOut +=
|
|
2273
|
+
currentOut += escapeSignatureLiteral(m[0]);
|
|
2174
2274
|
currentPos += m[0].length;
|
|
2175
2275
|
matchedAny = true;
|
|
2176
2276
|
const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
|
|
@@ -2239,7 +2339,7 @@ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallba
|
|
|
2239
2339
|
if (matchedAny) {
|
|
2240
2340
|
const ch = s[pos];
|
|
2241
2341
|
if (ch && isCommonDelimiter(ch)) {
|
|
2242
|
-
out +=
|
|
2342
|
+
out += escapeSignatureLiteral(ch);
|
|
2243
2343
|
pos += 1;
|
|
2244
2344
|
continue;
|
|
2245
2345
|
}
|
|
@@ -2248,14 +2348,14 @@ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallba
|
|
|
2248
2348
|
if (includeFirstWordFallback && !matchedToken) {
|
|
2249
2349
|
const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
|
|
2250
2350
|
if (!firstWord$1) break;
|
|
2251
|
-
out +=
|
|
2351
|
+
out += escapeSignatureLiteral(firstWord$1);
|
|
2252
2352
|
}
|
|
2253
2353
|
break;
|
|
2254
2354
|
}
|
|
2255
2355
|
if (!includeFirstWordFallback) return null;
|
|
2256
2356
|
const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
|
|
2257
2357
|
if (!firstWord) return null;
|
|
2258
|
-
out +=
|
|
2358
|
+
out += escapeSignatureLiteral(firstWord);
|
|
2259
2359
|
return out;
|
|
2260
2360
|
}
|
|
2261
2361
|
if (!matchedAny) return null;
|
|
@@ -2492,5 +2592,5 @@ const analyzeTextForRule = (text) => {
|
|
|
2492
2592
|
};
|
|
2493
2593
|
|
|
2494
2594
|
//#endregion
|
|
2495
|
-
export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
|
|
2595
|
+
export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
|
|
2496
2596
|
//# sourceMappingURL=index.mjs.map
|