flappa-doormal 2.5.4 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -203,6 +203,11 @@ The `breakpoints` option provides a post-processing mechanism for limiting segme
203
203
  ```typescript
204
204
  interface SegmentationOptions {
205
205
  rules: SplitRule[];
206
+ // Optional preprocessing step: regex replacements applied per-page BEFORE segmentation
207
+ // - default flags: 'gu' (and g+u are always enforced)
208
+ // - pageIds omitted: apply to all pages
209
+ // - pageIds: []: apply to no pages (skip)
210
+ replace?: Array<{ regex: string; replacement: string; flags?: string; pageIds?: number[] }>;
206
211
  maxPages?: number; // Maximum pages a segment can span
207
212
  breakpoints?: string[]; // Ordered array of regex patterns (supports token expansion)
208
213
  prefer?: 'longer' | 'shorter'; // Select last or first match within window
package/README.md CHANGED
@@ -636,6 +636,16 @@ const pages: Page[] = [
636
636
  ];
637
637
 
638
638
  const options: SegmentationOptions = {
639
+ // Optional preprocessing step: regex replacements applied per-page BEFORE segmentation.
640
+ // Useful for normalizing OCR/typos/spacing so rules match consistently.
641
+ //
642
+ // Notes:
643
+ // - `flags` defaults to 'gu'. If provided, `g` and `u` are always enforced.
644
+ // - `pageIds: []` means "apply to no pages" (skip that rule).
645
+ // - Remember JSON escaping: to match a literal '.', use regex: "\\." in JSON.
646
+ replace: [
647
+ { regex: "([\\u0660-\\u0669]+)\\s*[-–—ـ]\\s*", replacement: "$1 - " }
648
+ ],
639
649
  rules: [
640
650
  { lineStartsWith: ['## '], split: 'at' }
641
651
  ],
package/dist/index.d.mts CHANGED
@@ -558,6 +558,26 @@ interface Logger {
558
558
  /** Log a warning message (potential issues) */
559
559
  warn?: (message: string, ...args: unknown[]) => void;
560
560
  }
561
+ /**
562
+ * - Default regex flags: `gu` (global + unicode)
563
+ * - If `flags` is provided, it is validated and merged with required flags:
564
+ * `g` and `u` are always enforced.
565
+ *
566
+ * `pageIds` controls which pages a rule applies to:
567
+ * - `undefined`: apply to all pages
568
+ * - `[]`: apply to no pages (rule is skipped)
569
+ * - `[id1, id2, ...]`: apply only to those pages
570
+ */
571
+ type Replacement = {
572
+ /** Raw regex source string (no token expansion). Compiled with `u` (and always `g`). */
573
+ regex: string;
574
+ /** Replacement string (passed to `String.prototype.replace`). */
575
+ replacement: string;
576
+ /** Optional regex flags; `g` and `u` are always enforced. */
577
+ flags?: string;
578
+ /** Optional list of page IDs to apply this replacement to. Empty array means skip. */
579
+ pageIds?: number[];
580
+ };
561
581
  /**
562
582
  * Segmentation options controlling how pages are split.
563
583
  *
@@ -591,6 +611,12 @@ interface Logger {
591
611
  * };
592
612
  */
593
613
  type SegmentationOptions = {
614
+ /**
615
+ * Optional pre-processing replacements applied to page content BEFORE segmentation.
616
+ *
617
+ * Replacements are applied per-page (not on concatenated content), in array order.
618
+ */
619
+ replace?: Replacement[];
594
620
  /**
595
621
  * Rules applied in order to find split points.
596
622
  *
@@ -799,6 +825,30 @@ type Segment = {
799
825
  */
800
826
  declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
801
827
  //#endregion
828
+ //#region src/segmentation/replace.d.ts
829
+ /**
830
+ * A single replacement rule applied by `applyReplacements()` / `SegmentationOptions.replace`.
831
+ *
832
+ * Notes:
833
+ * - `regex` is a raw JavaScript regex source string (no token expansion).
834
+ * - Default flags are `gu` (global + unicode).
835
+ * - If `flags` is provided, it is validated and `g` + `u` are always enforced.
836
+ * - If `pageIds` is omitted, the rule applies to all pages.
837
+ * - If `pageIds` is `[]`, the rule applies to no pages (rule is skipped).
838
+ */
839
+ type ReplaceRule = NonNullable<SegmentationOptions['replace']>[number];
840
+ /**
841
+ * Applies ordered regex replacements to page content (per page).
842
+ *
843
+ * - Replacement rules are applied in array order.
844
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
845
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
846
+ *
847
+ * This function is intentionally **pure**:
848
+ * it never mutates its input. When no rule is active it returns the original pages array; otherwise it returns a new array in which pages whose content did not change are reused as-is.
849
+ */
850
+ declare const applyReplacements: (pages: Page[], rules?: ReplaceRule[]) => Page[];
851
+ //#endregion
802
852
  //#region src/segmentation/tokens.d.ts
803
853
  /**
804
854
  * Token-based template system for Arabic text pattern matching.
@@ -1213,5 +1263,5 @@ declare const analyzeTextForRule: (text: string) => {
1213
1263
  detected: DetectedPattern[];
1214
1264
  } | null;
1215
1265
  //#endregion
1216
- export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
1266
+ export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type ReplaceRule, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
1217
1267
  //# sourceMappingURL=index.d.mts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AA2RA;AACW,cJnME,wBImMF,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJlSX;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA,KFjjBK,eAAA,GEijBiF;EAuBzE;EAqBA,QAAA,EAAA,MAAA;AAgBb,CAAA;;;;ACnqBA;AAkEA;AAEA;AA2RA;;;;;;;;ACvVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;KJ3HK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AA1VX;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AA2RA;;;;;;;;ACvVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;cH2Qa,sBAAuB,iBAAiB,wBAAsB;;;;AF5Z3E;AA+FA
;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AA2Ra,cD/RA,sBCkXZ,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;;AC1aD;AA+EA;AAgEA;AAuBA;AAiCA;;;cFwFa;;;;;;;;;;;;;;;;;;;;;;;;;;cAsDA,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cAuKC,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AHrmBA,KI9DD,wBAAA,GJ8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAiDA;;EAsDkB,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EAwDL;;AAiBb;;;;ACjPA;;;;EAAkF,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;;;;AC1ZlF;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;mBC3mBqB;;;AAxDrB;AAkEA;AAEA;AA2RA;EACW,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACE;AACV,KAhSS,uBAAA,GAgST;EAAsB,IAAA,EAAA,MAAA;;;KA9Rb,sBAAA;EC5DA,OAAA,EAAA,MAAA;EA+EC,KAAA,EAAA,MAAA;EAgEA,QAAA,EDhFC,uBC+Fb,EAAA;AAQD,CAAA;AAiCA;;;;;;cDgJa,iCACF,kBACE,6BACV;;;;AJpSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cInmBN,mBJmmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GInmB6B,eJmmB7B,EAAA;AAiBnB;;;;ACjPA;;;;;;;;AC1ZA;AAuOa,cEhJA,wBF6JZ,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE7JgE,eF6JhE,EAAA,EAAA,GAAA,MAAA;AAyCD;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cEreA,oBFqesD,EAAA,CAAA,QAAA,EEperD,eFoeqD,EAAA,EAAA,GAAA;EAgBtD,WAAA,EAAA,gBAAsF,GAAA,iBAAA;;;;ACnqBnG;AAkEA;AAEA;AA2RA;;;AAGG,cCnJU,kBDmJV,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAAsB,QAAA,EAAA,MAAA;;;;EC1Vb,QAAA,EA8ME,eA9Ma,EAAA;AA+E3B,CAAA,GAAa,IAAA"}
1
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/replace.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6Ca,cJzeA,wBIyeyE,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJxkBtF;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;KFbK,eAAA,GEaoE;EAAI;;;;ACH7E;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;;;;;ACvVA,KL4EK,qBAAA,GK5EsB;EA+Ed;EAgEA,cAAA,EAAA,MAAA,EAAA;AAuBb,CAAA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;KL1FK,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBZ,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CO,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AAtXX;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;A
AkEA;AAEA;AA2RA;;;;;;cH4Ha,sBAAuB,iBAAiB,wBAAsB;;;AF7Z3E;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA2FzB;AAAkD;AAgIlD;AAAwB,cEzTX,iBFyTW,EAAA,CAAA,KAAA,EEzTiB,IFyTjB,EAAA,EAAA,KAAA,CAAA,EEzTiC,WFyTjC,EAAA,EAAA,GEzTiD,IFyTjD,EAAA;;;;AD9TxB;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKa,cAzhBA,sBA6hBV,EAAA,CAAA,OAAA,EAAA,MAoBF,EAAA,GAAA,MAAA;AAqBD;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;AAGyB,cD5BZ,+BC4BY,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AC1VzB;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;cF6Ka,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cAuKC,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AJpoBA,KK9DD,wBAAA,GL8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAWC;AA2DD;EAMc,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EASF;;;;AAuHZ;;;;AC5QA;;EAAqD,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAAsB;;;;;ACnd3E;AA2DA;;;;;;;mBEXqB;EDQR;AAsQb;AAsDA;AA2CA;AAWA;AAuKA;EA6Ca,UAAA,CAAA,EAAA,OAAyE,GAAA,OAAA;AAuBtF,CAAA;AAqBa,KChnBD,uBAAA,GDgnBuD;EAgBtD,IAAA,EAAA,MAAA;;;KC9nBD,sBAAA;EApEA,OAAA,EAAA,MAAA;EAkEA,KAAA,EAAA,MAAA;EAEA,QAAA,EAGE,uBAHoB,EAAA;AA2RlC,CAAA;;;;;;;cAAa,iCACF,kBACE,6BACV;;;;ALpSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KK7GO,eAAA,GL6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;AA6DkB,cKvkBL,mBLukBK,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GKvkB8B,eLukB9B,EAAA;;;AAyElB;;;;AC5QA;;;;;;;cIpUa,mDAAoD;AH/IjE;AA2DA;;;;;c
G2Ga,iCACC;;;EF/GD,QAAA,CAAA,EAAA,MAAA;AAsQb,CAAA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cEneA,kBFmesD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAgBtD,QAAA,EAAA,MAAA;;;;EClsBD,QAAA,ECsNE,eDtNF,EAAwB;AAkEpC,CAAA,GAAY,IAAA"}
package/dist/index.mjs CHANGED
@@ -974,7 +974,33 @@ const escapeTemplateBrackets = (pattern) => {
974
974
  return `\\${bracket}`;
975
975
  });
976
976
  };
977
- const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|سن|كن|مد|قد|خد|فد|دل|كد|غد|صد|دت|تم|فق|دق|[خرزيمنصسدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
977
+ const RUMUZ_ATOM = `(?:${[
978
+ "خت",
979
+ "خغ",
980
+ "بخ",
981
+ "عخ",
982
+ "مق",
983
+ "مت",
984
+ "عس",
985
+ "سي",
986
+ "سن",
987
+ "كن",
988
+ "مد",
989
+ "قد",
990
+ "خد",
991
+ "فد",
992
+ "دل",
993
+ "كد",
994
+ "غد",
995
+ "صد",
996
+ "دت",
997
+ "دس",
998
+ "تم",
999
+ "فق",
1000
+ "دق",
1001
+ "[خرزيمنصسدفلتقع]",
1002
+ "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
1003
+ ].join("|")})`;
978
1004
  const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
979
1005
  const BASE_TOKENS = {
980
1006
  bab: "باب",
@@ -992,7 +1018,9 @@ const BASE_TOKENS = {
992
1018
  "سمعت",
993
1019
  "أنبأنا",
994
1020
  "وحدثنا",
995
- "أخبرنا"
1021
+ "أخبرنا",
1022
+ "وحدثني",
1023
+ "وحدثنيه"
996
1024
  ].join("|"),
997
1025
  raqm: "[\\u0660-\\u0669]",
998
1026
  raqms: "[\\u0660-\\u0669]+",
@@ -1452,6 +1480,77 @@ const buildRuleRegex = (rule, capturePrefix) => {
1452
1480
  };
1453
1481
  };
1454
1482
 
1483
+ //#endregion
1484
+ //#region src/segmentation/replace.ts
1485
+ const DEFAULT_REPLACE_FLAGS = "gu";
1486
+ const normalizeReplaceFlags = (flags) => {
1487
+ if (!flags) return DEFAULT_REPLACE_FLAGS;
1488
+ const allowed = new Set([
1489
+ "g",
1490
+ "i",
1491
+ "m",
1492
+ "s",
1493
+ "u",
1494
+ "y"
1495
+ ]);
1496
+ const set = /* @__PURE__ */ new Set();
1497
+ for (const ch of flags) {
1498
+ if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
1499
+ set.add(ch);
1500
+ }
1501
+ set.add("g");
1502
+ set.add("u");
1503
+ return [
1504
+ "g",
1505
+ "i",
1506
+ "m",
1507
+ "s",
1508
+ "y",
1509
+ "u"
1510
+ ].filter((c) => set.has(c)).join("");
1511
+ };
1512
+ const compileReplaceRules = (rules) => {
1513
+ const compiled = [];
1514
+ for (const r of rules) {
1515
+ if (r.pageIds && r.pageIds.length === 0) continue;
1516
+ const flags = normalizeReplaceFlags(r.flags);
1517
+ const re = new RegExp(r.regex, flags);
1518
+ compiled.push({
1519
+ pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
1520
+ re,
1521
+ replacement: r.replacement
1522
+ });
1523
+ }
1524
+ return compiled;
1525
+ };
1526
+ /**
1527
+ * Applies ordered regex replacements to page content (per page).
1528
+ *
1529
+ * - Replacement rules are applied in array order.
1530
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
1531
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
1532
+ *
1533
+ * This function is intentionally **pure**:
1534
+ * it never mutates its input. When no rule is active it returns the original pages array; otherwise it returns a new array in which pages whose content did not change are reused as-is.
1535
+ */
1536
+ const applyReplacements = (pages, rules) => {
1537
+ if (!rules || rules.length === 0 || pages.length === 0) return pages;
1538
+ const compiled = compileReplaceRules(rules);
1539
+ if (compiled.length === 0) return pages;
1540
+ return pages.map((p) => {
1541
+ let content = p.content;
1542
+ for (const rule of compiled) {
1543
+ if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
1544
+ content = content.replace(rule.re, rule.replacement);
1545
+ }
1546
+ if (content === p.content) return p;
1547
+ return {
1548
+ ...p,
1549
+ content
1550
+ };
1551
+ });
1552
+ };
1553
+
1455
1554
  //#endregion
1456
1555
  //#region src/segmentation/fast-fuzzy-prefix.ts
1457
1556
  /**
@@ -2021,12 +2120,13 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
2021
2120
  */
2022
2121
  const segmentPages = (pages, options) => {
2023
2122
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
2024
- const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
2123
+ const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
2124
+ const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
2025
2125
  let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
2026
- segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
2126
+ segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
2027
2127
  if (maxPages >= 0 && breakpoints.length) {
2028
2128
  const patternProcessor = (p) => processPattern(p, false).pattern;
2029
- return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2129
+ return applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2030
2130
  }
2031
2131
  return segments;
2032
2132
  };
@@ -2492,5 +2592,5 @@ const analyzeTextForRule = (text) => {
2492
2592
  };
2493
2593
 
2494
2594
  //#endregion
2495
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
2595
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
2496
2596
  //# sourceMappingURL=index.mjs.map