flappa-doormal 2.5.3 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -203,6 +203,11 @@ The `breakpoints` option provides a post-processing mechanism for limiting segme
203
203
  ```typescript
204
204
  interface SegmentationOptions {
205
205
  rules: SplitRule[];
206
+ // Optional preprocessing step: regex replacements applied per-page BEFORE segmentation
207
+ // - default flags: 'gu' (and g+u are always enforced)
208
+ // - pageIds omitted: apply to all pages
209
+ // - pageIds: []: apply to no pages (skip)
210
+ replace?: Array<{ regex: string; replacement: string; flags?: string; pageIds?: number[] }>;
206
211
  maxPages?: number; // Maximum pages a segment can span
207
212
  breakpoints?: string[]; // Ordered array of regex patterns (supports token expansion)
208
213
  prefer?: 'longer' | 'shorter'; // Select last or first match within window
@@ -420,6 +425,11 @@ Useful options (recent additions):
420
425
  - **`normalizeArabicDiacritics`**: `true` by default so tokens match diacritized forms (e.g. `وأَخْبَرَنَا` → `{{naql}}`).
421
426
  - **`whitespace`**: `'regex'` (default) uses `\\s*` placeholders; `'space'` uses literal spaces in returned signatures.
422
427
 
428
+ **Note on brackets in returned signatures**:
429
+ - `analyzeCommonLineStarts()` emits **template-like** signatures.
430
+ - It intentionally **does not escape literal `()` / `[]`** (e.g. `(ح)` stays `(ح)`), because template patterns auto-escape `()[]` later.
431
+ - If you reuse a signature inside a raw `regex` rule, you may need to escape literal brackets yourself.
432
+
423
433
  Examples:
424
434
 
425
435
  ```typescript
package/README.md CHANGED
@@ -383,6 +383,12 @@ Key options:
383
383
  - `'regex'` (default): uses `\\s*` placeholders between tokens
384
384
  - `'space'`: uses literal single spaces (`' '`) between tokens (useful if you don't want `\\s` to later match newlines when reusing these patterns)
385
385
 
386
+ **Note on brackets in returned patterns**:
387
+ - `analyzeCommonLineStarts()` returns **template-like signatures**, not “ready-to-run regex”.
388
+ - It intentionally **does not escape literal `()` / `[]`** in the returned `pattern` (e.g. `(ح)` stays `(ح)`).
389
+ - If you paste these signatures into `lineStartsWith` / `lineStartsAfter` / `template`, that’s fine: those template pattern types **auto-escape `()[]`** outside `{{tokens}}`.
390
+ - If you paste them into a raw `regex` rule, you may need to escape literal brackets yourself.
391
+
386
392
 
387
393
  ## Prompting LLMs / Agents to Generate Rules (Shamela books)
388
394
 
@@ -630,6 +636,16 @@ const pages: Page[] = [
630
636
  ];
631
637
 
632
638
  const options: SegmentationOptions = {
639
+ // Optional preprocessing step: regex replacements applied per-page BEFORE segmentation.
640
+ // Useful for normalizing OCR/typos/spacing so rules match consistently.
641
+ //
642
+ // Notes:
643
+ // - `flags` defaults to 'gu'. If provided, `g` and `u` are always enforced.
644
+ // - `pageIds: []` means "apply to no pages" (skip that rule).
645
+ // - Remember JSON escaping: to match a literal '.', the regex is \. which must be written as "\\." in JSON.
646
+ replace: [
647
+ { regex: "([\\u0660-\\u0669]+)\\s*[-–—ـ]\\s*", replacement: "$1 - " }
648
+ ],
633
649
  rules: [
634
650
  { lineStartsWith: ['## '], split: 'at' }
635
651
  ],
package/dist/index.d.mts CHANGED
@@ -558,6 +558,26 @@ interface Logger {
558
558
  /** Log a warning message (potential issues) */
559
559
  warn?: (message: string, ...args: unknown[]) => void;
560
560
  }
561
+ /**
562
+ * - Default regex flags: `gu` (global + unicode)
563
+ * - If `flags` is provided, it is validated and merged with required flags:
564
+ * `g` and `u` are always enforced.
565
+ *
566
+ * `pageIds` controls which pages a rule applies to:
567
+ * - `undefined`: apply to all pages
568
+ * - `[]`: apply to no pages (rule is skipped)
569
+ * - `[id1, id2, ...]`: apply only to those pages
570
+ */
571
+ type Replacement = {
572
+ /** Raw regex source string (no token expansion). Always compiled with the `g` and `u` flags. */
573
+ regex: string;
574
+ /** Replacement string (passed to `String.prototype.replace`). */
575
+ replacement: string;
576
+ /** Optional regex flags; `g` and `u` are always enforced. */
577
+ flags?: string;
578
+ /** Optional list of page IDs to apply this replacement to. Empty array means skip. */
579
+ pageIds?: number[];
580
+ };
561
581
  /**
562
582
  * Segmentation options controlling how pages are split.
563
583
  *
@@ -591,6 +611,12 @@ interface Logger {
591
611
  * };
592
612
  */
593
613
  type SegmentationOptions = {
614
+ /**
615
+ * Optional pre-processing replacements applied to page content BEFORE segmentation.
616
+ *
617
+ * Replacements are applied per-page (not on concatenated content), in array order.
618
+ */
619
+ replace?: Replacement[];
594
620
  /**
595
621
  * Rules applied in order to find split points.
596
622
  *
@@ -799,6 +825,30 @@ type Segment = {
799
825
  */
800
826
  declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
801
827
  //#endregion
828
+ //#region src/segmentation/replace.d.ts
829
+ /**
830
+ * A single replacement rule applied by `applyReplacements()` / `SegmentationOptions.replace`.
831
+ *
832
+ * Notes:
833
+ * - `regex` is a raw JavaScript regex source string (no token expansion).
834
+ * - Default flags are `gu` (global + unicode).
835
+ * - If `flags` is provided, it is validated and `g` + `u` are always enforced.
836
+ * - If `pageIds` is omitted, the rule applies to all pages.
837
+ * - If `pageIds` is `[]`, the rule applies to no pages (rule is skipped).
838
+ */
839
+ type ReplaceRule = NonNullable<SegmentationOptions['replace']>[number];
840
+ /**
841
+ * Applies ordered regex replacements to page content (per page).
842
+ *
843
+ * - Replacement rules are applied in array order.
844
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
845
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
846
+ *
847
+ * This function is intentionally **pure**:
848
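+ * it never mutates its input. If there are no applicable rules (or no pages) the original array is returned as-is; otherwise a new array is returned in which unchanged pages are reused by reference and changed pages are shallow copies.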
849
+ */
850
+ declare const applyReplacements: (pages: Page[], rules?: ReplaceRule[]) => Page[];
851
+ //#endregion
802
852
  //#region src/segmentation/tokens.d.ts
803
853
  /**
804
854
  * Token-based template system for Arabic text pattern matching.
@@ -1213,5 +1263,5 @@ declare const analyzeTextForRule: (text: string) => {
1213
1263
  detected: DetectedPattern[];
1214
1264
  } | null;
1215
1265
  //#endregion
1216
- export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
1266
+ export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type DetectedPattern, type ExpandResult, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type Page, type PageRange, type ReplaceRule, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
1217
1267
  //# sourceMappingURL=index.d.mts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AAuRA;AACW,cJ/LE,wBI+LF,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJ9RX;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA,KFjjBK,eAAA,GEijBiF;EAuBzE;EAqBA,QAAA,EAAA,MAAA;AAgBb,CAAA;;;;ACnqBA;AAkEA;AAEA;AAuRA;;;;;;;;ACnVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;KJ3HK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AA1VX;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AAuRA;;;;;;;;ACnVA;AA+EA;AAgEA;AAuBA;AAiCA;;;;;cH2Qa,sBAAuB,iBAAiB,wBAAsB;;;;AF5Z3E;AA+FA
;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AA+HA;;;;ACjPA;;;;;;;;AC1ZA;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;ACnqBA;AAkEA;AAEA;AAuRa,cD3RA,sBC8WZ,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;;ACtaD;AA+EA;AAgEA;AAuBA;AAiCA;;;cFwFa;;;;;;;;;;;;;;;;;;;;;;;;;;cAsDA,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cAuKC,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AHrmBA,KI9DD,wBAAA,GJ8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAiDA;;EAsDkB,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EAwDL;;AAiBb;;;;ACjPA;;;;EAAkF,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;;;;AC1ZlF;AAuOA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;mBC3mBqB;;;AAxDrB;AAkEA;AAEA;AAuRA;EACW,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACE;AACV,KA5RS,uBAAA,GA4RT;EAAsB,IAAA,EAAA,MAAA;;;KA1Rb,sBAAA;EC5DA,OAAA,EAAA,MAAA;EA+EC,KAAA,EAAA,MAAA;EAgEA,QAAA,EDhFC,uBC+Fb,EAAA;AAQD,CAAA;AAiCA;;;;;;cD4Ia,iCACF,kBACE,6BACV;;;;AJhSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AA8GmB,cInmBN,mBJmmBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GInmB6B,eJmmB7B,EAAA;AAiBnB;;;;ACjPA;;;;;;;;AC1ZA;AAuOa,cEhJA,wBF6JZ,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE7JgE,eF6JhE,EAAA,EAAA,GAAA,MAAA;AAyCD;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cEreA,oBFqesD,EAAA,CAAA,QAAA,EEperD,eFoeqD,EAAA,EAAA,GAAA;EAgBtD,WAAA,EAAA,gBAAsF,GAAA,iBAAA;;;;ACnqBnG;AAkEA;AAEA;AAuRA;;;AAGG,cC/IU,kBD+IV,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAAsB,QAAA,EAAA,MAAA;;;;ECtVb,QAAA,EA8ME,eA9Ma,EAAA;AA+E3B,CAAA,GAAa,IAAA"}
1
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/replace.ts","../src/segmentation/tokens.ts","../src/analysis.ts","../src/detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6Ca,cJzeA,wBIyeyE,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJxkBtF;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;KFbK,eAAA,GEaoE;EAAI;;;;ACH7E;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;;;;;ACvVA,KL4EK,qBAAA,GK5EsB;EA+Ed;EAgEA,cAAA,EAAA,MAAA,EAAA;AAuBb,CAAA;AAiCA;;;;;;;;;;;;;;;;;;;;;;;;;;;;KL1FK,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBZ,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CO,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;;;AAtXX;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;;AClsBA;A
AkEA;AAEA;AA2RA;;;;;;cH4Ha,sBAAuB,iBAAiB,wBAAsB;;;AF7Z3E;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA2FzB;AAAkD;AAgIlD;AAAwB,cEzTX,iBFyTW,EAAA,CAAA,KAAA,EEzTiB,IFyTjB,EAAA,EAAA,KAAA,CAAA,EEzTiC,WFyTjC,EAAA,EAAA,GEzTiD,IFyTjD,EAAA;;;;AD9TxB;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;;;;AAsIA;;;;AC5QA;;;;;;;;ACndA;AA2DA;;;;;;;;ACHA;AAsQA;AAsDA;AA2CA;AAWA;AAuKa,cAzhBA,sBA6hBV,EAAA,CAAA,OAAA,EAAA,MAoBF,EAAA,GAAA,MAAA;AAqBD;AAuBA;AAqBA;AAgBA;;;;AClsBA;AAkEA;AAEA;AA2RA;;;;AAGyB,cD5BZ,+BC4BY,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AC1VzB;AA+EA;AAgEA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;cF6Ka,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cAuKC,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;AJpoBA,KK9DD,wBAAA,GL8D8E;EA+F7E;;;;ECnIR;EA4BA,aAAA,CAAA,EAAA,MAAe;EA8Bf;EAiCA,QAAA,CAAA,EAAA,MAAA;EAwBA;EAeA,WAAA,CAAA,EAAW,MAAA;EACV;;;;EAIA,wBAAA,CAAA,EAAA,OAAA;EAAmB;AAAA;AA2FzB;AAAkD;AAgIlD;;;;EAAqE,yBAAA,CAAA,EAAA,OAAA;EAkBzD;AAqCZ;AA0EA;AA8BA;AAWC;AA2DD;EAMc,MAAA,CAAA,EAAA,aAAA,GAAA,OAAA;EASF;;;;AAuHZ;;;;AC5QA;;EAAqD,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAAsB;;;;;ACnd3E;AA2DA;;;;;;;mBEXqB;EDQR;AAsQb;AAsDA;AA2CA;AAWA;AAuKA;EA6Ca,UAAA,CAAA,EAAA,OAAyE,GAAA,OAAA;AAuBtF,CAAA;AAqBa,KChnBD,uBAAA,GDgnBuD;EAgBtD,IAAA,EAAA,MAAA;;;KC9nBD,sBAAA;EApEA,OAAA,EAAA,MAAA;EAkEA,KAAA,EAAA,MAAA;EAEA,QAAA,EAGE,uBAHoB,EAAA;AA2RlC,CAAA;;;;;;;cAAa,iCACF,kBACE,6BACV;;;;ALpSH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KK7GO,eAAA,GL6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAWC;AA2DD;;;AA6DkB,cKvkBL,mBLukBK,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GKvkB8B,eLukB9B,EAAA;;;AAyElB;;;;AC5QA;;;;;;;cIpUa,mDAAoD;AH/IjE;AA2DA;;;;;c
G2Ga,iCACC;;;EF/GD,QAAA,CAAA,EAAA,MAAA;AAsQb,CAAA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cEneA,kBFmesD,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAgBtD,QAAA,EAAA,MAAA;;;;EClsBD,QAAA,ECsNE,eDtNF,EAAwB;AAkEpC,CAAA,GAAY,IAAA"}
package/dist/index.mjs CHANGED
@@ -974,7 +974,33 @@ const escapeTemplateBrackets = (pattern) => {
974
974
  return `\\${bracket}`;
975
975
  });
976
976
  };
977
- const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|سن|كن|مد|قد|خد|فد|دل|كد|غد|صد|دت|تم|فق|دق|[خرزيمنصسدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
977
+ const RUMUZ_ATOM = `(?:${[
978
+ "خت",
979
+ "خغ",
980
+ "بخ",
981
+ "عخ",
982
+ "مق",
983
+ "مت",
984
+ "عس",
985
+ "سي",
986
+ "سن",
987
+ "كن",
988
+ "مد",
989
+ "قد",
990
+ "خد",
991
+ "فد",
992
+ "دل",
993
+ "كد",
994
+ "غد",
995
+ "صد",
996
+ "دت",
997
+ "دس",
998
+ "تم",
999
+ "فق",
1000
+ "دق",
1001
+ "[خرزيمنصسدفلتقع]",
1002
+ "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
1003
+ ].join("|")})`;
978
1004
  const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
979
1005
  const BASE_TOKENS = {
980
1006
  bab: "باب",
@@ -992,7 +1018,9 @@ const BASE_TOKENS = {
992
1018
  "سمعت",
993
1019
  "أنبأنا",
994
1020
  "وحدثنا",
995
- "أخبرنا"
1021
+ "أخبرنا",
1022
+ "وحدثني",
1023
+ "وحدثنيه"
996
1024
  ].join("|"),
997
1025
  raqm: "[\\u0660-\\u0669]",
998
1026
  raqms: "[\\u0660-\\u0669]+",
@@ -1452,6 +1480,77 @@ const buildRuleRegex = (rule, capturePrefix) => {
1452
1480
  };
1453
1481
  };
1454
1482
 
1483
+ //#endregion
1484
+ //#region src/segmentation/replace.ts
1485
+ const DEFAULT_REPLACE_FLAGS = "gu";
1486
+ const normalizeReplaceFlags = (flags) => {
1487
+ if (!flags) return DEFAULT_REPLACE_FLAGS;
1488
+ const allowed = new Set([
1489
+ "g",
1490
+ "i",
1491
+ "m",
1492
+ "s",
1493
+ "u",
1494
+ "y"
1495
+ ]);
1496
+ const set = /* @__PURE__ */ new Set();
1497
+ for (const ch of flags) {
1498
+ if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
1499
+ set.add(ch);
1500
+ }
1501
+ set.add("g");
1502
+ set.add("u");
1503
+ return [
1504
+ "g",
1505
+ "i",
1506
+ "m",
1507
+ "s",
1508
+ "y",
1509
+ "u"
1510
+ ].filter((c) => set.has(c)).join("");
1511
+ };
1512
+ const compileReplaceRules = (rules) => {
1513
+ const compiled = [];
1514
+ for (const r of rules) {
1515
+ if (r.pageIds && r.pageIds.length === 0) continue;
1516
+ const flags = normalizeReplaceFlags(r.flags);
1517
+ const re = new RegExp(r.regex, flags);
1518
+ compiled.push({
1519
+ pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
1520
+ re,
1521
+ replacement: r.replacement
1522
+ });
1523
+ }
1524
+ return compiled;
1525
+ };
1526
+ /**
1527
+ * Applies ordered regex replacements to page content (per page).
1528
+ *
1529
+ * - Replacement rules are applied in array order.
1530
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
1531
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
1532
+ *
1533
+ * This function is intentionally **pure**:
1534
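+ * it never mutates its input. If there are no applicable rules (or no pages) the original array is returned as-is; otherwise a new array is returned in which unchanged pages are reused by reference and changed pages are shallow copies.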
1535
+ */
1536
+ const applyReplacements = (pages, rules) => {
1537
+ if (!rules || rules.length === 0 || pages.length === 0) return pages;
1538
+ const compiled = compileReplaceRules(rules);
1539
+ if (compiled.length === 0) return pages;
1540
+ return pages.map((p) => {
1541
+ let content = p.content;
1542
+ for (const rule of compiled) {
1543
+ if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
1544
+ content = content.replace(rule.re, rule.replacement);
1545
+ }
1546
+ if (content === p.content) return p;
1547
+ return {
1548
+ ...p,
1549
+ content
1550
+ };
1551
+ });
1552
+ };
1553
+
1455
1554
  //#endregion
1456
1555
  //#region src/segmentation/fast-fuzzy-prefix.ts
1457
1556
  /**
@@ -2021,12 +2120,13 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
2021
2120
  */
2022
2121
  const segmentPages = (pages, options) => {
2023
2122
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
2024
- const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
2123
+ const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
2124
+ const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
2025
2125
  let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
2026
- segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
2126
+ segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
2027
2127
  if (maxPages >= 0 && breakpoints.length) {
2028
2128
  const patternProcessor = (p) => processPattern(p, false).pattern;
2029
- return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2129
+ return applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2030
2130
  }
2031
2131
  return segments;
2032
2132
  };
@@ -2122,7 +2222,7 @@ const DEFAULT_OPTIONS = {
2122
2222
  topK: 40,
2123
2223
  whitespace: "regex"
2124
2224
  };
2125
- const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
2225
+ const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
2126
2226
  const TOKEN_PRIORITY_ORDER$1 = [
2127
2227
  "basmalah",
2128
2228
  "kitab",
@@ -2170,7 +2270,7 @@ const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
2170
2270
  if (currentPos >= s.length) break;
2171
2271
  const m = re.exec(s.slice(currentPos));
2172
2272
  if (!m || m.index !== 0 || !m[0]) continue;
2173
- currentOut += escapeRegexLiteral(m[0]);
2273
+ currentOut += escapeSignatureLiteral(m[0]);
2174
2274
  currentPos += m[0].length;
2175
2275
  matchedAny = true;
2176
2276
  const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
@@ -2239,7 +2339,7 @@ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallba
2239
2339
  if (matchedAny) {
2240
2340
  const ch = s[pos];
2241
2341
  if (ch && isCommonDelimiter(ch)) {
2242
- out += escapeRegexLiteral(ch);
2342
+ out += escapeSignatureLiteral(ch);
2243
2343
  pos += 1;
2244
2344
  continue;
2245
2345
  }
@@ -2248,14 +2348,14 @@ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallba
2248
2348
  if (includeFirstWordFallback && !matchedToken) {
2249
2349
  const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
2250
2350
  if (!firstWord$1) break;
2251
- out += escapeRegexLiteral(firstWord$1);
2351
+ out += escapeSignatureLiteral(firstWord$1);
2252
2352
  }
2253
2353
  break;
2254
2354
  }
2255
2355
  if (!includeFirstWordFallback) return null;
2256
2356
  const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
2257
2357
  if (!firstWord) return null;
2258
- out += escapeRegexLiteral(firstWord);
2358
+ out += escapeSignatureLiteral(firstWord);
2259
2359
  return out;
2260
2360
  }
2261
2361
  if (!matchedAny) return null;
@@ -2492,5 +2592,5 @@ const analyzeTextForRule = (text) => {
2492
2592
  };
2493
2593
 
2494
2594
  //#endregion
2495
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
2595
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
2496
2596
  //# sourceMappingURL=index.mjs.map