flappa-doormal 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -67,22 +67,33 @@ declare const makeDiacriticInsensitive: (text: string) => string;
  //#endregion
  //#region src/segmentation/types.d.ts
  /**
- * Literal regex pattern rule - no token expansion is applied.
+ * Literal regex pattern rule - no token expansion or auto-escaping is applied.
+ *
+ * Use this when you need full control over the regex pattern, including:
+ * - Character classes like `[أب]` to match أ or ب
+ * - Capturing groups like `(test|text)` for alternation
+ * - Any other regex syntax that would be escaped in template patterns
  *
- * Use this when you need full control over the regex pattern.
  * If the regex contains capturing groups, the captured content
  * will be used as the segment content.
  *
+ * **Note**: Unlike `template`, `lineStartsWith`, etc., this pattern type
+ * does NOT auto-escape `()[]`. You have full regex control.
+ *
  * @example
  * // Match Arabic-Indic numbers followed by a dash
  * { regex: '^[٠-٩]+ - ', split: 'at' }
  *
  * @example
+ * // Character class - matches أ or ب
+ * { regex: '^[أب] ', split: 'at' }
+ *
+ * @example
  * // Capture group - content after the marker becomes segment content
  * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
  */
  type RegexPattern = {
- /** Raw regex pattern string (no token expansion) */
+ /** Raw regex pattern string (no token expansion, no auto-escaping) */
  regex: string;
  };
  /**
@@ -90,6 +101,10 @@ type RegexPattern = {
  *
  * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
  *
+ * **Auto-escaping**: Parentheses `()` and square brackets `[]` outside of
+ * `{{tokens}}` are automatically escaped. Write `({{harf}}):` instead of
+ * `\\({{harf}}\\):`. For raw regex control, use `regex` pattern type.
+ *
  * @example
  * // Using tokens for Arabic-Indic digits
  * { template: '^{{raqms}} {{dash}}', split: 'at' }
@@ -98,10 +113,14 @@ type RegexPattern = {
  * // Named capture to extract hadith number into metadata
  * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
  *
+ * @example
+ * // Auto-escaped brackets - matches literal (أ):
+ * { template: '^({{harf}}): ', split: 'at' }
+ *
  * @see TOKEN_PATTERNS for available tokens
  */
  type TemplatePattern = {
- /** Template string with `{{token}}` or `{{token:name}}` placeholders */
+ /** Template string with `{{token}}` or `{{token:name}}` placeholders. Brackets `()[]` are auto-escaped. */
  template: string;
  };
  /**
@@ -113,6 +132,10 @@ type TemplatePattern = {
  * Token expansion is applied to each pattern. Use `fuzzy: true` for
  * diacritic-insensitive Arabic matching.
  *
+ * **Auto-escaping**: Parentheses `()` and square brackets `[]` outside of
+ * `{{tokens}}` are automatically escaped. Write `({{harf}})` instead of
+ * `\\({{harf}}\\)`. For raw regex control, use `regex` pattern type.
+ *
  * @example
  * // Split at chapter headings (marker included in content)
  * { lineStartsWith: ['## ', '### '], split: 'at' }
@@ -120,9 +143,13 @@ type TemplatePattern = {
  * @example
  * // Split at Arabic book/chapter markers with fuzzy matching
  * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ *
+ * @example
+ * // Auto-escaped brackets - matches literal (أ)
+ * { lineStartsWith: ['({{harf}}) '], split: 'at' }
  */
  type LineStartsWithPattern = {
- /** Array of patterns that mark line beginnings (marker included in content) */
+ /** Array of patterns that mark line beginnings (marker included in content). Brackets `()[]` are auto-escaped. */
  lineStartsWith: string[];
  };
  /**
@@ -136,6 +163,10 @@ type LineStartsWithPattern = {
  * Token expansion is applied to each pattern. Use `fuzzy: true` for
  * diacritic-insensitive Arabic matching.
  *
+ * **Auto-escaping**: Parentheses `()` and square brackets `[]` outside of
+ * `{{tokens}}` are automatically escaped. Write `({{harf}}):` instead of
+ * `\\({{harf}}\\):`. For raw regex control, use `regex` pattern type.
+ *
  * @example
  * // Split at numbered hadiths, capturing content without the number prefix
  * // Content extends to next split, not just end of that line
@@ -144,9 +175,13 @@ type LineStartsWithPattern = {
  * @example
  * // Extract hadith number to metadata while stripping the prefix
  * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
+ *
+ * @example
+ * // Auto-escaped brackets - matches literal (أ): prefix
+ * { lineStartsAfter: ['({{harf}}): '], split: 'at' }
  */
  type LineStartsAfterPattern = {
- /** Array of patterns that mark line beginnings (marker excluded from content) */
+ /** Array of patterns that mark line beginnings (marker excluded from content). Brackets `()[]` are auto-escaped. */
  lineStartsAfter: string[];
  };
  /**
@@ -157,12 +192,19 @@ type LineStartsAfterPattern = {
  * Token expansion is applied to each pattern. Use `fuzzy: true` for
  * diacritic-insensitive Arabic matching.
  *
+ * **Auto-escaping**: Parentheses `()` and square brackets `[]` outside of
+ * `{{tokens}}` are automatically escaped. For raw regex control, use `regex` pattern type.
+ *
  * @example
  * // Split at lines ending with Arabic sentence-ending punctuation
  * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
+ *
+ * @example
+ * // Auto-escaped brackets - matches literal (انتهى) suffix
+ * { lineEndsWith: ['(انتهى)'], split: 'after' }
  */
  type LineEndsWithPattern = {
- /** Array of patterns that mark line endings */
+ /** Array of patterns that mark line endings. Brackets `()[]` are auto-escaped. */
  lineEndsWith: string[];
  };
  /**
@@ -460,6 +502,42 @@ type BreakpointRule = {
  * { pattern: '{{tarqim}}\\s*', min: 10 }
  */
  type Breakpoint = string | BreakpointRule;
+ /**
+ * Logger interface for custom logging implementations.
+ *
+ * All methods are optional - only implement the verbosity levels you need.
+ * When no logger is provided, no logging overhead is incurred.
+ *
+ * Compatible with the Logger interface from ffmpeg-simplified and similar libraries.
+ *
+ * @example
+ * // Simple console logger
+ * const logger: Logger = {
+ * debug: console.debug,
+ * info: console.info,
+ * warn: console.warn,
+ * error: console.error,
+ * };
+ *
+ * @example
+ * // Production logger (only warnings and errors)
+ * const prodLogger: Logger = {
+ * warn: (msg, ...args) => myLoggingService.warn(msg, args),
+ * error: (msg, ...args) => myLoggingService.error(msg, args),
+ * };
+ */
+ interface Logger {
+ /** Log a debug message (verbose debugging output) */
+ debug?: (message: string, ...args: unknown[]) => void;
+ /** Log an error message (critical failures) */
+ error?: (message: string, ...args: unknown[]) => void;
+ /** Log an informational message (key progress points) */
+ info?: (message: string, ...args: unknown[]) => void;
+ /** Log a trace message (extremely verbose, per-iteration details) */
+ trace?: (message: string, ...args: unknown[]) => void;
+ /** Log a warning message (potential issues) */
+ warn?: (message: string, ...args: unknown[]) => void;
+ }
  /**
  * Segmentation options controlling how pages are split.
  *
@@ -480,6 +558,17 @@ type Breakpoint = string | BreakpointRule;
  * breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
  * prefer: 'longer'
  * };
+ *
+ * @example
+ * // With custom logger for debugging
+ * const options: SegmentationOptions = {
+ * rules: [...],
+ * logger: {
+ * debug: console.debug,
+ * info: console.info,
+ * warn: console.warn,
+ * }
+ * };
  */
  type SegmentationOptions = {
  /**
@@ -542,6 +631,38 @@ type SegmentationOptions = {
  * @default 'longer'
  */
  prefer?: 'longer' | 'shorter';
+ /**
+ * Optional logger for debugging segmentation.
+ *
+ * Provide a logger to receive detailed information about the segmentation
+ * process. Useful for debugging pattern matching, page tracking, and
+ * breakpoint processing issues.
+ *
+ * When not provided, no logging overhead is incurred (methods are not called).
+ *
+ * Verbosity levels:
+ * - `trace`: Per-iteration details (very verbose)
+ * - `debug`: Detailed operation information
+ * - `info`: Key progress points
+ * - `warn`: Potential issues
+ * - `error`: Critical failures
+ *
+ * @example
+ * // Console logger for development
+ * logger: {
+ * debug: console.debug,
+ * info: console.info,
+ * warn: console.warn,
+ * }
+ *
+ * @example
+ * // Custom logger integration
+ * logger: {
+ * debug: (msg, ...args) => winston.debug(msg, { meta: args }),
+ * error: (msg, ...args) => winston.error(msg, { meta: args }),
+ * }
+ */
+ logger?: Logger;
  };
  /**
  * Output segment produced by `segmentPages()`.
@@ -670,6 +791,51 @@ declare const normalizeLineEndings: (content: string) => string;
  * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
  * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
  */
+ /**
+ * Token definitions mapping human-readable token names to regex patterns.
+ *
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
+ *
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
+ *
+ * @example
+ * // Using tokens in a split rule
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ *
+ * @example
+ * // Using tokens with named captures
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+ */
+ /**
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
+ * but preserves content inside `{{...}}` token delimiters.
+ *
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+ *
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
+ *
+ * @example
+ * escapeTemplateBrackets('({{harf}}): ')
+ * // → '\\({{harf}}\\): '
+ *
+ * @example
+ * escapeTemplateBrackets('[{{raqm}}] ')
+ * // → '\\[{{raqm}}\\] '
+ *
+ * @example
+ * escapeTemplateBrackets('{{harf}}')
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
+ */
+ declare const escapeTemplateBrackets: (pattern: string) => string;
  /**
  * Token definitions mapping human-readable token names to regex patterns.
  *
@@ -846,5 +1012,80 @@ declare const getAvailableTokens: () => string[];
  */
  declare const getTokenPattern: (tokenName: string) => string | undefined;
  //#endregion
- export { type Breakpoint, type BreakpointRule, type ExpandResult, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
+ //#region src/pattern-detection.d.ts
+ /**
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
+ *
+ * @module pattern-detection
+ */
+ /**
+ * Result of detecting a token pattern in text
+ */
+ type DetectedPattern = {
+ /** Token name from TOKEN_PATTERNS (e.g., 'raqms', 'dash') */
+ token: string;
+ /** The matched text */
+ match: string;
+ /** Start index in the original text */
+ index: number;
+ /** End index (exclusive) */
+ endIndex: number;
+ };
+ /**
+ * Analyzes text and returns all detected token patterns with their positions.
+ * Patterns are detected in priority order to avoid partial matches.
+ *
+ * @param text - The text to analyze for token patterns
+ * @returns Array of detected patterns sorted by position
+ *
+ * @example
+ * detectTokenPatterns("٣٤ - حدثنا")
+ * // Returns: [
+ * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
+ * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
+ * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
+ * // ]
+ */
+ declare const detectTokenPatterns: (text: string) => DetectedPattern[];
+ /**
+ * Generates a template pattern from text using detected tokens.
+ * Replaces matched portions with {{token}} syntax.
+ *
+ * @param text - Original text
+ * @param detected - Array of detected patterns from detectTokenPatterns
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
+ *
+ * @example
+ * const detected = detectTokenPatterns("٣٤ - ");
+ * generateTemplateFromText("٣٤ - ", detected);
+ * // Returns: "{{raqms}} {{dash}} "
+ */
+ declare const generateTemplateFromText: (text: string, detected: DetectedPattern[]) => string;
+ /**
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
+ *
+ * @param detected - Array of detected patterns
+ * @returns Suggested pattern type and whether to use fuzzy matching
+ */
+ declare const suggestPatternConfig: (detected: DetectedPattern[]) => {
+ patternType: "lineStartsWith" | "lineStartsAfter";
+ fuzzy: boolean;
+ metaType?: string;
+ };
+ /**
+ * Analyzes text and generates a complete suggested rule configuration.
+ *
+ * @param text - Highlighted text from the page
+ * @returns Suggested rule configuration or null if no patterns detected
+ */
+ declare const analyzeTextForRule: (text: string) => {
+ template: string;
+ patternType: "lineStartsWith" | "lineStartsAfter";
+ fuzzy: boolean;
+ metaType?: string;
+ detected: DetectedPattern[];
+ } | null;
+ //#endregion
+ export { type Breakpoint, type BreakpointRule, type DetectedPattern, type ExpandResult, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
  //# sourceMappingURL=index.d.mts.map
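
The declaration changes above boil down to one behavioral split: `template`, `lineStartsWith`, `lineStartsAfter`, and `lineEndsWith` now auto-escape `()[]` outside of `{{tokens}}`, while `regex` remains raw. A minimal sketch of how the two styles might sit side by side; the rule shapes are copied from the JSDoc examples above, and the package-name import specifier plus the assumption that `SegmentationOptions.rules` accepts these members unchanged are mine:

```ts
import { escapeTemplateBrackets, type SegmentationOptions } from 'flappa-doormal';

// Assumed usage sketch, not a verbatim excerpt from the package docs.
const options: SegmentationOptions = {
    rules: [
        // Template pattern: literal brackets, escaped automatically before token expansion.
        { template: '^({{harf}}): ', split: 'at' },
        // Raw regex pattern: nothing is escaped, so the capture group keeps its meaning.
        { regex: '^[٠-٩]+ - (.*)', split: 'at' },
    ],
};

// The newly exported helper performs the escaping step itself:
escapeTemplateBrackets('({{harf}}): '); // → '\\({{harf}}\\): ' per its JSDoc above
```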
package/dist/index.d.mts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;AC9IiB;AAoBG;AAsBM;AAyBC;AAiBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EY,cDhYC,WCgYqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AAuBhD;AAgFA;;;;ACoKA;;;;;;;;ACvsBA;AAaA;;;;AC6NA;AA2CA;AAWA;AA2DA;AAqGA;AAuBA;AAqBA;AAgBA;;;;;;;;;;cJjWa;;;;AA/Fb;AA+FA;;;;;AC9IiB;AAoBG;AAsBM;AAyBC;AAiBH;;;;KApFnB,YAAA,GAuGC;EACA;EAAmB,KAAA,EAAA,MAAA;AAAA,CAAA;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AAuBA;AAgFA;;;;ACoKA,KDtqBK,eAAA,GC6vBJ;EAvFmC;EAAiB,QAAA,EAAA,MAAA;CAAsB;;;;;ACvsB3E;AAaA;;;;AC6NA;AA2CA;AAWA;AA2DA;AAqGA;AAuBA;AAqBA;AAgBA;KHrcK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;KAyBA,sBAAA;;;;;;;;;;;;;;;;KAiBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA0CC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;KAuBtB,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;KA0BN,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;AAhfgB;AAiBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AAuBA;AAgFA;;;;ACoKA;;;;;;;;ACvsBA;AAaA;;;;AC6NA;AA2CA;AAWA;AA2DA;AAqGA;AAuBa,cFgPA,YEzOZ,EAPkD,CAAA,KAAM,EFgPrB,IEhPqB,EAAA,EAAA,OAAA,EFgPJ,mBEhPI,EAAA,GFgPkB,OEhPlB,EAAA;;;;AJ3ZzD;AA+FA;;;;AC9IK,cEbQ,aFaI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AAoBG;AAsBM;AAyBC;AAiBH;;;;;AAoBlB,cExGO,oBFwGP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADzDN;AA+FA;;;;;AC9IiB;AAoBG;AAsBM;AAyBC;AAiBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AAuBA;AAgFA;;;;ACoKA;;;;;;;;ACvsBA;AAaA;cC6Na,gBAAgB;;;AAA7B;AA2CA;AAWA;AA2DA;AAqGA;AAuBA;AAqBA;AAgBA;;;;;;cAvOa;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,2FAA0F;;;;;;;;;;;;;;;;;;;;cAqG1F;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAsC;;;;;;;;;;;;;cAqBtC;;;;;;;;;;;;;;;cAgBA"}
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/segmenter.ts","../src/segmentation/textUtils.ts","../src/segmentation/tokens.ts","../src/pattern-detection.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EY,cD1aC,WC0aqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAiDA;;;;;AAiHA;;;;ACoCA;;;;;;;;AC1sBA;AAaA;;;;ACiDA;AAkNA;AA2CA;AAWA;AA2DA;AAqGA;AAuBA;AAqBA;AAgBA;;;;AC5hBY,cLqJC,wBKrJc,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ALsD3B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA,KA/VK,YAAA,GA+VW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAiDA;;;;;AAiHA;;;;ACoCA;;;;;;;;AC1sBA;AAaA;;;;ACiDA;AAkNA,KH5NK,eAAA,GGgOJ;EAuCY;EAWD,QAAA,EAAA,MAAY;AA2DxB,CAAA;AAqGA;AAuBA;AAqBA;AAgBA;;;;AC5hBA;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;KJlGK,qBAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiCA,sBAAA;;;;;;;;;;;;;;;;;;;;;;;KAwBA,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA0CC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDL,mBAAA;;;;;;;;UAQA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WA0CL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;AAhlBgB;AAwBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AAiHA;;;;ACoCA;;;;;;;;AC1sBA;AAaA;;;;ACiDa,cF4oBA,YEnoBZ,EAAA,CAAA,KAAA,EFmoBmC,IEnoBnC,EAAA,EAAA,OAAA,EFmoBoD,mBEnoBpD,EAAA,GFmoB0E,OEnoB1E,EAAA;;;;AJXD;AA+FA;;;;ACnIK,cExBQ,aFwBI,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAAA;AA4BG;AA8BM;AAiCC;AAwBH;;;;;AAoBlB,cElJO,oBFkJP,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;ADnGN;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;;AAiHA;;;;ACoCA;;;;;;;;AC1sBA;AAaA;;;;ACiDA;AAkNA;AA2CA;AAWA;AA2DA;AAqGA;AAuBA;AAqBA;AAgBA;;;cApea;ACxDb;AA0DA;AA4DA;AAuBA;AAiCA;;;;;;;;;;;;;;;;;;;;;cD4Fa,gBAAgB;;;;;;;;;;;;;;;;cA2ChB;;;;;;;KAWD,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cA2DC,2FAA0F;;;;;;;;;;;;;;;;;;;;cAqG1F;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAsC;;;;;;;;;;;;;cAqBtC;;;;;;;;;;;;;;;cAgBA;;;;AJteb;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KI7GO,eAAA,GJ6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA0FzB;AAAkD;AA4GlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAiDA;;;;AAgGmB,cIrlBN,mBJqlBM,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GIrlBgC,eJqlBhC,EAAA;AAiBnB;;;;ACoCA;;;;;;;;AC1sBA;AAaa,cE+GA,wBF/G2E,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EE+GvB,eF/GuB,EAAA,EAAA,GAAA,MAAA;;;;ACiDxF;AAkNA;AA2CA;AAWY,cCnLC,oBDmLW,EAAA,CAAA,QAAA,EClLV,eDkLU,EAAA,EAAA,GAAA;EA2DX,WAAA,EAAA,gBAgFZ,GAAA,iBAhFsG;EAqG1F,KAAA,EAAA,OAAA;EAuBA,QAAA,CAAA,EAAA,MAAA;AAqBb,CAAA;AAgBA;;;;AC5hBA;AA0DA;AA4Da,cAwDA,kBAzCZ,EAAA,CAfgE,IAAA,EAAA,MAAA,EAAA,GAAe;EAuBnE,QAAA,EAAA,MAAA;EAiCA,WAAA,EAAA,gBAmBZ,GAZa,iBAAe;;;YAAf"}
package/dist/index.mjs CHANGED
@@ -673,6 +673,35 @@ const normalizeLineEndings = (content) => content.replace(/\r\n?/g, "\n");
  * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
  */
  /**
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
+ * but preserves content inside `{{...}}` token delimiters.
+ *
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+ *
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
+ *
+ * @example
+ * escapeTemplateBrackets('({{harf}}): ')
+ * // → '\\({{harf}}\\): '
+ *
+ * @example
+ * escapeTemplateBrackets('[{{raqm}}] ')
+ * // → '\\[{{raqm}}\\] '
+ *
+ * @example
+ * escapeTemplateBrackets('{{harf}}')
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
+ */
+ const escapeTemplateBrackets = (pattern) => {
+ return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (match, token, bracket) => {
+ if (token) return token;
+ return `\\${bracket}`;
+ });
+ };
+ /**
  * Base token definitions mapping human-readable token names to regex patterns.
  *
  * These tokens contain raw regex patterns and do not reference other tokens.
@@ -1000,7 +1029,7 @@ const hasCapturingGroup = (pattern) => {
  * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
  */
  const processPattern = (pattern, fuzzy) => {
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(pattern, fuzzy ? makeDiacriticInsensitive : void 0);
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
  return {
  captureNames,
  pattern: expanded
@@ -1055,16 +1084,16 @@ const buildRuleRegex = (rule) => {
  const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
  const patterns = processed.map((p) => p.pattern).join("|");
  allCaptureNames = processed.flatMap((p) => p.captureNames);
- s.template = `^(?:${patterns})`;
+ s.regex = `^(?:${patterns})`;
  }
  if (s.lineEndsWith?.length) {
  const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
  const patterns = processed.map((p) => p.pattern).join("|");
  allCaptureNames = processed.flatMap((p) => p.captureNames);
- s.template = `(?:${patterns})$`;
+ s.regex = `(?:${patterns})$`;
  }
  if (s.template) {
- const { pattern, captureNames } = expandTokensWithCaptures(s.template);
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(s.template));
  s.regex = pattern;
  allCaptureNames = [...allCaptureNames, ...captureNames];
  }
@@ -1480,5 +1509,160 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };

  //#endregion
- export { TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
+ //#region src/pattern-detection.ts
+ /**
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
+ *
+ * @module pattern-detection
+ */
+ /**
+ * Token detection order - more specific patterns first to avoid partial matches.
+ * Example: 'raqms' before 'raqm' so "٣٤" matches 'raqms' not just the first digit.
+ *
+ * Tokens not in this list are appended in alphabetical order from TOKEN_PATTERNS.
+ */
+ const TOKEN_PRIORITY_ORDER = [
+ "basmalah",
+ "kitab",
+ "bab",
+ "fasl",
+ "naql",
+ "numbered",
+ "raqms",
+ "raqm",
+ "tarqim",
+ "bullet",
+ "dash",
+ "harf"
+ ];
+ /**
+ * Gets the token detection priority order.
+ * Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
+ */
+ const getTokenPriority = () => {
+ const allTokens = getAvailableTokens();
+ const prioritized = TOKEN_PRIORITY_ORDER.filter((t) => allTokens.includes(t));
+ const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
+ return [...prioritized, ...remaining];
+ };
+ /**
+ * Analyzes text and returns all detected token patterns with their positions.
+ * Patterns are detected in priority order to avoid partial matches.
+ *
+ * @param text - The text to analyze for token patterns
+ * @returns Array of detected patterns sorted by position
+ *
+ * @example
+ * detectTokenPatterns("٣٤ - حدثنا")
+ * // Returns: [
+ * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
+ * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
+ * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
+ * // ]
+ */
+ const detectTokenPatterns = (text) => {
+ if (!text) return [];
+ const results = [];
+ const coveredRanges = [];
+ const isPositionCovered = (start, end) => {
+ return coveredRanges.some(([s, e]) => start >= s && start < e || end > s && end <= e || start <= s && end >= e);
+ };
+ for (const tokenName of getTokenPriority()) {
+ const pattern = TOKEN_PATTERNS[tokenName];
+ if (!pattern) continue;
+ try {
+ const regex = new RegExp(`(${pattern})`, "gu");
+ let match;
+ while ((match = regex.exec(text)) !== null) {
+ const startIndex = match.index;
+ const endIndex = startIndex + match[0].length;
+ if (isPositionCovered(startIndex, endIndex)) continue;
+ results.push({
+ endIndex,
+ index: startIndex,
+ match: match[0],
+ token: tokenName
+ });
+ coveredRanges.push([startIndex, endIndex]);
+ }
+ } catch {}
+ }
+ return results.sort((a, b) => a.index - b.index);
+ };
+ /**
+ * Generates a template pattern from text using detected tokens.
+ * Replaces matched portions with {{token}} syntax.
+ *
+ * @param text - Original text
+ * @param detected - Array of detected patterns from detectTokenPatterns
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
+ *
+ * @example
+ * const detected = detectTokenPatterns("٣٤ - ");
+ * generateTemplateFromText("٣٤ - ", detected);
+ * // Returns: "{{raqms}} {{dash}} "
+ */
+ const generateTemplateFromText = (text, detected) => {
+ if (!text || detected.length === 0) return text;
+ let template = text;
+ const sortedByIndexDesc = [...detected].sort((a, b) => b.index - a.index);
+ for (const d of sortedByIndexDesc) template = `${template.slice(0, d.index)}{{${d.token}}}${template.slice(d.endIndex)}`;
+ return template;
+ };
+ /**
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
+ *
+ * @param detected - Array of detected patterns
+ * @returns Suggested pattern type and whether to use fuzzy matching
+ */
+ const suggestPatternConfig = (detected) => {
+ const hasStructuralToken = detected.some((d) => [
+ "basmalah",
+ "kitab",
+ "bab",
+ "fasl"
+ ].includes(d.token));
+ const hasNumberedPattern = detected.some((d) => [
+ "raqms",
+ "raqm",
+ "numbered"
+ ].includes(d.token));
+ if (hasStructuralToken) return {
+ fuzzy: true,
+ metaType: detected.find((d) => [
+ "kitab",
+ "bab",
+ "fasl"
+ ].includes(d.token))?.token || "chapter",
+ patternType: "lineStartsWith"
+ };
+ if (hasNumberedPattern) return {
+ fuzzy: false,
+ metaType: "hadith",
+ patternType: "lineStartsAfter"
+ };
+ return {
+ fuzzy: false,
+ patternType: "lineStartsAfter"
+ };
+ };
+ /**
+ * Analyzes text and generates a complete suggested rule configuration.
+ *
+ * @param text - Highlighted text from the page
+ * @returns Suggested rule configuration or null if no patterns detected
+ */
+ const analyzeTextForRule = (text) => {
+ const detected = detectTokenPatterns(text);
+ if (detected.length === 0) return null;
+ return {
+ detected,
+ template: generateTemplateFromText(text, detected),
+ ...suggestPatternConfig(detected)
+ };
+ };
+
+ //#endregion
+ export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
  //# sourceMappingURL=index.mjs.map
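
The new `src/pattern-detection.ts` module added above is a small pipeline: detect token occurrences, splice them back into the text as `{{token}}` placeholders, then suggest a rule configuration. A hedged usage sketch; the sample text mirrors the JSDoc examples, the expected template is an inference from `generateTemplateFromText`'s documented behavior, and the import specifier is assumed:

```ts
import { analyzeTextForRule, detectTokenPatterns, generateTemplateFromText } from 'flappa-doormal';

const highlighted = '٣٤ - حدثنا';

// Step by step: find token matches, then rewrite them as {{token}} placeholders.
const detected = detectTokenPatterns(highlighted);                 // raqms, dash, naql per the JSDoc example
const template = generateTemplateFromText(highlighted, detected);  // likely '{{raqms}} {{dash}} {{naql}}'

// Or in one call: template plus suggested patternType/fuzzy/metaType, or null if nothing was detected.
const suggestion = analyzeTextForRule(highlighted);
if (suggestion) {
    console.log(suggestion.patternType, suggestion.fuzzy, suggestion.template);
}
```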