baburchi 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -62,7 +62,9 @@ const noiseText = isArabicTextNoise('---'); // true
62
62
 
63
63
  ## API Reference
64
64
 
65
- ### `fixTypo(original, correction, options)`
65
+ ### Core Text Processing
66
+
67
+ #### `fixTypo(original, correction, options)`
66
68
 
67
69
  The main function for correcting typos using text alignment.
68
70
 
@@ -80,7 +82,7 @@ The main function for correcting typos using text alignment.
80
82
 
81
83
  **Returns:** Corrected text string
82
84
 
83
- ### `processTextAlignment(originalText, altText, options)`
85
+ #### `processTextAlignment(originalText, altText, options)`
84
86
 
85
87
  Low-level function for advanced text processing with full configuration control.
86
88
 
@@ -90,6 +92,167 @@ Low-level function for advanced text processing with full configuration control.
90
92
  - `altText` (string): Reference text for alignment
91
93
  - `options` (FixTypoOptions): Complete configuration object
92
94
 
95
+ ### Fuzzy Text Matching
96
+
97
+ #### `findMatches(pages, excerpts, policy?)`
98
+
99
+ Finds the best matching page for each excerpt using exact and fuzzy matching algorithms.
100
+
101
+ **Parameters:**
102
+
103
+ - `pages` (string[]): Array of page texts to search within
104
+ - `excerpts` (string[]): Array of text excerpts to find
105
+ - `policy` (MatchPolicy, optional): Matching configuration
106
+
107
+ **Returns:** `number[]` - Array of page indices (0-based) where each excerpt was found, or -1 if not found
108
+
109
+ **Example:**
110
+
111
+ ```typescript
112
+ import { findMatches } from 'baburchi';
113
+
114
+ const pages = [
115
+ 'هذا النص في الصفحة الأولى مع محتوى إضافي',
116
+ 'النص الثاني يظهر هنا في الصفحة الثانية',
117
+ 'الصفحة الثالثة تحتوي على نص مختلف'
118
+ ];
119
+
120
+ const excerpts = [
121
+ 'النص في الصفحة الأولى',
122
+ 'النص الثاني يظهر',
123
+ 'نص غير موجود'
124
+ ];
125
+
126
+ const matches = findMatches(pages, excerpts);
127
+ console.log(matches); // [0, 1, -1]
128
+ ```
129
+
130
+ #### `findMatchesAll(pages, excerpts, policy?)`
131
+
132
+ Finds all potential matches for each excerpt, ranked by match quality.
133
+
134
+ **Parameters:**
135
+
136
+ - `pages` (string[]): Array of page texts to search within
137
+ - `excerpts` (string[]): Array of text excerpts to find
138
+ - `policy` (MatchPolicy, optional): Matching configuration
139
+
140
+ **Returns:** `number[][]` - Array where each element is an array of page indices ranked by match quality (exact matches first, then fuzzy matches by score)
141
+
142
+ **Example:**
143
+
144
+ ```typescript
145
+ import { findMatchesAll } from 'baburchi';
146
+
147
+ const pages = [
148
+ 'النص الأول مع محتوى مشابه',
149
+ 'محتوى مشابه في النص الثاني',
150
+ 'النص الأول بصيغة مختلفة قليلاً'
151
+ ];
152
+
153
+ const excerpts = ['النص الأول'];
154
+
155
+ const allMatches = findMatchesAll(pages, excerpts);
156
+ console.log(allMatches); // [[0, 2]] - the excerpt occurs verbatim in pages 0 and 2; page 1 does not match
157
+ ```
158
+
159
+ #### Match Policy Configuration
160
+
161
+ The `MatchPolicy` interface allows fine-tuning of the matching algorithm:
162
+
163
+ ```typescript
164
+ interface MatchPolicy {
165
+ enableFuzzy?: boolean; // Enable fuzzy matching (default: true)
166
+ maxEditAbs?: number; // Max absolute edit distance (default: 3)
167
+ maxEditRel?: number; // Max relative edit distance (default: 0.1)
168
+ q?: number; // Q-gram size for indexing (default: 4)
169
+ gramsPerExcerpt?: number; // Q-grams to sample per excerpt (default: 5)
170
+ maxCandidatesPerExcerpt?: number; // Max candidates to evaluate (default: 40)
171
+ seamLen?: number; // Cross-page seam length (default: 512)
172
+ }
173
+ ```
174
+
175
+ **Example with custom policy:**
176
+
177
+ ```typescript
178
+ import { findMatches } from 'baburchi';
179
+
180
+ const customPolicy = {
181
+ enableFuzzy: true,
182
+ maxEditAbs: 6, // Allow more character differences
183
+ maxEditRel: 0.3, // Allow 30% character differences
184
+ q: 4, // Use 4-grams for better precision
185
+ gramsPerExcerpt: 30, // Sample more Q-grams
186
+ maxCandidatesPerExcerpt: 150
187
+ };
188
+
189
+ const matches = findMatches(pages, excerpts, customPolicy);
190
+ ```
191
+
192
+ ### Arabic Text Normalization
193
+
194
+ #### `sanitizeArabic(input, optionsOrPreset)`
195
+
196
+ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabic text.
197
+
198
+ **Parameters:**
199
+
200
+ - `input` (string): The Arabic text to sanitize
201
+ - `optionsOrPreset` (string | object): Either a preset name or custom options
202
+
203
+ **Presets:**
204
+
205
+ - `"light"`: Basic cleanup for display (strips zero-width chars, collapses whitespace)
206
+ - `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
207
+ - `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
208
+
209
+ **Custom Options:**
210
+
211
+ ```typescript
212
+ interface SanitizeOptions {
213
+ base?: 'light' | 'search' | 'aggressive' | 'none';
214
+ stripDiacritics?: boolean;
215
+ stripTatweel?: boolean | 'safe' | 'all';
216
+ normalizeAlif?: boolean;
217
+ replaceAlifMaqsurah?: boolean;
218
+ replaceTaMarbutahWithHa?: boolean;
219
+ stripZeroWidth?: boolean;
220
+ zeroWidthToSpace?: boolean;
221
+ stripLatinAndSymbols?: boolean;
222
+ lettersAndSpacesOnly?: boolean;
223
+ keepOnlyArabicLetters?: boolean;
224
+ collapseWhitespace?: boolean;
225
+ trim?: boolean;
226
+ removeHijriMarker?: boolean;
227
+ }
228
+ ```
229
+
230
+ **Examples:**
231
+
232
+ ```typescript
233
+ import { sanitizeArabic } from 'baburchi';
234
+
235
+ // Light display cleanup
236
+ sanitizeArabic(' مرحبا\u200C\u200D بالعالم ', 'light'); // → 'مرحبا بالعالم'
237
+
238
+ // Tolerant search normalization
239
+ sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // → 'السلام عليكم'
240
+
241
+ // Indexing-friendly text (letters + spaces only)
242
+ sanitizeArabic('اَلسَّلَامُ 1435/3/29 هـ — www', 'aggressive'); // → 'السلام'
243
+
244
+ // Custom: Tatweel-only, preserving dates/list markers
245
+ sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // → 'أبتِكَةُ'
246
+
247
+ // Zero-width controls → spaces
248
+ sanitizeArabic('يَخْلُوَ ‏. ‏ قَالَ غَرِيبٌ ‏. ‏', {
249
+ base: 'none',
250
+ stripZeroWidth: true,
251
+ zeroWidthToSpace: true
252
+ });
253
+ // → 'يَخْلُوَ . قَالَ غَرِيبٌ . '
254
+ ```
255
+
93
256
  ## Usage Examples
94
257
 
95
258
  ### Basic Arabic Text Correction
@@ -190,7 +353,7 @@ Baburchi uses the **Needleman-Wunsch global sequence alignment algorithm** to op
190
353
  Baburchi works in all modern environments:
191
354
 
192
355
  - ✅ Node.js 22+
193
- - ✅ Bun 1.2.20+
356
+ - ✅ Bun 1.2.21+
194
357
  - ✅ Modern browsers (ES2023+)
195
358
  - ✅ Deno (with npm compatibility)
196
359
 
@@ -367,7 +530,6 @@ The library also exports utility functions for advanced use cases:
367
530
  ```typescript
368
531
  import {
369
532
  calculateSimilarity,
370
- normalizeArabicText,
371
533
  tokenizeText,
372
534
  alignTokenSequences,
373
535
  hasInvalidFootnotes,
@@ -380,9 +542,6 @@ import {
380
542
  // Calculate similarity between two strings
381
543
  const similarity = calculateSimilarity('hello', 'helo'); // 0.8
382
544
 
383
- // Normalize Arabic text
384
- const normalized = normalizeArabicText('اَلسَّلَامُ'); // 'السلام'
385
-
386
545
  // Tokenize with symbol preservation
387
546
  const tokens = tokenizeText('محمد ﷺ رسول', ['ﷺ']); // ['محمد', 'ﷺ', 'رسول']
388
547
 
package/dist/index.d.ts CHANGED
@@ -1,32 +1,3 @@
1
- /**
2
- * Configuration options for fixing typos in OCR text using alignment algorithms.
3
- * These options control how text tokens are compared, aligned, and merged during typo correction.
4
- */
5
- type FixTypoOptions = {
6
- /**
7
- * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
8
- * Used in post-processing to eliminate redundant tokens that are nearly identical.
9
- * Should typically be higher than similarityThreshold to catch only very similar duplicates.
10
- * @default 0.9
11
- * @example 0.95 // Removes tokens that are 95% or more similar
12
- */
13
- readonly highSimilarityThreshold: number;
14
- /**
15
- * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
16
- * Higher values require closer matches, lower values are more permissive.
17
- * Used in the Needleman-Wunsch alignment algorithm for token matching.
18
- * @default 0.7
19
- * @example 0.8 // Requires 80% similarity for token alignment
20
- */
21
- readonly similarityThreshold: number;
22
- /**
23
- * Array of special symbols that should be preserved during typo correction.
24
- * These symbols (like honorifics or religious markers) take precedence in token selection.
25
- * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
26
- */
27
- readonly typoSymbols: string[];
28
- };
29
-
30
1
  /**
31
2
  * Aligns split text segments to match target lines by finding the best order.
32
3
  *
@@ -214,6 +185,92 @@ type TextLine = {
214
185
  */
215
186
  declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
216
187
 
188
+ /**
189
+ * Configuration options for fixing typos in OCR text using alignment algorithms.
190
+ * These options control how text tokens are compared, aligned, and merged during typo correction.
191
+ */
192
+ type FixTypoOptions = {
193
+ /**
194
+ * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
195
+ * Used in post-processing to eliminate redundant tokens that are nearly identical.
196
+ * Should typically be higher than similarityThreshold to catch only very similar duplicates.
197
+ * @default 0.9
198
+ * @example 0.95 // Removes tokens that are 95% or more similar
199
+ */
200
+ readonly highSimilarityThreshold: number;
201
+ /**
202
+ * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
203
+ * Higher values require closer matches, lower values are more permissive.
204
+ * Used in the Needleman-Wunsch alignment algorithm for token matching.
205
+ * @default 0.7
206
+ * @example 0.8 // Requires 80% similarity for token alignment
207
+ */
208
+ readonly similarityThreshold: number;
209
+ /**
210
+ * Array of special symbols that should be preserved during typo correction.
211
+ * These symbols (like honorifics or religious markers) take precedence in token selection.
212
+ * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
213
+ */
214
+ readonly typoSymbols: string[];
215
+ };
216
+ type MatchPolicy = {
217
+ /** Try approximate matches for leftovers (default true). */
218
+ enableFuzzy?: boolean;
219
+ /** Max absolute edit distance accepted in fuzzy (default 3). */
220
+ maxEditAbs?: number;
221
+ /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
222
+ maxEditRel?: number;
223
+ /** q-gram length for candidate generation (default 4). */
224
+ q?: number;
225
+ /** Max rare grams to seed candidates per excerpt (default 5). */
226
+ gramsPerExcerpt?: number;
227
+ /** Max candidate windows verified per excerpt (default 40). */
228
+ maxCandidatesPerExcerpt?: number;
229
+ /** Seam length for bleed windows (default 512). */
230
+ seamLen?: number;
231
+ /**
232
+ * Optional logging function for debugging.
233
+ */
234
+ log?(message?: any, ...optionalParams: any[]): void;
235
+ };
236
+
237
+ /**
238
+ * Main function to find the single best match per excerpt.
239
+ * Combines exact matching with fuzzy matching for comprehensive text search.
240
+ *
241
+ * @param pages - Array of page texts to search within
242
+ * @param excerpts - Array of text excerpts to find matches for
243
+ * @param policy - Optional matching policy configuration
244
+ * @returns Array of page indices (one per excerpt, -1 if no match found)
245
+ *
246
+ * @example
247
+ * ```typescript
248
+ * const pages = ['Hello world', 'Goodbye world'];
249
+ * const excerpts = ['Hello', 'Good bye']; // Note the typo
250
+ * const matches = findMatches(pages, excerpts, { enableFuzzy: true });
251
+ * // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
252
+ * ```
253
+ */
254
+ declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
255
+ /**
256
+ * Main function to find all matches per excerpt, ranked by quality.
257
+ * Returns comprehensive results with both exact and fuzzy matches for each excerpt.
258
+ *
259
+ * @param pages - Array of page texts to search within
260
+ * @param excerpts - Array of text excerpts to find matches for
261
+ * @param policy - Optional matching policy configuration
262
+ * @returns Array of page index arrays (one array per excerpt, sorted by match quality)
263
+ *
264
+ * @example
265
+ * ```typescript
266
+ * const pages = ['Hello world', 'Hello there', 'Goodbye world'];
267
+ * const excerpts = ['Hello'];
268
+ * const matches = findMatchesAll(pages, excerpts);
269
+ * // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
270
+ * ```
271
+ */
272
+ declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
273
+
217
274
  /**
218
275
  * Character statistics for analyzing text content and patterns
219
276
  */
@@ -382,6 +439,19 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
382
439
  */
383
440
  declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
384
441
 
442
+ /**
443
+ * Processes text alignment between original and alternate OCR results to fix typos.
444
+ * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
445
+ * then selects the best tokens and performs post-processing.
446
+ *
447
+ * @param originalText - Original OCR text that may contain typos
448
+ * @param altText - Reference text from alternate OCR for comparison
449
+ * @param options - Configuration options for alignment and selection
450
+ * @returns Corrected text with typos fixed
451
+ */
452
+ declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
453
+ declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
454
+
385
455
  /**
386
456
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
387
457
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -396,6 +466,89 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
396
466
  * calculateLevenshteinDistance('', 'hello') // Returns 5
397
467
  */
398
468
  declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
469
+ /**
470
+ * Calculates bounded Levenshtein distance with early termination.
471
+ * More efficient when you only care about distances up to a threshold.
472
+ */
473
+ declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
474
+
475
+ /**
476
+ * Ultra-fast Arabic text sanitizer for search/indexing/display.
477
+ * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
478
+ * Options can merge over a base preset or `'none'` to apply exactly the rules you request.
479
+ */
480
+ type SanitizePreset = 'light' | 'search' | 'aggressive';
481
+ type SanitizeBase = 'none' | SanitizePreset;
482
+ /**
483
+ * Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
484
+ * `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
485
+ * directly into local booleans for speed.
486
+ */
487
+ type SanitizeOptions = {
488
+ /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
489
+ base?: SanitizeBase;
490
+ /** Unicode NFC normalization. Default: `true` in all presets. */
491
+ nfc?: boolean;
492
+ /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
493
+ stripZeroWidth?: boolean;
494
+ /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
495
+ zeroWidthToSpace?: boolean;
496
+ /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
497
+ stripDiacritics?: boolean;
498
+ /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
499
+ stripFootnotes?: boolean;
500
+ /**
501
+ * Remove tatweel (ـ).
502
+ * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
503
+ * - `'safe'` or `'all'` explicitly
504
+ * - `false` to keep tatweel
505
+ * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
506
+ */
507
+ stripTatweel?: boolean | 'safe' | 'all';
508
+ /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
509
+ normalizeAlif?: boolean;
510
+ /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
511
+ replaceAlifMaqsurah?: boolean;
512
+ /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
513
+ replaceTaMarbutahWithHa?: boolean;
514
+ /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
515
+ stripLatinAndSymbols?: boolean;
516
+ /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
517
+ keepOnlyArabicLetters?: boolean;
518
+ /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
519
+ lettersAndSpacesOnly?: boolean;
520
+ /** Collapse runs of whitespace to a single space. Default: `true`. */
521
+ collapseWhitespace?: boolean;
522
+ /** Trim leading/trailing whitespace. Default: `true`. */
523
+ trim?: boolean;
524
+ /**
525
+ * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
526
+ * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
527
+ * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
528
+ */
529
+ removeHijriMarker?: boolean;
530
+ };
531
+ /**
532
+ * Sanitizes Arabic text according to a preset or custom options.
533
+ *
534
+ * Presets:
535
+ * - `'light'`: NFC, zero-width removal, collapse/trim spaces.
536
+ * - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
537
+ * - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
538
+ *
539
+ * Custom options:
540
+ * - Passing an options object overlays the selected `base` preset (default `'light'`).
541
+ * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
542
+ *
543
+ * Examples:
544
+ * ```ts
545
+ * sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
546
+ * sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '' (digits, punctuation and the Hijri marker are dropped; only Arabic letters and spaces are kept)
547
+ * sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
548
+ * ```
549
+ */
550
+ declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
551
+
399
552
  /**
400
553
  * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
401
554
  * Uses Levenshtein distance normalized by the length of the longer string.
@@ -487,8 +640,6 @@ declare const PATTERNS: {
487
640
  arabicPunctuationAndWhitespace: RegExp;
488
641
  /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
489
642
  arabicReferenceRegex: RegExp;
490
- /** Matches Arabic diacritical marks (harakat, tanween, etc.) */
491
- diacritics: RegExp;
492
643
  /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
493
644
  footnoteEmbedded: RegExp;
494
645
  /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
@@ -499,22 +650,9 @@ declare const PATTERNS: {
499
650
  ocrConfusedFootnoteReferenceRegex: RegExp;
500
651
  /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
501
652
  ocrConfusedReferenceRegex: RegExp;
502
- /** Matches Arabic tatweel (kashida) character used for text stretching */
503
- tatweel: RegExp;
504
653
  /** Matches one or more whitespace characters */
505
654
  whitespace: RegExp;
506
655
  };
507
- /**
508
- * Normalizes Arabic text by removing diacritics, and tatweel marks.
509
- * This normalization enables better text comparison by focusing on core characters
510
- * while ignoring decorative elements that don't affect meaning.
511
- *
512
- * @param text - Arabic text to normalize
513
- * @returns Normalized text with diacritics, tatweel, and basic tags removed
514
- * @example
515
- * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
516
- */
517
- declare const normalizeArabicText: (text: string) => string;
518
656
  /**
519
657
  * Extracts the first sequence of Arabic or Western digits from text.
520
658
  * Used primarily for footnote number comparison to match related footnote elements.
@@ -528,8 +666,8 @@ declare const normalizeArabicText: (text: string) => string;
528
666
  declare const extractDigits: (text: string) => string;
529
667
  /**
530
668
  * Tokenizes text into individual words while preserving special symbols.
531
- * Removes HTML tags, adds spacing around preserved symbols to ensure they
532
- * are tokenized separately, then splits on whitespace.
669
+ * Adds spacing around preserved symbols to ensure they are tokenized separately,
670
+ * then splits on whitespace.
533
671
  *
534
672
  * @param text - Text to tokenize
535
673
  * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
@@ -578,6 +716,36 @@ declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null
578
716
  * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
579
717
  */
580
718
  declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
719
+ /**
720
+ * Removes simple footnote references from Arabic text.
721
+ * Handles footnotes in the format (¬[Arabic numerals]) where ¬ is the not symbol (U+00AC).
722
+ *
723
+ * @param text - The input text containing footnote references to remove
724
+ * @returns The text with footnote references removed and extra spaces normalized
725
+ *
726
+ * @example
727
+ * ```typescript
728
+ * removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية")
729
+ * // Returns: "هذا النص يحتوي على حاشية"
730
+ * ```
731
+ */
732
+ declare const removeFootnoteReferencesSimple: (text: string) => string;
733
+ /**
734
+ * Removes single digit footnote references and extended footnote formats from Arabic text.
735
+ * Handles footnotes in the format:
736
+ * - ([single Arabic digit]) - e.g., (٣)
737
+ * - ([single Arabic digit] [single Arabic letter]) - e.g., (٣ م), (٥ ه), (٧ ب)
738
+ *
739
+ * @param text - The input text containing footnote references to remove
740
+ * @returns The text with footnote references removed and extra spaces normalized
741
+ *
742
+ * @example
743
+ * ```typescript
744
+ * removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي")
745
+ * // Returns: "هذا النص والآخر والثالث يحتوي على حواشي"
746
+ * ```
747
+ */
748
+ declare const removeSingleDigitFootnoteReferences: (text: string) => string;
581
749
  /**
582
750
  * Standardizes standalone Hijri symbol ه to هـ when following Arabic digits
583
751
  * @param text - Input text to process
@@ -591,17 +759,4 @@ declare const standardizeHijriSymbol: (text: string) => string;
591
759
  */
592
760
  declare const standardizeIntahaSymbol: (text: string) => string;
593
761
 
594
- /**
595
- * Processes text alignment between original and alternate OCR results to fix typos.
596
- * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
597
- * then selects the best tokens and performs post-processing.
598
- *
599
- * @param originalText - Original OCR text that may contain typos
600
- * @param altText - Reference text from alternate OCR for comparison
601
- * @param options - Configuration options for alignment and selection
602
- * @returns Corrected text with typos fixed
603
- */
604
- declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
605
- declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
606
-
607
- export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, normalizeArabicText, processTextAlignment, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
762
+ export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
package/dist/index.js CHANGED
@@ -1,3 +1,3 @@
1
- var I="\u0627\u0647\u0640",u={arabicCharacters:/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,diacritics:/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,tatweel:/\u0640/g,whitespace:/\s+/},g=e=>e.replace(u.tatweel,"").replace(u.diacritics,"").trim(),T=e=>{let t=e.match(u.arabicDigits);return t?t[0]:""},A=(e,t=[])=>{let r=e;for(let n of t){let o=new RegExp(n,"g");r=r.replace(o,` ${n} `)}return r.trim().split(u.whitespace).filter(Boolean)},S=(e,t,r)=>{let n=u.footnoteStandalone.test(t),o=u.footnoteEmbedded.test(r),s=u.footnoteStandalone.test(r),a=u.footnoteEmbedded.test(t),i=T(t),c=T(r);return n&&o&&i===c?(e[e.length-1]=r,!0):!!(a&&s&&i===c)},E=(e,t)=>{let r=u.footnoteEmbedded.test(e),n=u.footnoteEmbedded.test(t);return r&&!n?[e]:n&&!r?[t]:r&&n?[e.length<=t.length?e:t]:null},N=(e,t)=>{let r=u.footnoteStandalone.test(e),n=u.footnoteStandalone.test(t);return r&&!n?[e,t]:n&&!r?[t,e]:r&&n?[e.length<=t.length?e:t]:null},oe=e=>e.replace(/([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu,"$1 \u0647\u0640"),se=e=>e.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/g,`$1${I}`);var h={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},$=(e,t)=>{let r=e.length,n=t.length;if(r===0)return n;if(n===0)return r;let[o,s]=r<=n?[e,t]:[t,e],a=o.length,i=s.length,c=Array.from({length:a+1},(f,d)=>d);for(let f=1;f<=i;f++){let d=[f];for(let p=1;p<=a;p++){let 
C=s[f-1]===o[p-1]?0:1,l=Math.min(c[p]+1,d[p-1]+1,c[p-1]+C);d.push(l)}c=d}return c[a]},x=(e,t)=>{let r=Math.max(e.length,t.length)||1,n=$(e,t);return(r-n)/r},R=(e,t,r=.6)=>{let n=g(e),o=g(t);return x(n,o)>=r},v=(e,t,r,n)=>{let o=g(e),s=g(t);if(o===s)return h.PERFECT_MATCH;let a=r.includes(e)||r.includes(t),i=x(o,s)>=n;return a||i?h.SOFT_MATCH:h.MISMATCH_PENALTY},_=(e,t,r)=>{let n=[],o=t.length,s=r.length;for(;o>0||s>0;)switch(e[o][s].direction){case"diagonal":n.push([t[--o],r[--s]]);break;case"left":n.push([null,r[--s]]);break;case"up":n.push([t[--o],null]);break;default:throw new Error("Invalid alignment direction")}return n.reverse()},B=(e,t,r,n)=>{let o=e.length,s=t.length,a=Array.from({length:o+1},()=>Array.from({length:s+1},()=>({direction:null,score:0})));for(let i=1;i<=o;i++)a[i][0]={direction:"up",score:i*h.GAP_PENALTY};for(let i=1;i<=s;i++)a[0][i]={direction:"left",score:i*h.GAP_PENALTY};for(let i=1;i<=o;i++)for(let c=1;c<=s;c++){let f=v(e[i-1],t[c-1],r,n),d=a[i-1][c-1].score+f,p=a[i-1][c].score+h.GAP_PENALTY,C=a[i][c-1].score+h.GAP_PENALTY,l=Math.max(d,p,C),m="left";l===d?m="diagonal":l===p&&(m="up"),a[i][c]={direction:m,score:l}}return _(a,e,t)};var fe=(e,t)=>{let r=[],n=0;for(let o of e){if(n>=t.length)break;if(o){let{result:s,segmentsConsumed:a}=D(o,t,n);s&&r.push(s),n+=a}else r.push(t[n]),n++}return n<t.length&&r.push(...t.slice(n)),r},q=(e,t,r)=>{let n=`${t} ${r}`,o=`${r} ${t}`,s=g(e),a=x(s,g(n)),i=x(s,g(o));return a>=i?n:o},D=(e,t,r)=>{let n=t[r];if(R(e,n))return{result:n,segmentsConsumed:1};let o=t[r],s=t[r+1];return!o||!s?o?{result:o,segmentsConsumed:1}:{result:"",segmentsConsumed:0}:{result:q(e,o,s),segmentsConsumed:2}};var P=e=>{let t=[],r=0,n=-1;for(let s=0;s<e.length;s++)e[s]==='"'&&(r++,n=s);let o=r%2===0;return!o&&n!==-1&&t.push({char:'"',index:n,reason:"unmatched",type:"quote"}),{errors:t,isBalanced:o}},H={"\xAB":"\xBB","(":")","[":"]","{":"}"},L=new Set(["\xAB","(","[","{"]),V=new Set(["\xBB",")","]","}"]),M=e=>{let t=[],r=[];for(let 
n=0;n<e.length;n++){let o=e[n];if(L.has(o))r.push({char:o,index:n});else if(V.has(o)){let s=r.pop();s?H[s.char]!==o&&(t.push({char:s.char,index:s.index,reason:"mismatched",type:"bracket"}),t.push({char:o,index:n,reason:"mismatched",type:"bracket"})):t.push({char:o,index:n,reason:"unmatched",type:"bracket"})}}return r.forEach(({char:n,index:o})=>{t.push({char:n,index:o,reason:"unclosed",type:"bracket"})}),{errors:t,isBalanced:t.length===0}},O=e=>{let t=P(e),r=M(e);return{errors:[...t.errors,...r.errors].sort((n,o)=>n.index-o.index),isBalanced:t.isBalanced&&r.isBalanced}},ge=e=>{let t=[],r=e.split(`
2
- `),n=0;return r.forEach((o,s)=>{if(o.length>10){let a=O(o);a.isBalanced||a.errors.forEach(i=>{t.push({absoluteIndex:n+i.index,char:i.char,reason:i.reason,type:i.type})})}n+=o.length+(s<r.length-1?1:0)}),t},me=e=>P(e).isBalanced,pe=e=>M(e).isBalanced,be=e=>O(e).isBalanced;var j="()",Y=e=>u.invalidReferenceRegex.test(e),G=new Intl.NumberFormat("ar-SA"),K=e=>G.format(e),F=e=>({1:"\u0661",9:"\u0669",".":"\u0660",O:"\u0665",o:"\u0665",V:"\u0667",v:"\u0667"})[e]||e,Z=e=>{let t={"\u0660":"0","\u0661":"1","\u0662":"2","\u0663":"3","\u0664":"4","\u0665":"5","\u0666":"6","\u0667":"7","\u0668":"8","\u0669":"9"},r=e.replace(/[()]/g,""),n="";for(let s of r)n+=t[s];let o=parseInt(n,10);return isNaN(o)?0:o},w=e=>{let t=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(u.arabicReferenceRegex)||[]),r=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(u.ocrConfusedReferenceRegex)||[]),n=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(u.arabicFootnoteReferenceRegex)||[]),o=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(u.ocrConfusedFootnoteReferenceRegex)||[]),s=r.map(i=>i.replace(/[.1OV9]/g,c=>F(c))),a=o.map(i=>i.replace(/[.1OV9]/g,c=>F(c)));return{bodyReferences:[...t,...s],footnoteReferences:[...n,...a],ocrConfusedInBody:r,ocrConfusedInFootnotes:o}},Q=(e,t)=>{if(e.some(s=>Y(s.text)))return!0;let n=new Set(t.bodyReferences),o=new Set(t.footnoteReferences);if(n.size!==o.size)return!0;for(let s of n)if(!o.has(s))return!0;return!1},Ce=e=>{let t=w(e);if(!Q(e,t))return e;let r=e.map(l=>{let m=l.text,y=/\([.1OV9]+\)/g;return m=m.replace(y,b=>b.replace(/[.1OV9]/g,z=>F(z))),{...l,text:m}}),n=w(r),o=new Set(n.bodyReferences),s=new Set(n.footnoteReferences),a=[...new Set(n.bodyReferences)],i=[...new Set(n.footnoteReferences)],c=a.filter(l=>!s.has(l)),f=i.filter(l=>!o.has(l)),d=[...o,...s],C={count:(d.length>0?Math.max(0,...d.map(l=>Z(l))):0)+1};return r.map(l=>{if(!l.text.includes(j))return l;let m=l.text;return m=m.replace(/\(\)/g,()=>{if(l.isFootnote){let b=c.shift();if(b)return 
b}else{let b=f.shift();if(b)return b}let y=`(${K(C.count)})`;return C.count++,y}),{...l,text:m}})};var Ae=e=>{if(!e||e.trim().length===0)return!0;let t=e.trim(),r=t.length;if(r<2||J(t))return!0;let n=U(t);if(W(n,r))return!0;let o=u.arabicCharacters.test(t);return!o&&/[a-zA-Z]/.test(t)?!0:o?!ee(n,r):X(n,r,t)};function U(e){let t={arabicCount:0,charFreq:new Map,digitCount:0,latinCount:0,punctuationCount:0,spaceCount:0,symbolCount:0},r=Array.from(e);for(let n of r)t.charFreq.set(n,(t.charFreq.get(n)||0)+1),u.arabicCharacters.test(n)?t.arabicCount++:/\d/.test(n)?t.digitCount++:/[a-zA-Z]/.test(n)?t.latinCount++:/\s/.test(n)?t.spaceCount++:/[.,;:()[\]{}"""''`]/.test(n)?t.punctuationCount++:t.symbolCount++;return t}function W(e,t){let r=0,n=["!",".","-","=","_"];for(let[o,s]of e.charFreq)s>=5&&n.includes(o)&&(r+=s);return r/t>.4}function J(e){return[/^[-=_━≺≻\s]*$/,/^[.\s]*$/,/^[!\s]*$/,/^[A-Z\s]*$/,/^[-\d\s]*$/,/^\d+\s*$/,/^[A-Z]\s*$/,/^[—\s]*$/,/^[्र\s-]*$/].some(r=>r.test(e))}function X(e,t,r){let n=e.arabicCount+e.latinCount+e.digitCount;return n===0||k(e,n,t)?!0:/[٠-٩]/.test(r)&&e.digitCount>=3?!1:(e.symbolCount+Math.max(0,e.punctuationCount-5))/Math.max(n,1)>2||t<=5&&e.arabicCount===0&&!(/^\d+$/.test(r)&&e.digitCount>=3)?!0:/^\d{3,4}$/.test(r)?!1:t<=10}function k(e,t,r){let{arabicCount:n,spaceCount:o}=e;return o>0&&t===o+1&&t<=5||r<=10&&o>=2&&n===0||o/r>.6}function ee(e,t){return e.arabicCount>=3||e.arabicCount>=1&&e.digitCount>0&&t<=20||e.arabicCount>=2&&e.punctuationCount<=2&&t<=10||e.arabicCount>=1&&t<=5&&e.punctuationCount<=1}var te=(e,t,{similarityThreshold:r,typoSymbols:n})=>{if(e===null)return[t];if(t===null)return[e];if(g(e)===g(t))return[e];let o=E(e,t);if(o)return o;let s=N(e,t);if(s)return s;if(n.includes(e)||n.includes(t)){let f=n.find(d=>d===e||d===t);return f?[f]:[e]}let a=g(e),i=g(t);return[x(a,i)>r?e:t]},ne=(e,t)=>{if(e.length===0)return e;let r=[];for(let n of e){if(r.length===0){r.push(n);continue}let 
o=r.at(-1);if(R(o,n,t)){n.length<o.length&&(r[r.length-1]=n);continue}S(r,o,n)||r.push(n)}return r},re=(e,t,r)=>{let n=A(e,r.typoSymbols),o=A(t,r.typoSymbols),a=B(n,o,r.typoSymbols,r.similarityThreshold).flatMap(([c,f])=>te(c,f,r));return ne(a,r.highSimilarityThreshold).join(" ")},Ee=(e,t,{highSimilarityThreshold:r=.8,similarityThreshold:n=.6,typoSymbols:o})=>re(e,t,{highSimilarityThreshold:r,similarityThreshold:n,typoSymbols:o});export{H as BRACKETS,V as CLOSE_BRACKETS,I as INTAHA_ACTUAL,L as OPEN_BRACKETS,u as PATTERNS,fe as alignTextSegments,B as alignTokenSequences,U as analyzeCharacterStats,pe as areBracketsBalanced,me as areQuotesBalanced,R as areSimilarAfterNormalization,_ as backtrackAlignment,v as calculateAlignmentScore,$ as calculateLevenshteinDistance,x as calculateSimilarity,O as checkBalance,Ce as correctReferences,T as extractDigits,Ee as fixTypo,ge as getUnbalancedErrors,S as handleFootnoteFusion,E as handleFootnoteSelection,N as handleStandaloneFootnotes,W as hasExcessiveRepetition,Y as hasInvalidFootnotes,Ae as isArabicTextNoise,be as isBalanced,J as isBasicNoisePattern,X as isNonArabicNoise,k as isSpacingNoise,ee as isValidArabicContent,g as normalizeArabicText,re as processTextAlignment,oe as standardizeHijriSymbol,se as standardizeIntahaSymbol,A as tokenizeText};
1
+ var zt="\u0627\u0647\u0640",h={arabicCharacters:/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,whitespace:/\s+/},N=e=>{let t=e.match(h.arabicDigits);return t?t[0]:""},w=(e,t=[])=>{let n=e;for(let r of t){let o=new RegExp(r,"g");n=n.replace(o,` ${r} `)}return n.trim().split(h.whitespace).filter(Boolean)},k=(e,t,n)=>{let r=h.footnoteStandalone.test(t),o=h.footnoteEmbedded.test(n),s=h.footnoteStandalone.test(n),a=h.footnoteEmbedded.test(t),i=N(t),c=N(n);return r&&o&&i===c?(e[e.length-1]=n,!0):!!(a&&s&&i===c)},W=(e,t)=>{let n=h.footnoteEmbedded.test(e),r=h.footnoteEmbedded.test(t);return n&&!r?[e]:r&&!n?[t]:n&&r?[e.length<=t.length?e:t]:null},$=(e,t)=>{let n=h.footnoteStandalone.test(e),r=h.footnoteStandalone.test(t);return n&&!r?[e,t]:r&&!n?[t,e]:n&&r?[e.length<=t.length?e:t]:null},D=e=>e.replace(/\s*\(\u00AC[\u0660-\u0669]+\)\s*/g," ").replace(/ +/g," ").trim(),j=e=>e.replace(/\s*\([٠-٩]{1}(\s+[\u0600-\u06FF])?\)\s*/g," ").replace(/ +/g," ").trim(),wt=e=>e.replace(/([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu,"$1 \u0647\u0640"),vt=e=>e.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu,"$1\u0627\u0647\u0640");var 
se=/\s+/g,Z=/\u0640/g,ae=/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,ie=/[أإآٱ]/g,ce=/\u0649/g,le=/\u0629/g,ue=/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g,fe=/[A-Za-z]+[0-9]*|[0-9]+|[¬§`=]|[/]{2,}|[&]|[ﷺ]/g,me=/[^\u0621-\u063A\u0641-\u064A\u0671\u067E\u0686\u06A4-\u06AF\u06CC\u06D2\u06D3]/g,ge=/[^\u0621-\u063A\u0641-\u064A\u0671\u067E\u0686\u06A4-\u06AF\u06CC\u06D2\u06D3\s]/g,pe=e=>e===32,de=e=>e>=48&&e<=57||e>=1632&&e<=1641,he=e=>e.replace(Z,(t,n,r)=>{let o=n-1;for(;o>=0&&pe(r.charCodeAt(o));)o--;if(o>=0){let s=r.charCodeAt(o);if(de(s)||s===1607)return"\u0640"}return""}),be=e=>e.replace(/([0-9\u0660-\u0669][0-9\u0660-\u0669/\-\s]*?)\s*ه(?:ـ)?(?=(?:\s|$|[^\p{L}\p{N}]))/gu,"$1"),Ae=(e,t)=>t&&e.normalize?e.normalize("NFC"):e,xe=(e,t,n)=>t?e.replace(ue,n?" ":""):e,ye=(e,t,n)=>(t&&(e=e.replace(ae,"")),n==="safe"?he(e):n==="all"?e.replace(Z,""):e),Se=(e,t,n,r)=>(t&&(e=e.replace(ie,"\u0627")),n&&(e=e.replace(ce,"\u064A")),r&&(e=e.replace(le,"\u0647")),e),Me=(e,t)=>t?e.replace(fe," "):e,Ce=(e,t,n)=>t?e.replace(ge," "):n?e.replace(me,""):e,Te=(e,t,n)=>(t&&(e=e.replace(se," ")),n&&(e=e.trim()),e),b=(e,t)=>t===void 0?e:!!t,Re=(e,t)=>t===void 
0?e:t===!0?"safe":t===!1?!1:t,G={aggressive:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!0,nfc:!0,normalizeAlif:!0,removeHijriMarker:!0,replaceAlifMaqsurah:!0,replaceTaMarbutahWithHa:!0,stripDiacritics:!0,stripFootnotes:!0,stripLatinAndSymbols:!0,stripTatweel:"all",stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1},light:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!0,normalizeAlif:!1,removeHijriMarker:!1,replaceAlifMaqsurah:!1,replaceTaMarbutahWithHa:!1,stripDiacritics:!1,stripFootnotes:!1,stripLatinAndSymbols:!1,stripTatweel:!1,stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1},search:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!0,normalizeAlif:!0,removeHijriMarker:!0,replaceAlifMaqsurah:!0,replaceTaMarbutahWithHa:!1,stripDiacritics:!0,stripFootnotes:!0,stripLatinAndSymbols:!1,stripTatweel:"all",stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1}},Fe={collapseWhitespace:!1,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!1,normalizeAlif:!1,removeHijriMarker:!1,replaceAlifMaqsurah:!1,replaceTaMarbutahWithHa:!1,stripDiacritics:!1,stripFootnotes:!1,stripLatinAndSymbols:!1,stripTatweel:!1,stripZeroWidth:!1,trim:!1,zeroWidthToSpace:!1},d=(e,t="search")=>{if(!e)return"";let n,r=null;if(typeof t=="string")n=G[t];else{let T=t.base??"light";n=T==="none"?Fe:G[T],r=t}let 
o=b(n.nfc,r?.nfc),s=b(n.stripZeroWidth,r?.stripZeroWidth),a=b(n.zeroWidthToSpace,r?.zeroWidthToSpace),i=b(n.stripDiacritics,r?.stripDiacritics),c=b(n.stripFootnotes,r?.stripFootnotes),u=b(n.normalizeAlif,r?.normalizeAlif),l=b(n.replaceAlifMaqsurah,r?.replaceAlifMaqsurah),f=b(n.replaceTaMarbutahWithHa,r?.replaceTaMarbutahWithHa),m=b(n.stripLatinAndSymbols,r?.stripLatinAndSymbols),g=b(n.lettersAndSpacesOnly,r?.lettersAndSpacesOnly),A=b(n.keepOnlyArabicLetters,r?.keepOnlyArabicLetters),M=b(n.collapseWhitespace,r?.collapseWhitespace),x=b(n.trim,r?.trim),C=b(n.removeHijriMarker,r?.removeHijriMarker),z=Re(n.stripTatweel,r?.stripTatweel),p=e;return p=Ae(p,o),p=xe(p,s,a),C&&(p=be(p)),p=ye(p,i,z),p=Se(p,u,l,f),c&&(p=D(p),p=j(p)),g||(p=Me(p,m)),p=Ce(p,g,A),p=Te(p,M,x),p};var V=(e,t)=>{let n=e.length,r=t.length;if(n===0)return r;if(r===0)return n;let[o,s]=n<=r?[e,t]:[t,e],a=o.length,i=s.length,c=Array.from({length:a+1},(u,l)=>l);for(let u=1;u<=i;u++){let l=[u];for(let f=1;f<=a;f++){let m=s[u-1]===o[f-1]?0:1,g=Math.min(c[f]+1,l[f-1]+1,c[f-1]+m);l.push(g)}c=l}return c[a]},Pe=(e,t,n)=>Math.abs(e.length-t.length)>n?n+1:e.length===0?t.length<=n?t.length:n+1:t.length===0?e.length<=n?e.length:n+1:null,Ee=e=>{let t=new Int16Array(e+1),n=new Int16Array(e+1);for(let r=0;r<=e;r++)t[r]=r;return[t,n]},ze=(e,t,n)=>({from:Math.max(1,e-t),to:Math.min(n,e+t)}),we=(e,t,n,r,o,s)=>{let a=e[n-1]===t[r-1]?0:1,i=o[r]+1,c=s[r-1]+1,u=o[r-1]+a;return Math.min(i,c,u)},ve=(e,t,n,r,o,s)=>{let a=t.length,i=r+1,{from:c,to:u}=ze(n,r,a);s[0]=n;let l=n;for(let f=1;f<c;f++)s[f]=i;for(let f=u+1;f<=a;f++)s[f]=i;for(let f=c;f<=u;f++){let m=we(e,t,n,f,o,s);s[f]=m,m<l&&(l=m)}return l},v=(e,t,n)=>{let r=n+1,o=Pe(e,t,n);if(o!==null)return o;if(e.length>t.length)return v(t,e,n);let[s,a]=Ee(t.length);for(let i=1;i<=e.length;i++){if(ve(e,t,i,n,s,a)>n)return r;let u=s;s=a,a=u}return s[t.length]<=n?s[t.length]:r};var y={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},S=(e,t)=>{let 
n=Math.max(e.length,t.length)||1,r=V(e,t);return(n-r)/n},R=(e,t,n=.6)=>{let r=d(e),o=d(t);return S(r,o)>=n},Lt=(e,t,n,r)=>{let o=d(e),s=d(t);if(o===s)return y.PERFECT_MATCH;let a=n.includes(e)||n.includes(t),i=S(o,s)>=r;return a||i?y.SOFT_MATCH:y.MISMATCH_PENALTY},He=(e,t,n)=>{let r=[],o=t.length,s=n.length;for(;o>0||s>0;)switch(e[o][s].direction){case"diagonal":r.push([t[--o],n[--s]]);break;case"left":r.push([null,n[--s]]);break;case"up":r.push([t[--o],null]);break;default:throw new Error("Invalid alignment direction")}return r.reverse()},Ie=(e,t)=>{let n=Array.from({length:e+1},()=>Array.from({length:t+1},()=>({direction:null,score:0})));for(let r=1;r<=e;r++)n[r][0]={direction:"up",score:r*y.GAP_PENALTY};for(let r=1;r<=t;r++)n[0][r]={direction:"left",score:r*y.GAP_PENALTY};return n},Oe=(e,t,n)=>{let r=Math.max(e,t,n);return r===e?{direction:"diagonal",score:r}:r===t?{direction:"up",score:r}:{direction:"left",score:r}},Y=(e,t,n,r)=>{let o=e.length,s=t.length,a=Ie(o,s),i=new Set(n),c=e.map(l=>d(l)),u=t.map(l=>d(l));for(let l=1;l<=o;l++)for(let f=1;f<=s;f++){let m=c[l-1],g=u[f-1],A;if(m===g)A=y.PERFECT_MATCH;else{let T=i.has(e[l-1])||i.has(t[f-1]),oe=S(m,g)>=r;A=T||oe?y.SOFT_MATCH:y.MISMATCH_PENALTY}let M=a[l-1][f-1].score+A,x=a[l-1][f].score+y.GAP_PENALTY,C=a[l][f-1].score+y.GAP_PENALTY,{direction:z,score:p}=Oe(M,x,C);a[l][f]={direction:z,score:p}}return He(a,e,t)};var $t=(e,t)=>{let n=[],r=0;for(let o of e){if(r>=t.length)break;if(o){let{result:s,segmentsConsumed:a}=qe(o,t,r);s&&n.push(s),r+=a}else n.push(t[r]),r++}return r<t.length&&n.push(...t.slice(r)),n},Be=(e,t,n)=>{let r=`${t} ${n}`,o=`${n} ${t}`,s=d(e),a=S(s,d(r)),i=S(s,d(o));return a>=i?r:o},qe=(e,t,n)=>{let r=t[n];if(R(e,r))return{result:r,segmentsConsumed:1};let o=t[n],s=t[n+1];return!o||!s?o?{result:o,segmentsConsumed:1}:{result:"",segmentsConsumed:0}:{result:Be(e,o,s),segmentsConsumed:2}};var U=e=>{let t=[],n=0,r=-1;for(let s=0;s<e.length;s++)e[s]==='"'&&(n++,r=s);let 
o=n%2===0;return!o&&r!==-1&&t.push({char:'"',index:r,reason:"unmatched",type:"quote"}),{errors:t,isBalanced:o}},_e={"\xAB":"\xBB","(":")","[":"]","{":"}"},Le=new Set(["\xAB","(","[","{"]),Ne=new Set(["\xBB",")","]","}"]),X=e=>{let t=[],n=[];for(let r=0;r<e.length;r++){let o=e[r];if(Le.has(o))n.push({char:o,index:r});else if(Ne.has(o)){let s=n.pop();s?_e[s.char]!==o&&(t.push({char:s.char,index:s.index,reason:"mismatched",type:"bracket"}),t.push({char:o,index:r,reason:"mismatched",type:"bracket"})):t.push({char:o,index:r,reason:"unmatched",type:"bracket"})}}return n.forEach(({char:r,index:o})=>{t.push({char:r,index:o,reason:"unclosed",type:"bracket"})}),{errors:t,isBalanced:t.length===0}},Q=e=>{let t=U(e),n=X(e);return{errors:[...t.errors,...n.errors].sort((r,o)=>r.index-o.index),isBalanced:t.isBalanced&&n.isBalanced}},jt=e=>{let t=[],n=e.split(`
2
+ `),r=0;return n.forEach((o,s)=>{if(o.length>10){let a=Q(o);a.isBalanced||a.errors.forEach(i=>{t.push({absoluteIndex:r+i.index,char:i.char,reason:i.reason,type:i.type})})}r+=o.length+(s<n.length-1?1:0)}),t},Gt=e=>U(e).isBalanced,Zt=e=>X(e).isBalanced,Vt=e=>Q(e).isBalanced;var ke="()",We=e=>h.invalidReferenceRegex.test(e),$e=new Intl.NumberFormat("ar-SA"),De=e=>$e.format(e),H=e=>({1:"\u0661",9:"\u0669",".":"\u0660",O:"\u0665",o:"\u0665",V:"\u0667",v:"\u0667"})[e]||e,je=e=>{let t={"\u0660":"0","\u0661":"1","\u0662":"2","\u0663":"3","\u0664":"4","\u0665":"5","\u0666":"6","\u0667":"7","\u0668":"8","\u0669":"9"},n=e.replace(/[()]/g,""),r="";for(let s of n)r+=t[s];let o=parseInt(r,10);return isNaN(o)?0:o},K=e=>{let t=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(h.arabicReferenceRegex)||[]),n=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(h.ocrConfusedReferenceRegex)||[]),r=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(h.arabicFootnoteReferenceRegex)||[]),o=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(h.ocrConfusedFootnoteReferenceRegex)||[]),s=n.map(i=>i.replace(/[.1OV9]/g,c=>H(c))),a=o.map(i=>i.replace(/[.1OV9]/g,c=>H(c)));return{bodyReferences:[...t,...s],footnoteReferences:[...r,...a],ocrConfusedInBody:n,ocrConfusedInFootnotes:o}},Ge=(e,t)=>{if(e.some(s=>We(s.text)))return!0;let r=new Set(t.bodyReferences),o=new Set(t.footnoteReferences);if(r.size!==o.size)return!0;for(let s of r)if(!o.has(s))return!0;return!1},Xt=e=>{let t=K(e);if(!Ge(e,t))return e;let n=e.map(g=>{let A=g.text,M=/\([.1OV9]+\)/g;return A=A.replace(M,x=>x.replace(/[.1OV9]/g,C=>H(C))),{...g,text:A}}),r=K(n),o=new Set(r.bodyReferences),s=new Set(r.footnoteReferences),a=[...new Set(r.bodyReferences)],i=[...new Set(r.footnoteReferences)],c=a.filter(g=>!s.has(g)),u=i.filter(g=>!o.has(g)),l=[...o,...s],m={count:(l.length>0?Math.max(0,...l.map(g=>je(g))):0)+1};return n.map(g=>{if(!g.text.includes(ke))return g;let A=g.text;return A=A.replace(/\(\)/g,()=>{if(g.isFootnote){let 
x=c.shift();if(x)return x}else{let x=u.shift();if(x)return x}let M=`(${De(m.count)})`;return m.count++,M}),{...g,text:A}})};var F=class{next=new Map;link=0;out=[]},I=class{nodes=[new F];add(t,n){let r=0;for(let o=0;o<t.length;o++){let s=t[o],a=this.nodes[r].next.get(s);a===void 0&&(a=this.nodes.length,this.nodes[r].next.set(s,a),this.nodes.push(new F)),r=a}this.nodes[r].out.push(n)}build(){let t=[];for(let[,n]of this.nodes[0].next)this.nodes[n].link=0,t.push(n);for(let n=0;n<t.length;n++){let r=t[n];for(let[o,s]of this.nodes[r].next){t.push(s);let a=this.nodes[r].link;for(;a!==0&&!this.nodes[a].next.has(o);)a=this.nodes[a].link;let i=this.nodes[a].next.get(o);this.nodes[s].link=i===void 0?0:i;let c=this.nodes[this.nodes[s].link].out;c.length&&this.nodes[s].out.push(...c)}}}find(t,n){let r=0;for(let o=0;o<t.length;o++){let s=t[o];for(;r!==0&&!this.nodes[r].next.has(s);)r=this.nodes[r].link;let a=this.nodes[r].next.get(s);if(r=a===void 0?0:a,this.nodes[r].out.length)for(let i of this.nodes[r].out)n(i,o+1)}}},P=e=>{let t=new I;for(let n=0;n<e.length;n++){let r=e[n];r.length>0&&t.add(r,n)}return t.build(),t};var O={enableFuzzy:!0,gramsPerExcerpt:5,log:()=>{},maxCandidatesPerExcerpt:40,maxEditAbs:3,maxEditRel:.1,q:4,seamLen:512};var J=200,Ze=80;function B(e){let t=[],n=[],r=[],o=0;for(let s=0;s<e.length;s++){let a=e[s];n.push(o),r.push(a.length),t.push(a),o+=a.length,s+1<e.length&&(t.push(" "),o+=1)}return{book:t.join(""),lens:r,starts:n}}function q(e,t){let n=0,r=t.length-1,o=0;for(;n<=r;){let s=n+r>>1;t[s]<=e?(o=s,n=s+1):r=s-1}return o}function ee(e,t,n,r,o){let s=P(n),a=new Int32Array(o).fill(-1),i=new Uint8Array(o);return s.find(e,(c,u)=>{let l=n[c],f=u-l.length,m=q(f,t);for(let g of r[c])i[g]||(a[g]=m,i[g]=1)}),{result:a,seenExact:i}}function _(e){let t=new Map,n=[],r=[];for(let o=0;o<e.length;o++){let s=e[o],a=t.get(s);a===void 0?(a=r.length,t.set(s,a),r.push(s),n.push([o])):n[a].push(o)}return{keyToPatId:t,patIdToOrigIdxs:n,patterns:r}}var L=(e,t,n,r,o)=>{let 
s=e.length,a=Math.min(o,Math.max(6,Math.ceil(s*.12))),i=Math.floor(a/2),c=t.start-i,u=t.seam?r[t.page]?.text:n[t.page];if(!u)return null;let l=Ve(t,n,r,c,s,a),f=Ke(l,t,u,c,s,a),m=Je(t,u,c,s,a,o);return et(f,e,m)},Ve=(e,t,n,r,o,s)=>(a=0,i=0)=>e.seam?Ye(n,e.page,r,o,s):Ue(t,e.page,r,o,s,a,i),Ye=(e,t,n,r,o)=>{let s=e[t]?.text;if(!s)return null;let a=Math.max(0,n),i=r+o,c=Math.min(s.length,a+i);return c>a?s.slice(a,c):null},Ue=(e,t,n,r,o,s,a)=>{let i=e[t];if(!i)return null;let c=r+o,u=n,l="";if(u<0){let m=Math.max(0,-u-a);m>0&&(l+=Xe(e,t,m)),u=0}let f=Math.min(i.length-s,Math.max(0,u)+c-l.length);return f>u&&(l+=i.slice(Math.max(0,u),f)),l+=Qe(e,t,c-l.length),l.length?l:null},Xe=(e,t,n)=>{let r=n,o=t-1,s=[];for(;r>0&&o>=0;){let a=e[o];if(!a)break;let i=Math.min(r,a.length),c=a.slice(a.length-i);s.unshift(c),r-=c.length,o--}return s.length?`${s.join(" ")} `:""},Qe=(e,t,n)=>{let r="",o=t+1;for(;n>0&&o<e.length;){let s=e[o];if(!s)break;let a=s.slice(0,n);if(!a.length)break;r+=` ${a}`,n-=a.length,o++}return r},Ke=(e,t,n,r,o,s)=>{let a=[],i=o+s,c=!t.seam&&r+i>n.length,u=!t.seam&&r<0,l=e(0,0);if(l&&a.push(l),c){let f=Math.min(J,Math.max(0,n.length-Math.max(0,r)));if(f>0){let m=e(f,0);m&&a.push(m)}}if(u){let f=e(0,Math.min(J,-r));f&&a.push(f)}return a},Je=(e,t,n,r,o,s)=>{let a=r+o,i=!e.seam&&n+a>t.length,c=!e.seam&&n<0,u=Math.min(2,Math.max(1,Math.ceil(r*.005)));return i||c||e.seam?s+Math.min(Ze,Math.ceil(r*.08)):s+u},et=(e,t,n)=>{let r=null;for(let o of e){let s=v(t,o,n);s<=n&&(r==null||s<r)&&(r=s)}return r==null?null:{acceptance:n,dist:r}};var E=class{q;map=new Map;gramFreq=new Map;constructor(t){this.q=t}addText(t,n,r){let o=this.q,s=n.length;if(!(s<o))for(let a=0;a+o<=s;a++){let i=n.slice(a,a+o),c=this.map.get(i);c||(c=[],this.map.set(i,c)),c.push({page:t,pos:a,seam:r}),this.gramFreq.set(i,(this.gramFreq.get(i)??0)+1)}}pickRare(t,n){n=Math.max(1,Math.floor(n));let r=[],o=new Set,s=this.q;for(let i=0;i+s<=t.length;i++){let c=t.slice(i,i+s);if(o.has(c))continue;o.add(c);let 
u=this.gramFreq.get(c)??2147483647;r.push({freq:u,gram:c,offset:i})}r.sort((i,c)=>i.freq-c.freq);let a=[];for(let i of r)if(this.map.has(i.gram)&&(a.push({gram:i.gram,offset:i.offset}),a.length>=n))return a;if(a.length<n){let i=new Set(a.map(c=>c.gram));for(let c=r.length-1;c>=0&&a.length<n;c--){let u=r[c];this.map.has(u.gram)&&!i.has(u.gram)&&(a.push({gram:u.gram,offset:u.offset}),i.add(u.gram))}}return a}getPostings(t){return this.map.get(t)}};function te(e,t){let n=[];for(let r=0;r+1<e.length;r++){let o=e[r].slice(-t),s=e[r+1].slice(0,t),a=`${o} ${s}`;n.push({startPage:r,text:a})}return n}function ne(e,t,n){let r=new E(n);for(let o=0;o<e.length;o++)r.addText(o,e[o],!1);for(let o=0;o<t.length;o++)r.addText(o,t[o].text,!0);return r}function re(e,t,n){let r=t.pickRare(e,n.gramsPerExcerpt);if(r.length===0)return[];let o=[],s=new Set,a=e.length;e:for(let{gram:i,offset:c}of r){let u=t.getPostings(i);if(u)for(let l of u){let f=l.pos-c;if(f<-Math.floor(a*.25))continue;let m=Math.max(0,f),g=`${l.page}:${m}:${l.seam?1:0}`;if(!s.has(g)&&(o.push({page:l.page,seam:l.seam,start:m}),s.add(g),o.length>=n.maxCandidatesPerExcerpt))break e}}return o}function tt(e,t,n,r,o){if(e.length===0)return null;let s=nt(e,o);o.log("maxDist",s);let a=new Set,i=null;for(let c of t){if(rt(c,a))continue;let u=ot(c,e,n,r,s,o);if(u&&(i=at(i,u,c),o.log("findBest best",i),u.dist===0))break}return i}function nt(e,t){return Math.max(t.maxEditAbs,Math.ceil(t.maxEditRel*e.length))}function rt(e,t){let n=`${e.page}:${e.start}:${e.seam?1:0}`;return t.has(n)?!0:(t.add(n),!1)}function ot(e,t,n,r,o,s){let a=L(t,e,n,r,o),i=a?.dist??null,c=a?.acceptance??o;return s.log("dist",i),st(i,c)?{acceptance:c,dist:i}:null}function st(e,t){return e!==null&&e<=t}function at(e,t,n){let r={dist:t.dist,page:n.page};return e?it(t.dist,n.page,e.dist,e.page)?r:e:r}function it(e,t,n,r){return e<n||e===n&&t<r}function ct(e,t,n,r,o){if(!o.enableFuzzy)return;let s=te(t,o.seamLen),a=ne(t,s,o.q);for(let 
i=0;i<e.length;i++){if(n[i])continue;let c=e[i];if(o.log("excerpt",c),!c||c.length<o.q)continue;let u=re(c,a,o);if(o.log("candidates",u),u.length===0)continue;let l=tt(c,u,t,s,o);o.log("best",l),l&&(r[i]=l.page,n[i]=1)}}function un(e,t,n={}){let r={...O,...n},o=e.map(m=>d(m,"aggressive")),s=t.map(m=>d(m,"aggressive"));n.log&&(n.log("pages",e),n.log("excerpts",t),n.log("pagesN",o),n.log("excerptsN",s));let{patIdToOrigIdxs:a,patterns:i}=_(s),{book:c,starts:u}=B(o),{result:l,seenExact:f}=ee(c,u,i,a,t.length);return n.log&&(n.log("findExactMatches result",l),n.log("seenExact",f)),f.every(m=>m===1)||ct(s,o,f,l,r),n.log&&n.log("performFuzzyMatching result",l),Array.from(l)}function lt(e,t,n,r,o){P(n).find(e,(a,i)=>{let c=n[a],u=i-c.length,l=q(u,t);for(let f of r[a]){let m=o[f],g=m.get(l);(!g||!g.exact)&&m.set(l,{exact:!0,score:1,seam:!1})}})}function ut(e,t,n,r,o,s,a){let i=`${e.page}:${e.start}:${e.seam?1:0}`;if(a.has(i))return;a.add(i);let c=L(t,e,n,r,o);if(!c)return;let{dist:u,acceptance:l}=c;if(u>l)return;let f=1-u/l,m=s.get(e.page);(!m||!m.exact&&f>m.score)&&s.set(e.page,{exact:!1,score:f,seam:e.seam})}function ft(e,t,n,r,o,s,a){if(Array.from(s[e].values()).some(m=>m.exact)||!t||t.length<a.q)return;let c=re(t,o,a);if(c.length===0)return;let u=Math.max(a.maxEditAbs,Math.ceil(a.maxEditRel*t.length)),l=new Set,f=s[e];for(let m of c)ut(m,t,n,r,u,f,l)}function mt(e,t,n,r){let o=te(t,r.seamLen),s=ne(t,o,r.q);for(let a=0;a<e.length;a++)ft(a,e[a],t,o,s,n,r)}var gt=e=>e.size===0?[]:(pt(e),bt(e),xt(e)),pt=e=>{let t=Array.from(e.keys()).sort((n,r)=>n-r);for(let n of t){let r=e.get(n),o=e.get(n+1);if(dt(r,o)){let s=ht(n,r,o);e.delete(s)}}},dt=(e,t)=>!!(e?.seam&&t?.seam),ht=(e,t,n)=>n.score>t.score?e:(n.score<t.score,e+1),bt=e=>{let t=Array.from(e.entries()).filter(([,n])=>n.seam).map(([n])=>n);for(let n of t){let r=e.get(n),o=e.get(n+1);At(r,o)&&e.delete(n)}},At=(e,t)=>t?t.exact||!t.seam&&t.score>=e.score:!1,xt=e=>{let t=[],n=[];for(let r of 
e.entries())r[1].exact?t.push(r):n.push(r);return t.sort((r,o)=>r[0]-o[0]),n.sort((r,o)=>o[1].score-r[1].score||r[0]-o[0]),[...t,...n].map(r=>r[0])};function fn(e,t,n={}){let r={...O,...n},o=e.map(f=>d(f,"aggressive")),s=t.map(f=>d(f,"aggressive"));n.log&&(n.log("pages",e),n.log("excerpts",t),n.log("pagesN",o),n.log("excerptsN",s));let{patIdToOrigIdxs:a,patterns:i}=_(s),{book:c,starts:u}=B(o),l=Array.from({length:t.length},()=>new Map);return lt(c,u,i,a,l),r.enableFuzzy&&mt(s,o,l,r),l.map(f=>gt(f))}var pn=e=>{if(!e||e.trim().length===0)return!0;let t=e.trim(),n=t.length;if(n<2||Mt(t))return!0;let r=yt(t);if(St(r,n))return!0;let o=h.arabicCharacters.test(t);return!o&&/[a-zA-Z]/.test(t)?!0:o?!Rt(r,n):Ct(r,n,t)};function yt(e){let t={arabicCount:0,charFreq:new Map,digitCount:0,latinCount:0,punctuationCount:0,spaceCount:0,symbolCount:0},n=Array.from(e);for(let r of n)t.charFreq.set(r,(t.charFreq.get(r)||0)+1),h.arabicCharacters.test(r)?t.arabicCount++:/\d/.test(r)?t.digitCount++:/[a-zA-Z]/.test(r)?t.latinCount++:/\s/.test(r)?t.spaceCount++:/[.,;:()[\]{}"""''`]/.test(r)?t.punctuationCount++:t.symbolCount++;return t}function St(e,t){let n=0,r=["!",".","-","=","_"];for(let[o,s]of e.charFreq)s>=5&&r.includes(o)&&(n+=s);return n/t>.4}function Mt(e){return[/^[-=_━≺≻\s]*$/,/^[.\s]*$/,/^[!\s]*$/,/^[A-Z\s]*$/,/^[-\d\s]*$/,/^\d+\s*$/,/^[A-Z]\s*$/,/^[—\s]*$/,/^[्र\s-]*$/].some(n=>n.test(e))}function Ct(e,t,n){let r=e.arabicCount+e.latinCount+e.digitCount;return r===0||Tt(e,r,t)?!0:/[٠-٩]/.test(n)&&e.digitCount>=3?!1:(e.symbolCount+Math.max(0,e.punctuationCount-5))/Math.max(r,1)>2||t<=5&&e.arabicCount===0&&!(/^\d+$/.test(n)&&e.digitCount>=3)?!0:/^\d{3,4}$/.test(n)?!1:t<=10}function Tt(e,t,n){let{arabicCount:r,spaceCount:o}=e;return o>0&&t===o+1&&t<=5||n<=10&&o>=2&&r===0||o/n>.6}function Rt(e,t){return e.arabicCount>=3||e.arabicCount>=1&&e.digitCount>0&&t<=20||e.arabicCount>=2&&e.punctuationCount<=2&&t<=10||e.arabicCount>=1&&t<=5&&e.punctuationCount<=1}var 
Ft=(e,t,{similarityThreshold:n,typoSymbols:r})=>{if(e===null)return[t];if(t===null)return[e];if(d(e)===d(t))return[e];let o=W(e,t);if(o)return o;let s=$(e,t);if(s)return s;if(r.includes(e)||r.includes(t)){let u=r.find(l=>l===e||l===t);return u?[u]:[e]}let a=d(e),i=d(t);return[S(a,i)>n?e:t]},Pt=(e,t)=>{if(e.length===0)return e;let n=[];for(let r of e){if(n.length===0){n.push(r);continue}let o=n.at(-1);if(R(o,r,t)){r.length<o.length&&(n[n.length-1]=r);continue}k(n,o,r)||n.push(r)}return n},Et=(e,t,n)=>{let r=w(e,n.typoSymbols),o=w(t,n.typoSymbols),a=Y(r,o,n.typoSymbols,n.similarityThreshold).flatMap(([c,u])=>Ft(c,u,n));return Pt(a,n.highSimilarityThreshold).join(" ")},xn=(e,t,{highSimilarityThreshold:n=.8,similarityThreshold:r=.6,typoSymbols:o})=>Et(e,t,{highSimilarityThreshold:n,similarityThreshold:r,typoSymbols:o});export{_e as BRACKETS,Ne as CLOSE_BRACKETS,zt as INTAHA_ACTUAL,Le as OPEN_BRACKETS,h as PATTERNS,$t as alignTextSegments,Y as alignTokenSequences,yt as analyzeCharacterStats,Zt as areBracketsBalanced,Gt as areQuotesBalanced,R as areSimilarAfterNormalization,He as backtrackAlignment,v as boundedLevenshtein,Lt as calculateAlignmentScore,V as calculateLevenshteinDistance,S as calculateSimilarity,Q as checkBalance,Xt as correctReferences,N as extractDigits,un as findMatches,fn as findMatchesAll,xn as fixTypo,jt as getUnbalancedErrors,k as handleFootnoteFusion,W as handleFootnoteSelection,$ as handleStandaloneFootnotes,St as hasExcessiveRepetition,We as hasInvalidFootnotes,pn as isArabicTextNoise,Vt as isBalanced,Mt as isBasicNoisePattern,Ct as isNonArabicNoise,Tt as isSpacingNoise,Rt as isValidArabicContent,Et as processTextAlignment,D as removeFootnoteReferencesSimple,j as removeSingleDigitFootnoteReferences,d as sanitizeArabic,wt as standardizeHijriSymbol,vt as standardizeIntahaSymbol,w as tokenizeText};
3
3
  //# sourceMappingURL=index.js.map