baburchi 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -62,7 +62,9 @@ const noiseText = isArabicTextNoise('---'); // true
62
62
 
63
63
  ## API Reference
64
64
 
65
- ### `fixTypo(original, correction, options)`
65
+ ### Core Text Processing
66
+
67
+ #### `fixTypo(original, correction, options)`
66
68
 
67
69
  The main function for correcting typos using text alignment.
68
70
 
@@ -80,7 +82,7 @@ The main function for correcting typos using text alignment.
80
82
 
81
83
  **Returns:** Corrected text string
82
84
 
83
- ### `processTextAlignment(originalText, altText, options)`
85
+ #### `processTextAlignment(originalText, altText, options)`
84
86
 
85
87
  Low-level function for advanced text processing with full configuration control.
86
88
 
@@ -90,6 +92,167 @@ Low-level function for advanced text processing with full configuration control.
90
92
  - `altText` (string): Reference text for alignment
91
93
  - `options` (FixTypoOptions): Complete configuration object
92
94
 
95
+ ### Fuzzy Text Matching
96
+
97
+ #### `findMatches(pages, excerpts, policy?)`
98
+
99
+ Finds the best matching page for each excerpt using exact and fuzzy matching algorithms.
100
+
101
+ **Parameters:**
102
+
103
+ - `pages` (string[]): Array of page texts to search within
104
+ - `excerpts` (string[]): Array of text excerpts to find
105
+ - `policy` (MatchPolicy, optional): Matching configuration
106
+
107
+ **Returns:** `number[]` - Array of page indices (0-based) where each excerpt was found, or -1 if not found
108
+
109
+ **Example:**
110
+
111
+ ```typescript
112
+ import { findMatches } from 'baburchi';
113
+
114
+ const pages = [
115
+ 'هذا النص في الصفحة الأولى مع محتوى إضافي',
116
+ 'النص الثاني يظهر هنا في الصفحة الثانية',
117
+ 'الصفحة الثالثة تحتوي على نص مختلف'
118
+ ];
119
+
120
+ const excerpts = [
121
+ 'النص في الصفحة الأولى',
122
+ 'النص الثاني يظهر',
123
+ 'نص غير موجود'
124
+ ];
125
+
126
+ const matches = findMatches(pages, excerpts);
127
+ console.log(matches); // [0, 1, -1]
128
+ ```
129
+
130
+ #### `findMatchesAll(pages, excerpts, policy?)`
131
+
132
+ Finds all potential matches for each excerpt, ranked by match quality.
133
+
134
+ **Parameters:**
135
+
136
+ - `pages` (string[]): Array of page texts to search within
137
+ - `excerpts` (string[]): Array of text excerpts to find
138
+ - `policy` (MatchPolicy, optional): Matching configuration
139
+
140
+ **Returns:** `number[][]` - Array where each element is an array of page indices ranked by match quality (exact matches first, then fuzzy matches by score)
141
+
142
+ **Example:**
143
+
144
+ ```typescript
145
+ import { findMatchesAll } from 'baburchi';
146
+
147
+ const pages = [
148
+ 'النص الأول مع محتوى مشابه',
149
+ 'محتوى مشابه في النص الثاني',
150
+ 'النص الأول بصيغة مختلفة قليلاً'
151
+ ];
152
+
153
+ const excerpts = ['النص الأول'];
154
+
155
+ const allMatches = findMatchesAll(pages, excerpts);
156
+ console.log(allMatches); // [[0, 2]] - the excerpt appears verbatim in pages 0 and 2; page 1 does not match
157
+ ```
158
+
159
+ #### Match Policy Configuration
160
+
161
+ The `MatchPolicy` interface allows fine-tuning of the matching algorithm:
162
+
163
+ ```typescript
164
+ interface MatchPolicy {
165
+ enableFuzzy?: boolean; // Enable fuzzy matching (default: true)
166
+ maxEditAbs?: number; // Max absolute edit distance (default: 3)
167
+ maxEditRel?: number; // Max relative edit distance (default: 0.1)
168
+ q?: number; // Q-gram size for indexing (default: 4)
169
+ gramsPerExcerpt?: number; // Q-grams to sample per excerpt (default: 5)
170
+ maxCandidatesPerExcerpt?: number; // Max candidates to evaluate (default: 40)
171
+ seamLen?: number; // Cross-page seam length (default: 512)
172
+ }
173
+ ```
174
+
175
+ **Example with custom policy:**
176
+
177
+ ```typescript
178
+ import { findMatches } from 'baburchi';
179
+
180
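+ const customPolicy = { // the `MatchPolicy` type is not exported by the package, so the literal is checked structurally at the call site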
181
+ enableFuzzy: true,
182
+ maxEditAbs: 6, // Allow more character differences
183
+ maxEditRel: 0.3, // Allow 30% character differences
184
+ q: 4, // Q-gram size (4 is already the default; shown for completeness)
185
+ gramsPerExcerpt: 30, // Sample more Q-grams
186
+ maxCandidatesPerExcerpt: 150
187
+ };
188
+
189
+ const matches = findMatches(pages, excerpts, customPolicy);
190
+ ```
191
+
192
+ ### Arabic Text Normalization
193
+
194
+ #### `sanitizeArabic(input, optionsOrPreset)`
195
+
196
+ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabic text.
197
+
198
+ **Parameters:**
199
+
200
+ - `input` (string): The Arabic text to sanitize
201
+ - `optionsOrPreset` (`SanitizePreset | SanitizeOptions`, optional): Either a preset name (`'light'`, `'search'`, `'aggressive'`) or a custom options object
202
+
203
+ **Presets:**
204
+
205
+ - `"light"`: Basic cleanup for display (strips zero-width chars, collapses whitespace)
206
+ - `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
207
+ - `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
208
+
209
+ **Custom Options:**
210
+
211
+ ```typescript
212
+ interface SanitizeOptions {
213
+ base?: 'light' | 'search' | 'aggressive' | 'none';
214
+ stripDiacritics?: boolean;
215
+ stripTatweel?: boolean | 'safe' | 'all'; // `true` is treated as 'safe'
216
+ normalizeAlif?: boolean;
217
+ replaceAlifMaqsurah?: boolean;
218
+ replaceTaMarbutahWithHa?: boolean;
219
+ stripZeroWidth?: boolean;
220
+ zeroWidthToSpace?: boolean;
221
+ stripLatinAndSymbols?: boolean;
222
+ lettersAndSpacesOnly?: boolean;
223
+ keepOnlyArabicLetters?: boolean;
224
+ collapseWhitespace?: boolean;
225
+ trim?: boolean;
226
+ removeHijriMarker?: boolean;
227
+ }
228
+ ```
229
+
230
+ **Examples:**
231
+
232
+ ```typescript
233
+ import { sanitizeArabic } from 'baburchi';
234
+
235
+ // Light display cleanup
236
+ sanitizeArabic(' مرحبا\u200C\u200D بالعالم ', 'light'); // → 'مرحبا بالعالم'
237
+
238
+ // Tolerant search normalization
239
+ sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // → 'السلام عليكم'
240
+
241
+ // Indexing-friendly text (letters + spaces only)
242
+ sanitizeArabic('اَلسَّلَامُ 1435/3/29 هـ — www', 'aggressive'); // → 'السلام'
243
+
244
+ // Custom: Tatweel-only, preserving dates/list markers
245
+ sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // → 'أبتِكَةُ'
246
+
247
+ // Zero-width controls → spaces
248
+ sanitizeArabic('يَخْلُوَ ‏. ‏ قَالَ غَرِيبٌ ‏. ‏', {
249
+ base: 'none',
250
+ stripZeroWidth: true,
251
+ zeroWidthToSpace: true
252
+ });
253
+ // → 'يَخْلُوَ . قَالَ غَرِيبٌ . '
254
+ ```
255
+
93
256
  ## Usage Examples
94
257
 
95
258
  ### Basic Arabic Text Correction
@@ -189,9 +352,9 @@ Baburchi uses the **Needleman-Wunsch global sequence alignment algorithm** to op
189
352
 
190
353
  Baburchi works in all modern environments:
191
354
 
192
- - ✅ Node.js 18+
193
- - ✅ Bun 1.0+
194
- - ✅ Modern browsers (ES2020+)
355
+ - ✅ Node.js 22+
356
+ - ✅ Bun 1.2.21+
357
+ - ✅ Modern browsers (ES2023+)
195
358
  - ✅ Deno (with npm compatibility)
196
359
 
197
360
  ## TypeScript Support
@@ -308,6 +471,58 @@ This function is particularly useful for:
308
471
  - Handling cases where text layout affects line ordering
309
472
  - Processing documents where content has been split across multiple detection regions
310
473
 
474
+ ## Hijri Date Standardization
475
+
476
+ Baburchi includes specialized functions for standardizing Hijri date symbols commonly found in Arabic historical and religious texts. These functions help normalize OCR inconsistencies in Hijri date notation.
477
+
478
+ ### `standardizeHijriSymbol(text)`
479
+
480
+ Standardizes a standalone ه to هـ when it follows digits (Arabic-Indic or Western), ensuring proper Hijri date notation.
481
+
482
+ ```typescript
483
+ import { standardizeHijriSymbol } from 'baburchi';
484
+
485
+ // Standardize after Arabic-Indic digits
486
+ const text1 = standardizeHijriSymbol('سنة ١٤٤٥ ه'); // 'سنة ١٤٤٥ هـ'
487
+ const text2 = standardizeHijriSymbol('عام ٧٥٠ه'); // 'عام ٧٥٠ هـ'
488
+
489
+ // Standardize after Western digits
490
+ const text3 = standardizeHijriSymbol('في عام 1445 ه'); // 'في عام 1445 هـ'
491
+ const text4 = standardizeHijriSymbol('توفي 632ه'); // 'توفي 632 هـ'
492
+
493
+ // Does not affect ه when part of other words
494
+ const text5 = standardizeHijriSymbol('هذا كتاب'); // 'هذا كتاب' (unchanged)
495
+ ```
496
+
497
+ ### `standardizeIntahaSymbol(text)`
498
+
499
+ Standardizes standalone اه to اهـ when appearing as a whole word, typically used in academic and historical texts.
500
+
501
+ ```typescript
502
+ import { standardizeIntahaSymbol } from 'baburchi';
503
+
504
+ // Standardize standalone AH abbreviation
505
+ const text1 = standardizeIntahaSymbol('سنة 1445 اه'); // 'سنة 1445 اهـ'
506
+ const text2 = standardizeIntahaSymbol('في العام اه'); // 'في العام اهـ'
507
+
508
+ // Does not affect اه when part of other words
509
+ const text3 = standardizeIntahaSymbol('الاهتمام بالتاريخ'); // 'الاهتمام بالتاريخ' (unchanged)
510
+ ```
511
+
512
+ ### Combined Hijri Standardization
513
+
514
+ ```typescript
515
+ import { standardizeHijriSymbol, standardizeIntahaSymbol } from 'baburchi';
516
+
517
+ function standardizeAllHijriNotations(text: string): string {
518
+ return standardizeIntahaSymbol(standardizeHijriSymbol(text));
519
+ }
520
+
521
+ const mixedText = 'وُلد سنة 570 ه وتوفي عام 632 اه';
522
+ const standardized = standardizeAllHijriNotations(mixedText);
523
+ console.log(standardized); // 'وُلد سنة 570 هـ وتوفي عام 632 اهـ'
524
+ ```
525
+
311
526
  ## Utilities
312
527
 
313
528
  The library also exports utility functions for advanced use cases:
@@ -315,20 +530,18 @@ The library also exports utility functions for advanced use cases:
315
530
  ```typescript
316
531
  import {
317
532
  calculateSimilarity,
318
- normalizeArabicText,
319
533
  tokenizeText,
320
534
  alignTokenSequences,
321
535
  hasInvalidFootnotes,
322
536
  correctReferences,
323
537
  alignTextSegments,
538
+ standardizeHijriSymbol,
539
+ standardizeIntahaSymbol,
324
540
  } from 'baburchi';
325
541
 
326
542
  // Calculate similarity between two strings
327
543
  const similarity = calculateSimilarity('hello', 'helo'); // 0.8
328
544
 
329
- // Normalize Arabic text
330
- const normalized = normalizeArabicText('اَلسَّلَامُ'); // 'السلام'
331
-
332
545
  // Tokenize with symbol preservation
333
546
  const tokens = tokenizeText('محمد ﷺ رسول', ['ﷺ']); // ['محمد', 'ﷺ', 'رسول']
334
547
 
@@ -347,6 +560,10 @@ const aligned = alignTextSegments(
347
560
  ['target line one', '', 'target line three'],
348
561
  ['segment1', 'segment2', 'segment3', 'segment4'],
349
562
  );
563
+
564
+ // Standardize Hijri date symbols
565
+ const hijriText = standardizeHijriSymbol('سنة 1445 ه'); // 'سنة 1445 هـ'
566
+ const ahText = standardizeIntahaSymbol('عام 632 اه'); // 'عام 632 اهـ'
350
567
  ```
351
568
 
352
569
  ## Noise Detection
package/dist/index.d.ts CHANGED
@@ -1,32 +1,3 @@
1
- /**
2
- * Configuration options for fixing typos in OCR text using alignment algorithms.
3
- * These options control how text tokens are compared, aligned, and merged during typo correction.
4
- */
5
- type FixTypoOptions = {
6
- /**
7
- * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
8
- * Used in post-processing to eliminate redundant tokens that are nearly identical.
9
- * Should typically be higher than similarityThreshold to catch only very similar duplicates.
10
- * @default 0.9
11
- * @example 0.95 // Removes tokens that are 95% or more similar
12
- */
13
- readonly highSimilarityThreshold: number;
14
- /**
15
- * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
16
- * Higher values require closer matches, lower values are more permissive.
17
- * Used in the Needleman-Wunsch alignment algorithm for token matching.
18
- * @default 0.7
19
- * @example 0.8 // Requires 80% similarity for token alignment
20
- */
21
- readonly similarityThreshold: number;
22
- /**
23
- * Array of special symbols that should be preserved during typo correction.
24
- * These symbols (like honorifics or religious markers) take precedence in token selection.
25
- * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
26
- */
27
- readonly typoSymbols: string[];
28
- };
29
-
30
1
  /**
31
2
  * Aligns split text segments to match target lines by finding the best order.
32
3
  *
@@ -214,6 +185,88 @@ type TextLine = {
214
185
  */
215
186
  declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
216
187
 
188
+ /**
189
+ * Configuration options for fixing typos in OCR text using alignment algorithms.
190
+ * These options control how text tokens are compared, aligned, and merged during typo correction.
191
+ */
192
+ type FixTypoOptions = {
193
+ /**
194
+ * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
195
+ * Used in post-processing to eliminate redundant tokens that are nearly identical.
196
+ * Should typically be higher than similarityThreshold to catch only very similar duplicates.
197
+ * @default 0.9
198
+ * @example 0.95 // Removes tokens that are 95% or more similar
199
+ */
200
+ readonly highSimilarityThreshold: number;
201
+ /**
202
+ * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
203
+ * Higher values require closer matches, lower values are more permissive.
204
+ * Used in the Needleman-Wunsch alignment algorithm for token matching.
205
+ * @default 0.7
206
+ * @example 0.8 // Requires 80% similarity for token alignment
207
+ */
208
+ readonly similarityThreshold: number;
209
+ /**
210
+ * Array of special symbols that should be preserved during typo correction.
211
+ * These symbols (like honorifics or religious markers) take precedence in token selection.
212
+ * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
213
+ */
214
+ readonly typoSymbols: string[];
215
+ };
216
+ type MatchPolicy = {
217
+ /** Try approximate matches for leftovers (default true). */
218
+ enableFuzzy?: boolean;
219
+ /** Max absolute edit distance accepted in fuzzy (default 3). */
220
+ maxEditAbs?: number;
221
+ /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
222
+ maxEditRel?: number;
223
+ /** q-gram length for candidate generation (default 4). */
224
+ q?: number;
225
+ /** Max rare grams to seed candidates per excerpt (default 5). */
226
+ gramsPerExcerpt?: number;
227
+ /** Max candidate windows verified per excerpt (default 40). */
228
+ maxCandidatesPerExcerpt?: number;
229
+ /** Seam length for bleed windows (default 512). */
230
+ seamLen?: number;
231
+ };
232
+
233
+ /**
234
+ * Main function to find the single best match per excerpt.
235
+ * Combines exact matching with fuzzy matching for comprehensive text search.
236
+ *
237
+ * @param pages - Array of page texts to search within
238
+ * @param excerpts - Array of text excerpts to find matches for
239
+ * @param policy - Optional matching policy configuration
240
+ * @returns Array of page indices (one per excerpt, -1 if no match found)
241
+ *
242
+ * @example
243
+ * ```typescript
244
+ * const pages = ['Hello world', 'Goodbye world'];
245
+ * const excerpts = ['Hello', 'Good bye']; // Note the typo
246
+ * const matches = findMatches(pages, excerpts, { enableFuzzy: true });
247
+ * // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
248
+ * ```
249
+ */
250
+ declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
251
+ /**
252
+ * Main function to find all matches per excerpt, ranked by quality.
253
+ * Returns comprehensive results with both exact and fuzzy matches for each excerpt.
254
+ *
255
+ * @param pages - Array of page texts to search within
256
+ * @param excerpts - Array of text excerpts to find matches for
257
+ * @param policy - Optional matching policy configuration
258
+ * @returns Array of page index arrays (one array per excerpt, sorted by match quality)
259
+ *
260
+ * @example
261
+ * ```typescript
262
+ * const pages = ['Hello world', 'Hello there', 'Goodbye world'];
263
+ * const excerpts = ['Hello'];
264
+ * const matches = findMatchesAll(pages, excerpts);
265
+ * // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
266
+ * ```
267
+ */
268
+ declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
269
+
217
270
  /**
218
271
  * Character statistics for analyzing text content and patterns
219
272
  */
@@ -382,6 +435,19 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
382
435
  */
383
436
  declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
384
437
 
438
+ /**
439
+ * Processes text alignment between original and alternate OCR results to fix typos.
440
+ * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
441
+ * then selects the best tokens and performs post-processing.
442
+ *
443
+ * @param originalText - Original OCR text that may contain typos
444
+ * @param altText - Reference text from alternate OCR for comparison
445
+ * @param options - Configuration options for alignment and selection
446
+ * @returns Corrected text with typos fixed
447
+ */
448
+ declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
449
+ declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
450
+
385
451
  /**
386
452
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
387
453
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -396,6 +462,87 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
396
462
  * calculateLevenshteinDistance('', 'hello') // Returns 5
397
463
  */
398
464
  declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
465
+ /**
466
+ * Calculates bounded Levenshtein distance with early termination.
467
+ * More efficient when you only care about distances up to a threshold.
468
+ */
469
+ declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
470
+
471
+ /**
472
+ * Ultra-fast Arabic text sanitizer for search/indexing/display.
473
+ * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
474
+ * Options can merge over a base preset or `'none'` to apply exactly the rules you request.
475
+ */
476
+ type SanitizePreset = 'light' | 'search' | 'aggressive';
477
+ type SanitizeBase = 'none' | SanitizePreset;
478
+ /**
479
+ * Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
480
+ * `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
481
+ * directly into local booleans for speed.
482
+ */
483
+ type SanitizeOptions = {
484
+ /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
485
+ base?: SanitizeBase;
486
+ /** Unicode NFC normalization. Default: `true` in all presets. */
487
+ nfc?: boolean;
488
+ /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
489
+ stripZeroWidth?: boolean;
490
+ /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
491
+ zeroWidthToSpace?: boolean;
492
+ /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
493
+ stripDiacritics?: boolean;
494
+ /**
495
+ * Remove tatweel (ـ).
496
+ * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
497
+ * - `'safe'` or `'all'` explicitly
498
+ * - `false` to keep tatweel
499
+ * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
500
+ */
501
+ stripTatweel?: boolean | 'safe' | 'all';
502
+ /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
503
+ normalizeAlif?: boolean;
504
+ /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
505
+ replaceAlifMaqsurah?: boolean;
506
+ /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
507
+ replaceTaMarbutahWithHa?: boolean;
508
+ /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
509
+ stripLatinAndSymbols?: boolean;
510
+ /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
511
+ keepOnlyArabicLetters?: boolean;
512
+ /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
513
+ lettersAndSpacesOnly?: boolean;
514
+ /** Collapse runs of whitespace to a single space. Default: `true`. */
515
+ collapseWhitespace?: boolean;
516
+ /** Trim leading/trailing whitespace. Default: `true`. */
517
+ trim?: boolean;
518
+ /**
519
+ * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
520
+ * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
521
+ * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
522
+ */
523
+ removeHijriMarker?: boolean;
524
+ };
525
+ /**
526
+ * Sanitizes Arabic text according to a preset or custom options.
527
+ *
528
+ * Presets:
529
+ * - `'light'`: NFC, zero-width removal, collapse/trim spaces.
530
+ * - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
531
+ * - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
532
+ *
533
+ * Custom options:
534
+ * - Passing an options object overlays the selected `base` preset (default `'light'`).
535
+ * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
536
+ *
537
+ * Examples:
538
+ * ```ts
539
+ * sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
540
+ * sanitizeArabic('1435/3/29 هـ', 'search'); // '1435/3/29' (Hijri marker removed; 'aggressive' would also drop the digits)
541
+ * sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
542
+ * ```
543
+ */
544
+ declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
545
+
399
546
  /**
400
547
  * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
401
548
  * Uses Levenshtein distance normalized by the length of the longer string.
@@ -470,6 +617,7 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
470
617
  */
471
618
  declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
472
619
 
620
+ declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
473
621
  /**
474
622
  * Collection of regex patterns used throughout the library for text processing
475
623
  */
@@ -486,8 +634,6 @@ declare const PATTERNS: {
486
634
  arabicPunctuationAndWhitespace: RegExp;
487
635
  /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
488
636
  arabicReferenceRegex: RegExp;
489
- /** Matches Arabic diacritical marks (harakat, tanween, etc.) */
490
- diacritics: RegExp;
491
637
  /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
492
638
  footnoteEmbedded: RegExp;
493
639
  /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
@@ -498,22 +644,9 @@ declare const PATTERNS: {
498
644
  ocrConfusedFootnoteReferenceRegex: RegExp;
499
645
  /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
500
646
  ocrConfusedReferenceRegex: RegExp;
501
- /** Matches Arabic tatweel (kashida) character used for text stretching */
502
- tatweel: RegExp;
503
647
  /** Matches one or more whitespace characters */
504
648
  whitespace: RegExp;
505
649
  };
506
- /**
507
- * Normalizes Arabic text by removing diacritics, and tatweel marks.
508
- * This normalization enables better text comparison by focusing on core characters
509
- * while ignoring decorative elements that don't affect meaning.
510
- *
511
- * @param text - Arabic text to normalize
512
- * @returns Normalized text with diacritics, tatweel, and basic tags removed
513
- * @example
514
- * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
515
- */
516
- declare const normalizeArabicText: (text: string) => string;
517
650
  /**
518
651
  * Extracts the first sequence of Arabic or Western digits from text.
519
652
  * Used primarily for footnote number comparison to match related footnote elements.
@@ -527,8 +660,8 @@ declare const normalizeArabicText: (text: string) => string;
527
660
  declare const extractDigits: (text: string) => string;
528
661
  /**
529
662
  * Tokenizes text into individual words while preserving special symbols.
530
- * Removes HTML tags, adds spacing around preserved symbols to ensure they
531
- * are tokenized separately, then splits on whitespace.
663
+ * Adds spacing around preserved symbols to ensure they are tokenized separately,
664
+ * then splits on whitespace.
532
665
  *
533
666
  * @param text - Text to tokenize
534
667
  * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
@@ -577,18 +710,17 @@ declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null
577
710
  * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
578
711
  */
579
712
  declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
580
-
581
713
  /**
582
- * Processes text alignment between original and alternate OCR results to fix typos.
583
- * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
584
- * then selects the best tokens and performs post-processing.
585
- *
586
- * @param originalText - Original OCR text that may contain typos
587
- * @param altText - Reference text from alternate OCR for comparison
588
- * @param options - Configuration options for alignment and selection
589
- * @returns Corrected text with typos fixed
714
+ * Standardizes standalone Hijri symbol ه to هـ when it follows Arabic-Indic or Western digits
715
+ * @param text - Input text to process
716
+ * @returns Text with standardized Hijri symbols
590
717
  */
591
- declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
592
- declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
718
+ declare const standardizeHijriSymbol: (text: string) => string;
719
+ /**
720
+ * Standardizes standalone اه to اهـ when appearing as whole word
721
+ * @param text - Input text to process
722
+ * @returns Text with standardized AH Hijri symbols
723
+ */
724
+ declare const standardizeIntahaSymbol: (text: string) => string;
593
725
 
594
- export { BRACKETS, CLOSE_BRACKETS, type CharacterError, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, normalizeArabicText, processTextAlignment, tokenizeText };
726
+ export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };