baburchi 1.7.1 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -16,15 +16,15 @@ A lightweight TypeScript library for intelligent OCR text post-processing, speci
16
16
 
17
17
  ## Features
18
18
 
19
- - ๐Ÿง  **Intelligent Text Alignment**: Uses the Needleman-Wunsch algorithm for optimal text sequence alignment
20
- - ๐Ÿ”ค **Arabic Text Specialization**: Advanced normalization and diacritics handling for Arabic text
21
- - ๐Ÿงน **Noise Detection**: Comprehensive Arabic text noise detection and OCR artifact identification
22
- - ๐Ÿ“ **Footnote Management**: Smart handling of embedded and standalone footnotes
23
- - โšก **High Performance**: Space-optimized algorithms with O(min(m,n)) space complexity
24
- - ๐ŸŽฏ **Special Symbol Preservation**: Configurable preservation of religious symbols and honorifics
25
- - ๐Ÿ”ง **Flexible Configuration**: Customizable similarity thresholds and typo symbols
26
- - ๐Ÿ“ฆ **Zero Dependencies**: Pure TypeScript implementation with no external dependencies
27
- - ๐ŸŒ **Universal Compatibility**: Works in Node.js, Bun, and modern browsers
19
+ - 🧠 **Sequence-Aware Typo Repair** — Needleman–Wunsch alignment with typo symbol preservation and duplicate pruning.
20
+ - 📄 **Multi-Page Fuzzy Search** — Hybrid exact/fuzzy matching with q-gram seeding and cross-page seam handling.
21
+ - 📝 **Footnote Normalisation** — Converts OCR-confused numerals, fills empty references, and keeps body/footnote sets in sync.
22
+ - 🧮 **Bracket & Quote Balancing** — Detects mismatched punctuation with positional metadata for editor highlighting.
23
+ - 🧹 **Noise Classification** — Arabic-aware heuristics for punctuation spam, spacing artefacts, and mixed-script clutter.
24
+ - 🧾 **Comprehensive Typings** — Fully documented API surface with rich JSDoc coverage and generated declaration files.
25
+ - ⚙️ **Configurable Pipelines** — Fine-grained match policies, sanitisation presets, and typo symbol lists.
26
+ - 🧪 **High Test Coverage** — Extensive Bun test suite covering alignment, matching, sanitisation, and utility helpers.
27
+ - 🧳 **Lightweight Tooling** — Ships with the upstream `tsdown` bundler for fast Bun/Node builds and typed outputs.
28
28
 
29
29
  ## Installation
30
30
 
@@ -53,7 +53,7 @@ const correctedText = 'ู…ุญู…ุฏ ๏ทบ ุฑุณูˆู„ ุงู„ู„ู‡';
53
53
  const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
54
54
 
55
55
  const result = fixTypo(originalText, correctedText, { typoSymbols });
56
- console.log(result); // 'محمد صلى الله عليه ﷺ رسول الله'
56
+ console.log(result); // 'محمد ﷺ رسول الله عليه وسلم'
57
57
 
58
58
  // Noise detection for OCR cleanup
59
59
  const cleanText = isArabicTextNoise('السلام عليكم'); // false
@@ -197,7 +197,7 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
197
197
 
198
198
  **Parameters:**
199
199
 
200
- - `input` (string): The Arabic text to sanitize
200
+ - `input` (string | string[]): The Arabic text to sanitize (or an array for optimized batch processing)
201
201
  - `optionsOrPreset` (string | object): Either a preset name or custom options
202
202
 
203
203
  **Presets:**
@@ -206,13 +206,20 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
206
206
  - `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
207
207
  - `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
208
208
 
209
+ **Batch processing / factory:**
210
+
211
+ - Pass an array to resolve options once and sanitize many strings efficiently.
212
+ - Or pre-resolve options with `createArabicSanitizer(...)` and reuse the returned function.
213
+
209
214
  **Custom Options:**
210
215
 
211
216
  ```typescript
212
217
  interface SanitizeOptions {
213
218
  base?: 'light' | 'search' | 'aggressive' | 'none';
219
+ nfc?: boolean;
214
220
  stripDiacritics?: boolean;
215
- stripTatweel?: boolean;
221
+ stripFootnotes?: boolean;
222
+ stripTatweel?: boolean | 'safe' | 'all';
216
223
  normalizeAlif?: boolean;
217
224
  replaceAlifMaqsurah?: boolean;
218
225
  replaceTaMarbutahWithHa?: boolean;
@@ -227,10 +234,12 @@ interface SanitizeOptions {
227
234
  }
228
235
  ```
229
236
 
237
+ **Note on `nfc`**: NFC normalization does **not** remove diacritics; it canonicalizes equivalent sequences. This library applies an Arabic-focused NFC fast-path for common OCR compositions (e.g., Alif + combining hamza/madda), while `stripDiacritics` controls tashkīl removal.
238
+
230
239
  **Examples:**
231
240
 
232
241
  ```typescript
233
- import { sanitizeArabic } from 'baburchi';
242
+ import { createArabicSanitizer, sanitizeArabic } from 'baburchi';
234
243
 
235
244
  // Light display cleanup
236
245
  sanitizeArabic(' ู…ุฑุญุจุง\u200C\u200D ุจุงู„ุนุงู„ู… ', 'light'); // โ†’ 'ู…ุฑุญุจุง ุจุงู„ุนุงู„ู…'
@@ -244,6 +253,13 @@ sanitizeArabic('ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู 1435/3/29 ู‡ู€ โ€” www', 'aggressive'); /
244
253
  // Custom: Tatweel-only, preserving dates/list markers
245
254
  sanitizeArabic('ุฃุจู€ู€ู€ุชูู€ู€ู€ูƒูŽุฉู', { base: 'none', stripTatweel: true }); // โ†’ 'ุฃุจุชููƒูŽุฉู'
246
255
 
256
+ // Batch processing (optimized)
257
+ sanitizeArabic(['ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’', 'ุฃุจู€ู€ู€ุชูู€ู€ู€ูƒูŽุฉู'], 'search'); // โ†’ ['ุงู„ุณู„ุงู… ุนู„ูŠูƒู…', 'ุฃุจุชููƒูŽุฉู']
258
+
259
+ // Factory (pre-resolved options)
260
+ const sanitizeSearch = createArabicSanitizer('search');
261
+ ['ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’', 'ุฃุจู€ู€ู€ุชูู€ู€ู€ูƒูŽุฉู'].map(sanitizeSearch);
262
+
247
263
  // Zero-width controls → spaces
248
264
  sanitizeArabic('ูŠูŽุฎู’ู„ููˆูŽ โ€. โ€ ู‚ูŽุงู„ูŽ ุบูŽุฑููŠุจูŒ โ€. โ€', {
249
265
  base: 'none',
@@ -953,8 +969,9 @@ Contributions are welcome. Please ensure your contributions adhere to the coding
953
969
  2. Install dependencies: `bun install` (requires [Bun](https://bun.sh/))
954
970
  3. Make your changes
955
971
  4. Run tests: `bun test`
956
- 5. Run linting: `bun run lint`
957
- 6. Submit a pull request
972
+ 5. Build artefacts (optional verification): `bun run build`
973
+ 6. Run linting: `bun run lint`
974
+ 7. Submit a pull request
958
975
 
959
976
  ### Running Tests
960
977
 
package/dist/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ //#region src/alignment.d.ts
1
2
  /**
2
3
  * Aligns split text segments to match target lines by finding the best order.
3
4
  *
@@ -11,35 +12,36 @@
11
12
  * @returns Array of aligned text lines
12
13
  */
13
14
  declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
14
-
15
+ //#endregion
16
+ //#region src/balance.d.ts
15
17
  /**
16
18
  * Represents an error found when checking balance of quotes or brackets in text.
17
19
  */
18
20
  type BalanceError = {
19
- /** The character that caused the error */
20
- char: string;
21
- /** The position of the character in the string */
22
- index: number;
23
- /** The reason for the error */
24
- reason: 'mismatched' | 'unclosed' | 'unmatched';
25
- /** The type of character that caused the error */
26
- type: 'bracket' | 'quote';
21
+ /** The character that caused the error */
22
+ char: string;
23
+ /** The position of the character in the string */
24
+ index: number;
25
+ /** The reason for the error */
26
+ reason: 'mismatched' | 'unclosed' | 'unmatched';
27
+ /** The type of character that caused the error */
28
+ type: 'bracket' | 'quote';
27
29
  };
28
30
  /**
29
31
  * Result of a balance check operation.
30
32
  */
31
33
  type BalanceResult = {
32
- /** Array of errors found during balance checking */
33
- errors: BalanceError[];
34
- /** Whether the text is properly balanced */
35
- isBalanced: boolean;
34
+ /** Array of errors found during balance checking */
35
+ errors: BalanceError[];
36
+ /** Whether the text is properly balanced */
37
+ isBalanced: boolean;
36
38
  };
37
39
  /** Mapping of opening brackets to their corresponding closing brackets */
38
40
  declare const BRACKETS: {
39
- '\u00AB': string;
40
- '(': string;
41
- '[': string;
42
- '{': string;
41
+ '\u00AB': string;
42
+ '(': string;
43
+ '[': string;
44
+ '{': string;
43
45
  };
44
46
  /** Set of all opening bracket characters */
45
47
  declare const OPEN_BRACKETS: Set<string>;
@@ -70,14 +72,14 @@ declare const checkBalance: (str: string) => BalanceResult;
70
72
  * syntax highlighters that need precise character positioning.
71
73
  */
72
74
  interface CharacterError {
73
- /** Absolute character position from the start of the entire text */
74
- absoluteIndex: number;
75
- /** The character that caused the error */
76
- char: string;
77
- /** The reason for the error */
78
- reason: 'mismatched' | 'unclosed' | 'unmatched';
79
- /** The type of character that caused the error */
80
- type: 'bracket' | 'quote';
75
+ /** Absolute character position from the start of the entire text */
76
+ absoluteIndex: number;
77
+ /** The character that caused the error */
78
+ char: string;
79
+ /** The reason for the error */
80
+ reason: 'mismatched' | 'unclosed' | 'unmatched';
81
+ /** The type of character that caused the error */
82
+ type: 'bracket' | 'quote';
81
83
  }
82
84
  /**
83
85
  * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
@@ -148,7 +150,8 @@ declare const areBracketsBalanced: (str: string) => boolean;
148
150
  * ```
149
151
  */
150
152
  declare const isBalanced: (str: string) => boolean;
151
-
153
+ //#endregion
154
+ //#region src/footnotes.d.ts
152
155
  /**
153
156
  * Checks if the given text contains invalid footnote references.
154
157
  * Invalid footnotes include empty parentheses "()" or OCR-confused characters
@@ -163,8 +166,8 @@ declare const isBalanced: (str: string) => boolean;
163
166
  */
164
167
  declare const hasInvalidFootnotes: (text: string) => boolean;
165
168
  type TextLine = {
166
- isFootnote?: boolean;
167
- text: string;
169
+ isFootnote?: boolean;
170
+ text: string;
168
171
  };
169
172
  /**
170
173
  * Corrects footnote references in an array of text lines by:
@@ -184,56 +187,58 @@ type TextLine = {
184
187
  * // Returns lines with "()" replaced by proper Arabic numerals like "(ูก)"
185
188
  */
186
189
  declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
187
-
190
+ //#endregion
191
+ //#region src/types.d.ts
188
192
  /**
189
193
  * Configuration options for fixing typos in OCR text using alignment algorithms.
190
194
  * These options control how text tokens are compared, aligned, and merged during typo correction.
191
195
  */
192
196
  type FixTypoOptions = {
193
- /**
194
- * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
195
- * Used in post-processing to eliminate redundant tokens that are nearly identical.
196
- * Should typically be higher than similarityThreshold to catch only very similar duplicates.
197
- * @default 0.9
198
- * @example 0.95 // Removes tokens that are 95% or more similar
199
- */
200
- readonly highSimilarityThreshold: number;
201
- /**
202
- * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
203
- * Higher values require closer matches, lower values are more permissive.
204
- * Used in the Needleman-Wunsch alignment algorithm for token matching.
205
- * @default 0.7
206
- * @example 0.8 // Requires 80% similarity for token alignment
207
- */
208
- readonly similarityThreshold: number;
209
- /**
210
- * Array of special symbols that should be preserved during typo correction.
211
- * These symbols (like honorifics or religious markers) take precedence in token selection.
212
- * @example ['๏ทบ', '๏ทฝ', '๏ทป'] // Common Arabic religious symbols
213
- */
214
- readonly typoSymbols: string[];
197
+ /**
198
+ * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
199
+ * Used in post-processing to eliminate redundant tokens that are nearly identical.
200
+ * Should typically be higher than similarityThreshold to catch only very similar duplicates.
201
+ * @default 0.9
202
+ * @example 0.95 // Removes tokens that are 95% or more similar
203
+ */
204
+ readonly highSimilarityThreshold: number;
205
+ /**
206
+ * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
207
+ * Higher values require closer matches, lower values are more permissive.
208
+ * Used in the Needleman-Wunsch alignment algorithm for token matching.
209
+ * @default 0.7
210
+ * @example 0.8 // Requires 80% similarity for token alignment
211
+ */
212
+ readonly similarityThreshold: number;
213
+ /**
214
+ * Array of special symbols that should be preserved during typo correction.
215
+ * These symbols (like honorifics or religious markers) take precedence in token selection.
216
+ * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
217
+ */
218
+ readonly typoSymbols: string[];
215
219
  };
216
220
  type MatchPolicy = {
217
- /** Try approximate matches for leftovers (default true). */
218
- enableFuzzy?: boolean;
219
- /** Max absolute edit distance accepted in fuzzy (default 3). */
220
- maxEditAbs?: number;
221
- /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
222
- maxEditRel?: number;
223
- /** q-gram length for candidate generation (default 4). */
224
- q?: number;
225
- /** Max rare grams to seed candidates per excerpt (default 5). */
226
- gramsPerExcerpt?: number;
227
- /** Max candidate windows verified per excerpt (default 40). */
228
- maxCandidatesPerExcerpt?: number;
229
- /** Seam length for bleed windows (default 512). */
230
- seamLen?: number;
231
- /**
232
- * Optional logging function for debugging.
233
- */
234
- log?(message?: any, ...optionalParams: any[]): void;
221
+ /** Try approximate matches for leftovers (default true). */
222
+ enableFuzzy?: boolean;
223
+ /** Max absolute edit distance accepted in fuzzy (default 3). */
224
+ maxEditAbs?: number;
225
+ /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
226
+ maxEditRel?: number;
227
+ /** q-gram length for candidate generation (default 4). */
228
+ q?: number;
229
+ /** Max rare grams to seed candidates per excerpt (default 5). */
230
+ gramsPerExcerpt?: number;
231
+ /** Max candidate windows verified per excerpt (default 40). */
232
+ maxCandidatesPerExcerpt?: number;
233
+ /** Seam length for bleed windows (default 512). */
234
+ seamLen?: number;
235
+ /**
236
+ * Optional logging function for debugging.
237
+ */
238
+ log?(message?: any, ...optionalParams: any[]): void;
235
239
  };
236
-
240
+ //#endregion
241
+ //#region src/fuzzy.d.ts
237
242
  /**
238
243
  * Main function to find the single best match per excerpt.
239
244
  * Combines exact matching with fuzzy matching for comprehensive text search.
@@ -270,25 +275,26 @@ declare function findMatches(pages: string[], excerpts: string[], policy?: Match
270
275
  * ```
271
276
  */
272
277
  declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
273
-
278
+ //#endregion
279
+ //#region src/noise.d.ts
274
280
  /**
275
281
  * Character statistics for analyzing text content and patterns
276
282
  */
277
283
  type CharacterStats = {
278
- /** Number of Arabic script characters in the text */
279
- arabicCount: number;
280
- /** Map of character frequencies for repetition analysis */
281
- charFreq: Map<string, number>;
282
- /** Number of digit characters (0-9) in the text */
283
- digitCount: number;
284
- /** Number of Latin alphabet characters (a-z, A-Z) in the text */
285
- latinCount: number;
286
- /** Number of punctuation characters in the text */
287
- punctuationCount: number;
288
- /** Number of whitespace characters in the text */
289
- spaceCount: number;
290
- /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
291
- symbolCount: number;
284
+ /** Number of Arabic script characters in the text */
285
+ arabicCount: number;
286
+ /** Map of character frequencies for repetition analysis */
287
+ charFreq: Map<string, number>;
288
+ /** Number of digit characters (0-9) in the text */
289
+ digitCount: number;
290
+ /** Number of Latin alphabet characters (a-z, A-Z) in the text */
291
+ latinCount: number;
292
+ /** Number of punctuation characters in the text */
293
+ punctuationCount: number;
294
+ /** Number of whitespace characters in the text */
295
+ spaceCount: number;
296
+ /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
297
+ symbolCount: number;
292
298
  };
293
299
  /**
294
300
  * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
@@ -438,7 +444,8 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
438
444
  * ```
439
445
  */
440
446
  declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
441
-
447
+ //#endregion
448
+ //#region src/typos.d.ts
442
449
  /**
443
450
  * Processes text alignment between original and alternate OCR results to fix typos.
444
451
  * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
@@ -450,8 +457,21 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
450
457
  * @returns Corrected text with typos fixed
451
458
  */
452
459
  declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
453
- declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
454
-
460
+ /**
461
+ * Convenience wrapper around {@link processTextAlignment} that accepts partial options.
462
+ *
463
+ * @param original - The source text that may contain typographical errors.
464
+ * @param correction - The reference text used to correct the {@link original} text.
465
+ * @param options - Partial typo correction options combined with required typo symbols.
466
+ * @returns The corrected text generated from the alignment process.
467
+ */
468
+ declare const fixTypo: (original: string, correction: string, {
469
+ highSimilarityThreshold,
470
+ similarityThreshold,
471
+ typoSymbols
472
+ }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
473
+ //#endregion
474
+ //#region src/utils/levenshthein.d.ts
455
475
  /**
456
476
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
457
477
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -471,7 +491,8 @@ declare const calculateLevenshteinDistance: (textA: string, textB: string) => nu
471
491
  * More efficient when you only care about distances up to a threshold.
472
492
  */
473
493
  declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
474
-
494
+ //#endregion
495
+ //#region src/utils/sanitize.d.ts
475
496
  /**
476
497
  * Ultra-fast Arabic text sanitizer for search/indexing/display.
477
498
  * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
@@ -485,49 +506,69 @@ type SanitizeBase = 'none' | SanitizePreset;
485
506
  * directly into local booleans for speed.
486
507
  */
487
508
  type SanitizeOptions = {
488
- /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
489
- base?: SanitizeBase;
490
- /** Unicode NFC normalization. Default: `true` in all presets. */
491
- nfc?: boolean;
492
- /** Strip zero-width controls (U+200Bโ€“U+200F, U+202Aโ€“U+202E, U+2060โ€“U+2064, U+FEFF). Default: `true` in presets. */
493
- stripZeroWidth?: boolean;
494
- /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
495
- zeroWidthToSpace?: boolean;
496
- /** Remove Arabic diacritics (tashkฤซl). Default: `true` in `'search'`/`'aggressive'`. */
497
- stripDiacritics?: boolean;
498
- /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
499
- stripFootnotes?: boolean;
500
- /**
501
- * Remove tatweel (ู€).
502
- * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ู‡' for dates/list markers)
503
- * - `'safe'` or `'all'` explicitly
504
- * - `false` to keep tatweel
505
- * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
506
- */
507
- stripTatweel?: boolean | 'safe' | 'all';
508
- /** Normalize ุข/ุฃ/ุฅ โ†’ ุง. Default: `true` in `'search'`/`'aggressive'`. */
509
- normalizeAlif?: boolean;
510
- /** Replace ู‰ โ†’ ูŠ. Default: `true` in `'search'`/`'aggressive'`. */
511
- replaceAlifMaqsurah?: boolean;
512
- /** Replace ุฉ โ†’ ู‡ (lossy). Default: `true` in `'aggressive'` only. */
513
- replaceTaMarbutahWithHa?: boolean;
514
- /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
515
- stripLatinAndSymbols?: boolean;
516
- /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
517
- keepOnlyArabicLetters?: boolean;
518
- /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
519
- lettersAndSpacesOnly?: boolean;
520
- /** Collapse runs of whitespace to a single space. Default: `true`. */
521
- collapseWhitespace?: boolean;
522
- /** Trim leading/trailing whitespace. Default: `true`. */
523
- trim?: boolean;
524
- /**
525
- * Remove the Hijri date marker ("ู‡ู€" or bare "ู‡" if tatweel already removed) when it follows a date-like token
526
- * (digits/slashes/hyphens/spaces). Example: `1435/3/29 ู‡ู€` โ†’ `1435/3/29`.
527
- * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
528
- */
529
- removeHijriMarker?: boolean;
509
+ /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
510
+ base?: SanitizeBase;
511
+ /**
512
+ * NFC normalization (fast-path).
513
+ *
514
+ * For performance, this sanitizer avoids calling `String.prototype.normalize('NFC')` and instead
515
+ * applies the key Arabic canonical compositions inline (hamza/madda combining marks).
516
+ * This preserves the NFC behavior that matters for typical Arabic OCR text while keeping throughput high.
517
+ *
518
+ * Default: `true` in all presets.
519
+ */
520
+ nfc?: boolean;
521
+ /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
522
+ stripZeroWidth?: boolean;
523
+ /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
524
+ zeroWidthToSpace?: boolean;
525
+ /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
526
+ stripDiacritics?: boolean;
527
+ /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
528
+ stripFootnotes?: boolean;
529
+ /**
530
+ * Remove tatweel (ـ).
531
+ * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ู‡' for dates/list markers)
532
+ * - `'safe'` or `'all'` explicitly
533
+ * - `false` to keep tatweel
534
+ * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
535
+ */
536
+ stripTatweel?: boolean | 'safe' | 'all';
537
+ /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
538
+ normalizeAlif?: boolean;
539
+ /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
540
+ replaceAlifMaqsurah?: boolean;
541
+ /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
542
+ replaceTaMarbutahWithHa?: boolean;
543
+ /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
544
+ stripLatinAndSymbols?: boolean;
545
+ /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
546
+ keepOnlyArabicLetters?: boolean;
547
+ /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
548
+ lettersAndSpacesOnly?: boolean;
549
+ /** Collapse runs of whitespace to a single space. Default: `true`. */
550
+ collapseWhitespace?: boolean;
551
+ /** Trim leading/trailing whitespace. Default: `true`. */
552
+ trim?: boolean;
553
+ /**
554
+ * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
555
+ * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
556
+ * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
557
+ */
558
+ removeHijriMarker?: boolean;
530
559
  };
560
+ /**
561
+ * Creates a reusable sanitizer function with pre-resolved options.
562
+ * Use this when you need to sanitize many strings with the same options
563
+ * for maximum performance.
564
+ *
565
+ * @example
566
+ * ```ts
567
+ * const sanitize = createArabicSanitizer('search');
568
+ * const results = texts.map(sanitize);
569
+ * ```
570
+ */
571
+ declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | SanitizeOptions) => ((input: string) => string);
531
572
  /**
532
573
  * Sanitizes Arabic text according to a preset or custom options.
533
574
  *
@@ -540,15 +581,24 @@ type SanitizeOptions = {
540
581
  * - Passing an options object overlays the selected `base` preset (default `'light'`).
541
582
  * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
542
583
  *
584
+ * **Batch processing**: Pass an array of strings for optimized batch processing.
585
+ * Options are resolved once and applied to all strings, providing significant
586
+ * performance gains over calling the function in a loop.
587
+ *
543
588
  * Examples:
544
589
  * ```ts
545
590
  * sanitizeArabic('ุฃุจู€ู€ู€ุชูู€ู€ู€ูƒูŽุฉู', { base: 'none', stripTatweel: true }); // 'ุฃุจุชููƒูŽุฉู'
546
591
  * sanitizeArabic('1435/3/29 ู‡ู€', 'aggressive'); // '1435 3 29'
547
592
  * sanitizeArabic('ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’', 'search'); // 'ุงู„ุณู„ุงู… ุนู„ูŠูƒู…'
593
+ *
594
+ * // Batch processing (optimized):
595
+ * sanitizeArabic(['text1', 'text2', 'text3'], 'search'); // ['result1', 'result2', 'result3']
548
596
  * ```
549
597
  */
550
- declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
551
-
598
+ declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
599
+ declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
600
+ //#endregion
601
+ //#region src/utils/similarity.d.ts
552
602
  /**
553
603
  * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
554
604
  * Uses Levenshtein distance normalized by the length of the longer string.
@@ -592,8 +642,8 @@ declare const areSimilarAfterNormalization: (textA: string, textB: string, thres
592
642
  declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
593
643
  type AlignedTokenPair = [null | string, null | string];
594
644
  type AlignmentCell = {
595
- direction: 'diagonal' | 'left' | 'up' | null;
596
- score: number;
645
+ direction: 'diagonal' | 'left' | 'up' | null;
646
+ score: number;
597
647
  };
598
648
  /**
599
649
  * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
@@ -622,36 +672,37 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
622
672
  * // Returns [['a', 'a'], ['b', 'c']]
623
673
  */
624
674
  declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
625
-
675
+ //#endregion
676
+ //#region src/utils/textUtils.d.ts
626
677
  declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
627
678
  /**
628
679
  * Collection of regex patterns used throughout the library for text processing
629
680
  */
630
681
  declare const PATTERNS: {
631
- /** Matches Arabic characters across all Unicode blocks */
632
- arabicCharacters: RegExp;
633
- /** Matches Arabic-Indic digits (ู -ูฉ) and Western digits (0-9) */
634
- arabicDigits: RegExp;
635
- /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
636
- arabicFootnoteReferenceRegex: RegExp;
637
- /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ู -ูฉ) */
638
- arabicLettersAndDigits: RegExp;
639
- /** Matches Arabic punctuation marks and whitespace characters */
640
- arabicPunctuationAndWhitespace: RegExp;
641
- /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
642
- arabicReferenceRegex: RegExp;
643
- /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
644
- footnoteEmbedded: RegExp;
645
- /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[ุŒ.]?$ */
646
- footnoteStandalone: RegExp;
647
- /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
648
- invalidReferenceRegex: RegExp;
649
- /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
650
- ocrConfusedFootnoteReferenceRegex: RegExp;
651
- /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
652
- ocrConfusedReferenceRegex: RegExp;
653
- /** Matches one or more whitespace characters */
654
- whitespace: RegExp;
682
+ /** Matches Arabic characters across all Unicode blocks */
683
+ arabicCharacters: RegExp;
684
+ /** Matches Arabic-Indic digits (ู -ูฉ) and Western digits (0-9) */
685
+ arabicDigits: RegExp;
686
+ /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
687
+ arabicFootnoteReferenceRegex: RegExp;
688
+ /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ู -ูฉ) */
689
+ arabicLettersAndDigits: RegExp;
690
+ /** Matches Arabic punctuation marks and whitespace characters */
691
+ arabicPunctuationAndWhitespace: RegExp;
692
+ /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
693
+ arabicReferenceRegex: RegExp;
694
+ /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
695
+ footnoteEmbedded: RegExp;
696
+ /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[ุŒ.]?$ */
697
+ footnoteStandalone: RegExp;
698
+ /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
699
+ invalidReferenceRegex: RegExp;
700
+ /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
701
+ ocrConfusedFootnoteReferenceRegex: RegExp;
702
+ /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
703
+ ocrConfusedReferenceRegex: RegExp;
704
+ /** Matches one or more whitespace characters */
705
+ whitespace: RegExp;
655
706
  };
656
707
  /**
657
708
  * Extracts the first sequence of Arabic or Western digits from text.
@@ -758,5 +809,6 @@ declare const standardizeHijriSymbol: (text: string) => string;
758
809
  * @returns Text with standardized AH Hijri symbols
759
810
  */
760
811
  declare const standardizeIntahaSymbol: (text: string) => string;
761
-
762
- export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
812
+ //#endregion
813
+ export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
814
+ //# sourceMappingURL=index.d.ts.map