npm - baburchi - Versions diffs - 1.7.1 → 1.7.2 - Mend

baburchi 1.7.1 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -16,15 +16,15 @@ A lightweight TypeScript library for intelligent OCR text post-processing, speci
 ## Features
-- 🧠 **Intelligent Text Alignment**: Uses the Needleman-Wunsch algorithm for optimal text sequence alignment
-- 🔤 **Arabic Text Specialization**: Advanced normalization and diacritics handling for Arabic text
-- 🧹 **Noise Detection**: Comprehensive Arabic text noise detection and OCR artifact identification
-- 📝 **Footnote Management**: Smart handling of embedded and standalone footnotes
-- ⚡ **High Performance**: Space-optimized algorithms with O(min(m,n)) space complexity
-- 🎯 **Special Symbol Preservation**: Configurable preservation of religious symbols and honorifics
-- 🔧 **Flexible Configuration**: Customizable similarity thresholds and typo symbols
-- 📦 **Zero Dependencies**: Pure TypeScript implementation with no external dependencies
-- 🌐 **Universal Compatibility**: Works in Node.js, Bun, and modern browsers
+- 🧠 **Sequence-Aware Typo Repair** &mdash; Needleman–Wunsch alignment with typo symbol preservation and duplicate pruning.
+- 📄 **Multi-Page Fuzzy Search** &mdash; Hybrid exact/fuzzy matching with q-gram seeding and cross-page seam handling.
+- 📝 **Footnote Normalisation** &mdash; Converts OCR-confused numerals, fills empty references, and keeps body/footnote sets in sync.
+- 🧮 **Bracket & Quote Balancing** &mdash; Detects mismatched punctuation with positional metadata for editor highlighting.
+- 🧹 **Noise Classification** &mdash; Arabic-aware heuristics for punctuation spam, spacing artefacts, and mixed-script clutter.
+- 🧾 **Comprehensive Typings** &mdash; Fully documented API surface with rich JSDoc coverage and generated declaration files.
+- ⚙️ **Configurable Pipelines** &mdash; Fine-grained match policies, sanitisation presets, and typo symbol lists.
+- 🧪 **High Test Coverage** &mdash; Extensive Bun test suite covering alignment, matching, sanitisation, and utility helpers.
+- 🧳 **Lightweight Tooling** &mdash; Ships with the upstream `tsdown` bundler for fast Bun/Node builds and typed outputs.
 ## Installation
@@ -53,7 +53,7 @@ const correctedText = 'محمد ﷺ رسول الله';
 const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
 const result = fixTypo(originalText, correctedText, { typoSymbols });
-console.log(result); // 'محمد صلى الله عليه ﷺ رسول الله'
+console.log(result); // 'محمد ﷺ رسول الله عليه وسلم'
 // Noise detection for OCR cleanup
 const cleanText = isArabicTextNoise('السلام عليكم'); // false
@@ -197,7 +197,7 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
 **Parameters:**
-- `input` (string): The Arabic text to sanitize
+- `input` (string | string[]): The Arabic text to sanitize (or an array for optimized batch processing)
 - `optionsOrPreset` (string | object): Either a preset name or custom options
 **Presets:**
@@ -206,13 +206,20 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
 - `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
 - `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
+**Batch processing / factory:**
+- Pass an array to resolve options once and sanitize many strings efficiently.
+- Or pre-resolve options with `createArabicSanitizer(...)` and reuse the returned function.
 **Custom Options:**
 ```typescript
 interface SanitizeOptions {
     base?: 'light' | 'search' | 'aggressive' | 'none';
+    nfc?: boolean;
     stripDiacritics?: boolean;
-    stripTatweel?: boolean;
+    stripFootnotes?: boolean;
+    stripTatweel?: boolean | 'safe' | 'all';
     normalizeAlif?: boolean;
     replaceAlifMaqsurah?: boolean;
     replaceTaMarbutahWithHa?: boolean;
@@ -227,10 +234,12 @@ interface SanitizeOptions {
 }
 ```
+**Note on `nfc`**: NFC normalization does **not** remove diacritics; it canonicalizes equivalent sequences. This library applies an Arabic-focused NFC fast-path for common OCR compositions (e.g., Alif + combining hamza/madda), while `stripDiacritics` controls tashkīl removal.
 **Examples:**
 ```typescript
-import { sanitizeArabic } from 'baburchi';
+import { createArabicSanitizer, sanitizeArabic } from 'baburchi';
 // Light display cleanup
 sanitizeArabic('  مرحبا\u200C\u200D   بالعالم  ', 'light'); // → 'مرحبا بالعالم'
@@ -244,6 +253,13 @@ sanitizeArabic('اَلسَّلَامُ 1435/3/29 هـ — www', 'aggressive'); /
 // Custom: Tatweel-only, preserving dates/list markers
 sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // → 'أبتِكَةُ'
+// Batch processing (optimized)
+sanitizeArabic(['اَلسَّلَامُ عَلَيْكُمْ', 'أبـــتِـــكَةُ'], 'search'); // → ['السلام عليكم', 'ابتكة']
+// Factory (pre-resolved options)
+const sanitizeSearch = createArabicSanitizer('search');
+['اَلسَّلَامُ عَلَيْكُمْ', 'أبـــتِـــكَةُ'].map(sanitizeSearch);
 // Zero-width controls → spaces
 sanitizeArabic('يَخْلُوَ ‏. ‏ قَالَ غَرِيبٌ ‏. ‏', {
     base: 'none',
@@ -953,8 +969,9 @@ Contributions are welcome. Please ensure your contributions adhere to the coding
 2. Install dependencies: `bun install` (requires [Bun](https://bun.sh/))
 3. Make your changes
 4. Run tests: `bun test`
-5. Run linting: `bun run lint`
-6. Submit a pull request
+5. Build artefacts (optional verification): `bun run build`
+6. Run linting: `bun run lint`
+7. Submit a pull request
 ### Running Tests

package/dist/index.d.ts CHANGED Viewed

@@ -1,3 +1,4 @@
+//#region src/alignment.d.ts
 /**
  * Aligns split text segments to match target lines by finding the best order.
  *
@@ -11,35 +12,36 @@
  * @returns Array of aligned text lines
  */
 declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
+//#endregion
+//#region src/balance.d.ts
 /**
  * Represents an error found when checking balance of quotes or brackets in text.
  */
 type BalanceError = {
-    /** The character that caused the error */
-    char: string;
-    /** The position of the character in the string */
-    index: number;
-    /** The reason for the error */
-    reason: 'mismatched' | 'unclosed' | 'unmatched';
-    /** The type of character that caused the error */
-    type: 'bracket' | 'quote';
+  /** The character that caused the error */
+  char: string;
+  /** The position of the character in the string */
+  index: number;
+  /** The reason for the error */
+  reason: 'mismatched' | 'unclosed' | 'unmatched';
+  /** The type of character that caused the error */
+  type: 'bracket' | 'quote';
 };
 /**
  * Result of a balance check operation.
  */
 type BalanceResult = {
-    /** Array of errors found during balance checking */
-    errors: BalanceError[];
-    /** Whether the text is properly balanced */
-    isBalanced: boolean;
+  /** Array of errors found during balance checking */
+  errors: BalanceError[];
+  /** Whether the text is properly balanced */
+  isBalanced: boolean;
 };
 /** Mapping of opening brackets to their corresponding closing brackets */
 declare const BRACKETS: {
-    '\u00AB': string;
-    '(': string;
-    '[': string;
-    '{': string;
+  '\u00AB': string;
+  '(': string;
+  '[': string;
+  '{': string;
 };
 /** Set of all opening bracket characters */
 declare const OPEN_BRACKETS: Set<string>;
@@ -70,14 +72,14 @@ declare const checkBalance: (str: string) => BalanceResult;
  * syntax highlighters that need precise character positioning.
  */
 interface CharacterError {
-    /** Absolute character position from the start of the entire text */
-    absoluteIndex: number;
-    /** The character that caused the error */
-    char: string;
-    /** The reason for the error */
-    reason: 'mismatched' | 'unclosed' | 'unmatched';
-    /** The type of character that caused the error */
-    type: 'bracket' | 'quote';
+  /** Absolute character position from the start of the entire text */
+  absoluteIndex: number;
+  /** The character that caused the error */
+  char: string;
+  /** The reason for the error */
+  reason: 'mismatched' | 'unclosed' | 'unmatched';
+  /** The type of character that caused the error */
+  type: 'bracket' | 'quote';
 }
 /**
  * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
@@ -148,7 +150,8 @@ declare const areBracketsBalanced: (str: string) => boolean;
  * ```
  */
 declare const isBalanced: (str: string) => boolean;
+//#endregion
+//#region src/footnotes.d.ts
 /**
  * Checks if the given text contains invalid footnote references.
  * Invalid footnotes include empty parentheses "()" or OCR-confused characters
@@ -163,8 +166,8 @@ declare const isBalanced: (str: string) => boolean;
  */
 declare const hasInvalidFootnotes: (text: string) => boolean;
 type TextLine = {
-    isFootnote?: boolean;
-    text: string;
+  isFootnote?: boolean;
+  text: string;
 };
 /**
  * Corrects footnote references in an array of text lines by:
@@ -184,56 +187,58 @@ type TextLine = {
  * // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
  */
 declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
+//#endregion
+//#region src/types.d.ts
 /**
  * Configuration options for fixing typos in OCR text using alignment algorithms.
  * These options control how text tokens are compared, aligned, and merged during typo correction.
  */
 type FixTypoOptions = {
-    /**
-     * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
-     * Used in post-processing to eliminate redundant tokens that are nearly identical.
-     * Should typically be higher than similarityThreshold to catch only very similar duplicates.
-     * @default 0.9
-     * @example 0.95 // Removes tokens that are 95% or more similar
-     */
-    readonly highSimilarityThreshold: number;
-    /**
-     * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
-     * Higher values require closer matches, lower values are more permissive.
-     * Used in the Needleman-Wunsch alignment algorithm for token matching.
-     * @default 0.7
-     * @example 0.8 // Requires 80% similarity for token alignment
-     */
-    readonly similarityThreshold: number;
-    /**
-     * Array of special symbols that should be preserved during typo correction.
-     * These symbols (like honorifics or religious markers) take precedence in token selection.
-     * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
-     */
-    readonly typoSymbols: string[];
+  /**
+   * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
+   * Used in post-processing to eliminate redundant tokens that are nearly identical.
+   * Should typically be higher than similarityThreshold to catch only very similar duplicates.
+   * @default 0.9
+   * @example 0.95 // Removes tokens that are 95% or more similar
+   */
+  readonly highSimilarityThreshold: number;
+  /**
+   * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
+   * Higher values require closer matches, lower values are more permissive.
+   * Used in the Needleman-Wunsch alignment algorithm for token matching.
+   * @default 0.7
+   * @example 0.8 // Requires 80% similarity for token alignment
+   */
+  readonly similarityThreshold: number;
+  /**
+   * Array of special symbols that should be preserved during typo correction.
+   * These symbols (like honorifics or religious markers) take precedence in token selection.
+   * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
+   */
+  readonly typoSymbols: string[];
 };
 type MatchPolicy = {
-    /** Try approximate matches for leftovers (default true). */
-    enableFuzzy?: boolean;
-    /** Max absolute edit distance accepted in fuzzy (default 3). */
-    maxEditAbs?: number;
-    /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
-    maxEditRel?: number;
-    /** q-gram length for candidate generation (default 4). */
-    q?: number;
-    /** Max rare grams to seed candidates per excerpt (default 5). */
-    gramsPerExcerpt?: number;
-    /** Max candidate windows verified per excerpt (default 40). */
-    maxCandidatesPerExcerpt?: number;
-    /** Seam length for bleed windows (default 512). */
-    seamLen?: number;
-    /**
-     * Optional logging function for debugging.
-     */
-    log?(message?: any, ...optionalParams: any[]): void;
+  /** Try approximate matches for leftovers (default true). */
+  enableFuzzy?: boolean;
+  /** Max absolute edit distance accepted in fuzzy (default 3). */
+  maxEditAbs?: number;
+  /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
+  maxEditRel?: number;
+  /** q-gram length for candidate generation (default 4). */
+  q?: number;
+  /** Max rare grams to seed candidates per excerpt (default 5). */
+  gramsPerExcerpt?: number;
+  /** Max candidate windows verified per excerpt (default 40). */
+  maxCandidatesPerExcerpt?: number;
+  /** Seam length for bleed windows (default 512). */
+  seamLen?: number;
+  /**
+   * Optional logging function for debugging.
+   */
+  log?(message?: any, ...optionalParams: any[]): void;
 };
+//#endregion
+//#region src/fuzzy.d.ts
 /**
  * Main function to find the single best match per excerpt.
  * Combines exact matching with fuzzy matching for comprehensive text search.
@@ -270,25 +275,26 @@ declare function findMatches(pages: string[], excerpts: string[], policy?: Match
  * ```
  */
 declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
+//#endregion
+//#region src/noise.d.ts
 /**
  * Character statistics for analyzing text content and patterns
  */
 type CharacterStats = {
-    /** Number of Arabic script characters in the text */
-    arabicCount: number;
-    /** Map of character frequencies for repetition analysis */
-    charFreq: Map<string, number>;
-    /** Number of digit characters (0-9) in the text */
-    digitCount: number;
-    /** Number of Latin alphabet characters (a-z, A-Z) in the text */
-    latinCount: number;
-    /** Number of punctuation characters in the text */
-    punctuationCount: number;
-    /** Number of whitespace characters in the text */
-    spaceCount: number;
-    /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
-    symbolCount: number;
+  /** Number of Arabic script characters in the text */
+  arabicCount: number;
+  /** Map of character frequencies for repetition analysis */
+  charFreq: Map<string, number>;
+  /** Number of digit characters (0-9) in the text */
+  digitCount: number;
+  /** Number of Latin alphabet characters (a-z, A-Z) in the text */
+  latinCount: number;
+  /** Number of punctuation characters in the text */
+  punctuationCount: number;
+  /** Number of whitespace characters in the text */
+  spaceCount: number;
+  /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
+  symbolCount: number;
 };
 /**
  * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
@@ -438,7 +444,8 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
  * ```
  */
 declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
+//#endregion
+//#region src/typos.d.ts
 /**
  * Processes text alignment between original and alternate OCR results to fix typos.
  * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
@@ -450,8 +457,21 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
  * @returns Corrected text with typos fixed
  */
 declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
-declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
+/**
+ * Convenience wrapper around {@link processTextAlignment} that accepts partial options.
+ *
+ * @param original - The source text that may contain typographical errors.
+ * @param correction - The reference text used to correct the `original` text.
+ * @param options - Partial typo correction options combined with required typo symbols.
+ * @returns The corrected text generated from the alignment process.
+ */
+declare const fixTypo: (original: string, correction: string, {
+  highSimilarityThreshold,
+  similarityThreshold,
+  typoSymbols
+}: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
+//#endregion
+//#region src/utils/levenshthein.d.ts
 /**
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -471,7 +491,8 @@ declare const calculateLevenshteinDistance: (textA: string, textB: string) => nu
  * More efficient when you only care about distances up to a threshold.
  */
 declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
+//#endregion
+//#region src/utils/sanitize.d.ts
 /**
  * Ultra-fast Arabic text sanitizer for search/indexing/display.
  * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
@@ -485,49 +506,69 @@ type SanitizeBase = 'none' | SanitizePreset;
  * directly into local booleans for speed.
  */
 type SanitizeOptions = {
-    /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
-    base?: SanitizeBase;
-    /** Unicode NFC normalization. Default: `true` in all presets. */
-    nfc?: boolean;
-    /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
-    stripZeroWidth?: boolean;
-    /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
-    zeroWidthToSpace?: boolean;
-    /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
-    stripDiacritics?: boolean;
-    /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
-    stripFootnotes?: boolean;
-    /**
-     * Remove tatweel (ـ).
-     * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
-     * - `'safe'` or `'all'` explicitly
-     * - `false` to keep tatweel
-     * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
-     */
-    stripTatweel?: boolean | 'safe' | 'all';
-    /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
-    normalizeAlif?: boolean;
-    /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
-    replaceAlifMaqsurah?: boolean;
-    /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
-    replaceTaMarbutahWithHa?: boolean;
-    /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
-    stripLatinAndSymbols?: boolean;
-    /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
-    keepOnlyArabicLetters?: boolean;
-    /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
-    lettersAndSpacesOnly?: boolean;
-    /** Collapse runs of whitespace to a single space. Default: `true`. */
-    collapseWhitespace?: boolean;
-    /** Trim leading/trailing whitespace. Default: `true`. */
-    trim?: boolean;
-    /**
-     * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
-     * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
-     * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
-     */
-    removeHijriMarker?: boolean;
+  /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
+  base?: SanitizeBase;
+  /**
+   * NFC normalization (fast-path).
+   *
+   * For performance, this sanitizer avoids calling `String.prototype.normalize('NFC')` and instead
+   * applies the key Arabic canonical compositions inline (hamza/madda combining marks).
+   * This preserves the NFC behavior that matters for typical Arabic OCR text while keeping throughput high.
+   *
+   * Default: `true` in all presets.
+   */
+  nfc?: boolean;
+  /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
+  stripZeroWidth?: boolean;
+  /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
+  zeroWidthToSpace?: boolean;
+  /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
+  stripDiacritics?: boolean;
+  /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
+  stripFootnotes?: boolean;
+  /**
+   * Remove tatweel (ـ).
+   * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
+   * - `'safe'` or `'all'` explicitly
+   * - `false` to keep tatweel
+   * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
+   */
+  stripTatweel?: boolean | 'safe' | 'all';
+  /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
+  normalizeAlif?: boolean;
+  /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
+  replaceAlifMaqsurah?: boolean;
+  /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
+  replaceTaMarbutahWithHa?: boolean;
+  /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
+  stripLatinAndSymbols?: boolean;
+  /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
+  keepOnlyArabicLetters?: boolean;
+  /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
+  lettersAndSpacesOnly?: boolean;
+  /** Collapse runs of whitespace to a single space. Default: `true`. */
+  collapseWhitespace?: boolean;
+  /** Trim leading/trailing whitespace. Default: `true`. */
+  trim?: boolean;
+  /**
+   * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
+   * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
+   * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
+   */
+  removeHijriMarker?: boolean;
 };
+/**
+ * Creates a reusable sanitizer function with pre-resolved options.
+ * Use this when you need to sanitize many strings with the same options
+ * for maximum performance.
+ *
+ * @example
+ * ```ts
+ * const sanitize = createArabicSanitizer('search');
+ * const results = texts.map(sanitize);
+ * ```
+ */
+declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | SanitizeOptions) => ((input: string) => string);
 /**
  * Sanitizes Arabic text according to a preset or custom options.
  *
@@ -540,15 +581,24 @@ type SanitizeOptions = {
  * - Passing an options object overlays the selected `base` preset (default `'light'`).
  * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
  *
+ * **Batch processing**: Pass an array of strings for optimized batch processing.
+ * Options are resolved once and applied to all strings, providing significant
+ * performance gains over calling the function in a loop.
+ *
  * Examples:
  * ```ts
  * sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
  * sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
  * sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
+ *
+ * // Batch processing (optimized):
+ * sanitizeArabic(['text1', 'text2', 'text3'], 'search'); // ['result1', 'result2', 'result3']
  * ```
  */
-declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
+declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
+declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
+//#endregion
+//#region src/utils/similarity.d.ts
 /**
  * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
  * Uses Levenshtein distance normalized by the length of the longer string.
@@ -592,8 +642,8 @@ declare const areSimilarAfterNormalization: (textA: string, textB: string, thres
 declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
 type AlignedTokenPair = [null | string, null | string];
 type AlignmentCell = {
-    direction: 'diagonal' | 'left' | 'up' | null;
-    score: number;
+  direction: 'diagonal' | 'left' | 'up' | null;
+  score: number;
 };
 /**
  * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
@@ -622,36 +672,37 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
  * // Returns [['a', 'a'], ['b', 'c']]
  */
 declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
+//#endregion
+//#region src/utils/textUtils.d.ts
 declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
 /**
  * Collection of regex patterns used throughout the library for text processing
  */
 declare const PATTERNS: {
-    /** Matches Arabic characters across all Unicode blocks */
-    arabicCharacters: RegExp;
-    /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
-    arabicDigits: RegExp;
-    /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
-    arabicFootnoteReferenceRegex: RegExp;
-    /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
-    arabicLettersAndDigits: RegExp;
-    /** Matches Arabic punctuation marks and whitespace characters */
-    arabicPunctuationAndWhitespace: RegExp;
-    /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
-    arabicReferenceRegex: RegExp;
-    /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
-    footnoteEmbedded: RegExp;
-    /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
-    footnoteStandalone: RegExp;
-    /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
-    invalidReferenceRegex: RegExp;
-    /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
-    ocrConfusedFootnoteReferenceRegex: RegExp;
-    /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
-    ocrConfusedReferenceRegex: RegExp;
-    /** Matches one or more whitespace characters */
-    whitespace: RegExp;
+  /** Matches Arabic characters across all Unicode blocks */
+  arabicCharacters: RegExp;
+  /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
+  arabicDigits: RegExp;
+  /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
+  arabicFootnoteReferenceRegex: RegExp;
+  /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
+  arabicLettersAndDigits: RegExp;
+  /** Matches Arabic punctuation marks and whitespace characters */
+  arabicPunctuationAndWhitespace: RegExp;
+  /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
+  arabicReferenceRegex: RegExp;
+  /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
+  footnoteEmbedded: RegExp;
+  /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
+  footnoteStandalone: RegExp;
+  /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
+  invalidReferenceRegex: RegExp;
+  /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
+  ocrConfusedFootnoteReferenceRegex: RegExp;
+  /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
+  ocrConfusedReferenceRegex: RegExp;
+  /** Matches one or more whitespace characters */
+  whitespace: RegExp;
 };
 /**
  * Extracts the first sequence of Arabic or Western digits from text.
@@ -758,5 +809,6 @@ declare const standardizeHijriSymbol: (text: string) => string;
  * @returns Text with standardized AH Hijri symbols
  */
 declare const standardizeIntahaSymbol: (text: string) => string;
-export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
+//#endregion
+export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
+//# sourceMappingURL=index.d.ts.map