npm - baburchi - Versions diffs - 1.4.0 → 1.6.0 - Mend

baburchi 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -62,7 +62,9 @@ const noiseText = isArabicTextNoise('---'); // true
 ## API Reference
-### `fixTypo(original, correction, options)`
+### Core Text Processing
+#### `fixTypo(original, correction, options)`
 The main function for correcting typos using text alignment.
@@ -80,7 +82,7 @@ The main function for correcting typos using text alignment.
 **Returns:** Corrected text string
-### `processTextAlignment(originalText, altText, options)`
+#### `processTextAlignment(originalText, altText, options)`
 Low-level function for advanced text processing with full configuration control.
@@ -90,6 +92,167 @@ Low-level function for advanced text processing with full configuration control.
 - `altText` (string): Reference text for alignment
 - `options` (FixTypoOptions): Complete configuration object
+### Fuzzy Text Matching
+#### `findMatches(pages, excerpts, policy?)`
+Finds the best matching page for each excerpt using exact and fuzzy matching algorithms.
+**Parameters:**
+- `pages` (string[]): Array of page texts to search within
+- `excerpts` (string[]): Array of text excerpts to find
+- `policy` (MatchPolicy, optional): Matching configuration
+**Returns:** `number[]` - Array of page indices (0-based) where each excerpt was found, or -1 if not found
+**Example:**
+```typescript
+import { findMatches } from 'baburchi';
+const pages = [
+    'هذا النص في الصفحة الأولى مع محتوى إضافي',
+    'النص الثاني يظهر هنا في الصفحة الثانية',
+    'الصفحة الثالثة تحتوي على نص مختلف'
+];
+const excerpts = [
+    'النص في الصفحة الأولى',
+    'النص الثاني يظهر',
+    'نص غير موجود'
+];
+const matches = findMatches(pages, excerpts);
+console.log(matches); // [0, 1, -1]
+```
+#### `findMatchesAll(pages, excerpts, policy?)`
+Finds all potential matches for each excerpt, ranked by match quality.
+**Parameters:**
+- `pages` (string[]): Array of page texts to search within
+- `excerpts` (string[]): Array of text excerpts to find
+- `policy` (MatchPolicy, optional): Matching configuration
+**Returns:** `number[][]` - Array where each element is an array of page indices ranked by match quality (exact matches first, then fuzzy matches by score)
+**Example:**
+```typescript
+import { findMatchesAll } from 'baburchi';
+const pages = [
+    'النص الأول مع محتوى مشابه',
+    'محتوى مشابه في النص الثاني',
+    'النص الأول بصيغة مختلفة قليلاً'
+];
+const excerpts = ['النص الأول'];
+const allMatches = findMatchesAll(pages, excerpts);
+console.log(allMatches); // [[0, 2]] - pages 0 and 2 both contain the excerpt; page 1 does not
+```
+#### Match Policy Configuration
+The `MatchPolicy` interface allows fine-tuning of the matching algorithm:
+```typescript
+interface MatchPolicy {
+    enableFuzzy?: boolean;           // Enable fuzzy matching (default: true)
+    maxEditAbs?: number;             // Max absolute edit distance (default: 3)
+    maxEditRel?: number;             // Max relative edit distance (default: 0.1)
+    q?: number;                      // Q-gram size for indexing (default: 4)
+    gramsPerExcerpt?: number;        // Q-grams to sample per excerpt (default: 5)
+    maxCandidatesPerExcerpt?: number; // Max candidates to evaluate (default: 40)
+    seamLen?: number;                // Cross-page seam length (default: 512)
+}
+```
+**Example with custom policy:**
+```typescript
+import { findMatches } from 'baburchi';
+const customPolicy: MatchPolicy = {
+    enableFuzzy: true,
+    maxEditAbs: 6,           // Allow more character differences
+    maxEditRel: 0.3,         // Allow 30% character differences
+    q: 4,                    // Use 4-grams for better precision
+    gramsPerExcerpt: 30,     // Sample more Q-grams
+    maxCandidatesPerExcerpt: 150
+};
+const matches = findMatches(pages, excerpts, customPolicy);
+```
+### Arabic Text Normalization
+#### `sanitizeArabic(input, optionsOrPreset)`
+Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabic text.
+**Parameters:**
+- `input` (string): The Arabic text to sanitize
+- `optionsOrPreset` (string | object, optional): Either a preset name or a custom options object
+**Presets:**
+- `"light"`: Basic cleanup for display (strips zero-width chars, collapses whitespace)
+- `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
+- `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
+**Custom Options:**
+```typescript
+interface SanitizeOptions {
+    base?: 'light' | 'search' | 'aggressive' | 'none';
+    stripDiacritics?: boolean;
+    stripTatweel?: boolean | 'safe' | 'all';
+    normalizeAlif?: boolean;
+    replaceAlifMaqsurah?: boolean;
+    replaceTaMarbutahWithHa?: boolean;
+    stripZeroWidth?: boolean;
+    zeroWidthToSpace?: boolean;
+    stripLatinAndSymbols?: boolean;
+    lettersAndSpacesOnly?: boolean;
+    keepOnlyArabicLetters?: boolean;
+    collapseWhitespace?: boolean;
+    trim?: boolean;
+    removeHijriMarker?: boolean;
+}
+```
+**Examples:**
+```typescript
+import { sanitizeArabic } from 'baburchi';
+// Light display cleanup
+sanitizeArabic('  مرحبا\u200C\u200D   بالعالم  ', 'light'); // → 'مرحبا بالعالم'
+// Tolerant search normalization
+sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // → 'السلام عليكم'
+// Indexing-friendly text (letters + spaces only)
+sanitizeArabic('اَلسَّلَامُ 1435/3/29 هـ — www', 'aggressive'); // → 'السلام'
+// Custom: Tatweel-only, preserving dates/list markers
+sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // → 'أبتِكَةُ'
+// Zero-width controls → spaces
+sanitizeArabic('يَخْلُوَ ‏. ‏ قَالَ غَرِيبٌ ‏. ‏', {
+    base: 'none',
+    stripZeroWidth: true,
+    zeroWidthToSpace: true
+});
+// → 'يَخْلُوَ  .   قَالَ غَرِيبٌ  .  '
+```
 ## Usage Examples
 ### Basic Arabic Text Correction
@@ -189,9 +352,9 @@ Baburchi uses the **Needleman-Wunsch global sequence alignment algorithm** to op
 Baburchi works in all modern environments:
-- ✅ Node.js 18+
-- ✅ Bun 1.0+
-- ✅ Modern browsers (ES2020+)
+- ✅ Node.js 22+
+- ✅ Bun 1.2.21+
+- ✅ Modern browsers (ES2023+)
 - ✅ Deno (with npm compatibility)
 ## TypeScript Support
@@ -308,6 +471,58 @@ This function is particularly useful for:
 - Handling cases where text layout affects line ordering
 - Processing documents where content has been split across multiple detection regions
+## Hijri Date Standardization
+Baburchi includes specialized functions for standardizing Hijri date symbols commonly found in Arabic historical and religious texts. These functions help normalize OCR inconsistencies in Hijri date notation.
+### `standardizeHijriSymbol(text)`
+Standardizes standalone ه to هـ when it follows digits (Arabic-Indic or Western), ensuring proper Hijri date notation.
+```typescript
+import { standardizeHijriSymbol } from 'baburchi';
+// Standardize after Arabic-Indic digits
+const text1 = standardizeHijriSymbol('سنة ١٤٤٥ ه'); // 'سنة ١٤٤٥ هـ'
+const text2 = standardizeHijriSymbol('عام ٧٥٠ه'); // 'عام ٧٥٠ هـ'
+// Standardize after Western digits
+const text3 = standardizeHijriSymbol('في عام 1445 ه'); // 'في عام 1445 هـ'
+const text4 = standardizeHijriSymbol('توفي 632ه'); // 'توفي 632 هـ'
+// Does not affect ه when part of other words
+const text5 = standardizeHijriSymbol('هذا كتاب'); // 'هذا كتاب' (unchanged)
+```
+### `standardizeIntahaSymbol(text)`
+Standardizes standalone اه to اهـ when appearing as a whole word, typically used in academic and historical texts.
+```typescript
+import { standardizeIntahaSymbol } from 'baburchi';
+// Standardize standalone AH abbreviation
+const text1 = standardizeIntahaSymbol('سنة 1445 اه'); // 'سنة 1445 اهـ'
+const text2 = standardizeIntahaSymbol('في العام اه'); // 'في العام اهـ'
+// Does not affect اه when part of other words
+const text3 = standardizeIntahaSymbol('الاهتمام بالتاريخ'); // 'الاهتمام بالتاريخ' (unchanged)
+```
+### Combined Hijri Standardization
+```typescript
+import { standardizeHijriSymbol, standardizeIntahaSymbol } from 'baburchi';
+function standardizeAllHijriNotations(text: string): string {
+    return standardizeIntahaSymbol(standardizeHijriSymbol(text));
+}
+const mixedText = 'وُلد سنة 570 ه وتوفي عام 632 اه';
+const standardized = standardizeAllHijriNotations(mixedText);
+console.log(standardized); // 'وُلد سنة 570 هـ وتوفي عام 632 اهـ'
+```
 ## Utilities
 The library also exports utility functions for advanced use cases:
@@ -315,20 +530,18 @@ The library also exports utility functions for advanced use cases:
 ```typescript
 import {
     calculateSimilarity,
-    normalizeArabicText,
     tokenizeText,
     alignTokenSequences,
     hasInvalidFootnotes,
     correctReferences,
     alignTextSegments,
+    standardizeHijriSymbol,
+    standardizeIntahaSymbol,
 } from 'baburchi';
 // Calculate similarity between two strings
 const similarity = calculateSimilarity('hello', 'helo'); // 0.8
-// Normalize Arabic text
-const normalized = normalizeArabicText('اَلسَّلَامُ'); // 'السلام'
 // Tokenize with symbol preservation
 const tokens = tokenizeText('محمد ﷺ رسول', ['ﷺ']); // ['محمد', 'ﷺ', 'رسول']
@@ -347,6 +560,10 @@ const aligned = alignTextSegments(
     ['target line one', '', 'target line three'],
     ['segment1', 'segment2', 'segment3', 'segment4'],
 );
+// Standardize Hijri date symbols
+const hijriText = standardizeHijriSymbol('سنة 1445 ه'); // 'سنة 1445 هـ'
+const ahText = standardizeIntahaSymbol('عام 632 اه'); // 'عام 632 اهـ'
 ```
 ## Noise Detection

package/dist/index.d.ts CHANGED Viewed

@@ -1,32 +1,3 @@
-/**
- * Configuration options for fixing typos in OCR text using alignment algorithms.
- * These options control how text tokens are compared, aligned, and merged during typo correction.
- */
-type FixTypoOptions = {
-    /**
-     * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
-     * Used in post-processing to eliminate redundant tokens that are nearly identical.
-     * Should typically be higher than similarityThreshold to catch only very similar duplicates.
-     * @default 0.9
-     * @example 0.95 // Removes tokens that are 95% or more similar
-     */
-    readonly highSimilarityThreshold: number;
-    /**
-     * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
-     * Higher values require closer matches, lower values are more permissive.
-     * Used in the Needleman-Wunsch alignment algorithm for token matching.
-     * @default 0.7
-     * @example 0.8 // Requires 80% similarity for token alignment
-     */
-    readonly similarityThreshold: number;
-    /**
-     * Array of special symbols that should be preserved during typo correction.
-     * These symbols (like honorifics or religious markers) take precedence in token selection.
-     * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
-     */
-    readonly typoSymbols: string[];
-};
 /**
  * Aligns split text segments to match target lines by finding the best order.
  *
@@ -214,6 +185,88 @@ type TextLine = {
  */
 declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
+/**
+ * Configuration options for fixing typos in OCR text using alignment algorithms.
+ * These options control how text tokens are compared, aligned, and merged during typo correction.
+ */
+type FixTypoOptions = {
+    /**
+     * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
+     * Used in post-processing to eliminate redundant tokens that are nearly identical.
+     * Should typically be higher than similarityThreshold to catch only very similar duplicates.
+     * @default 0.9
+     * @example 0.95 // Removes tokens that are 95% or more similar
+     */
+    readonly highSimilarityThreshold: number;
+    /**
+     * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
+     * Higher values require closer matches, lower values are more permissive.
+     * Used in the Needleman-Wunsch alignment algorithm for token matching.
+     * @default 0.7
+     * @example 0.8 // Requires 80% similarity for token alignment
+     */
+    readonly similarityThreshold: number;
+    /**
+     * Array of special symbols that should be preserved during typo correction.
+     * These symbols (like honorifics or religious markers) take precedence in token selection.
+     * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
+     */
+    readonly typoSymbols: string[];
+};
+type MatchPolicy = {
+    /** Try approximate matches for leftovers (default true). */
+    enableFuzzy?: boolean;
+    /** Max absolute edit distance accepted in fuzzy (default 3). */
+    maxEditAbs?: number;
+    /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
+    maxEditRel?: number;
+    /** q-gram length for candidate generation (default 4). */
+    q?: number;
+    /** Max rare grams to seed candidates per excerpt (default 5). */
+    gramsPerExcerpt?: number;
+    /** Max candidate windows verified per excerpt (default 40). */
+    maxCandidatesPerExcerpt?: number;
+    /** Seam length for bleed windows (default 512). */
+    seamLen?: number;
+};
+/**
+ * Main function to find the single best match per excerpt.
+ * Combines exact matching with fuzzy matching for comprehensive text search.
+ *
+ * @param pages - Array of page texts to search within
+ * @param excerpts - Array of text excerpts to find matches for
+ * @param policy - Optional matching policy configuration
+ * @returns Array of page indices (one per excerpt, -1 if no match found)
+ *
+ * @example
+ * ```typescript
+ * const pages = ['Hello world', 'Goodbye world'];
+ * const excerpts = ['Hello', 'Good bye']; // Note the typo
+ * const matches = findMatches(pages, excerpts, { enableFuzzy: true });
+ * // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
+ * ```
+ */
+declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
+/**
+ * Main function to find all matches per excerpt, ranked by quality.
+ * Returns comprehensive results with both exact and fuzzy matches for each excerpt.
+ *
+ * @param pages - Array of page texts to search within
+ * @param excerpts - Array of text excerpts to find matches for
+ * @param policy - Optional matching policy configuration
+ * @returns Array of page index arrays (one array per excerpt, sorted by match quality)
+ *
+ * @example
+ * ```typescript
+ * const pages = ['Hello world', 'Hello there', 'Goodbye world'];
+ * const excerpts = ['Hello'];
+ * const matches = findMatchesAll(pages, excerpts);
+ * // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
+ * ```
+ */
+declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
 /**
  * Character statistics for analyzing text content and patterns
  */
@@ -382,6 +435,19 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
  */
 declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
+/**
+ * Processes text alignment between original and alternate OCR results to fix typos.
+ * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
+ * then selects the best tokens and performs post-processing.
+ *
+ * @param originalText - Original OCR text that may contain typos
+ * @param altText - Reference text from alternate OCR for comparison
+ * @param options - Configuration options for alignment and selection
+ * @returns Corrected text with typos fixed
+ */
+declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
+declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
 /**
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -396,6 +462,87 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
  * calculateLevenshteinDistance('', 'hello') // Returns 5
  */
 declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
+/**
+ * Calculates bounded Levenshtein distance with early termination.
+ * More efficient when you only care about distances up to a threshold.
+ */
+declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
+/**
+ * Ultra-fast Arabic text sanitizer for search/indexing/display.
+ * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
+ * Options can merge over a base preset or `'none'` to apply exactly the rules you request.
+ */
+type SanitizePreset = 'light' | 'search' | 'aggressive';
+type SanitizeBase = 'none' | SanitizePreset;
+/**
+ * Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
+ * `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
+ * directly into local booleans for speed.
+ */
+type SanitizeOptions = {
+    /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
+    base?: SanitizeBase;
+    /** Unicode NFC normalization. Default: `true` in all presets. */
+    nfc?: boolean;
+    /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
+    stripZeroWidth?: boolean;
+    /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
+    zeroWidthToSpace?: boolean;
+    /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
+    stripDiacritics?: boolean;
+    /**
+     * Remove tatweel (ـ).
+     * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
+     * - `'safe'` or `'all'` explicitly
+     * - `false` to keep tatweel
+     * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
+     */
+    stripTatweel?: boolean | 'safe' | 'all';
+    /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
+    normalizeAlif?: boolean;
+    /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
+    replaceAlifMaqsurah?: boolean;
+    /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
+    replaceTaMarbutahWithHa?: boolean;
+    /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
+    stripLatinAndSymbols?: boolean;
+    /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
+    keepOnlyArabicLetters?: boolean;
+    /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
+    lettersAndSpacesOnly?: boolean;
+    /** Collapse runs of whitespace to a single space. Default: `true`. */
+    collapseWhitespace?: boolean;
+    /** Trim leading/trailing whitespace. Default: `true`. */
+    trim?: boolean;
+    /**
+     * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
+     * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
+     * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
+     */
+    removeHijriMarker?: boolean;
+};
+/**
+ * Sanitizes Arabic text according to a preset or custom options.
+ *
+ * Presets:
+ * - `'light'`: NFC, zero-width removal, collapse/trim spaces.
+ * - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
+ * - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
+ *
+ * Custom options:
+ * - Passing an options object overlays the selected `base` preset (default `'light'`).
+ * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
+ *
+ * Examples:
+ * ```ts
+ * sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
+ * sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
+ * sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
+ * ```
+ */
+declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
 /**
  * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
  * Uses Levenshtein distance normalized by the length of the longer string.
@@ -470,6 +617,7 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
  */
 declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
+declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
 /**
  * Collection of regex patterns used throughout the library for text processing
  */
@@ -486,8 +634,6 @@ declare const PATTERNS: {
     arabicPunctuationAndWhitespace: RegExp;
     /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
     arabicReferenceRegex: RegExp;
-    /** Matches Arabic diacritical marks (harakat, tanween, etc.) */
-    diacritics: RegExp;
     /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
     footnoteEmbedded: RegExp;
     /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
@@ -498,22 +644,9 @@ declare const PATTERNS: {
     ocrConfusedFootnoteReferenceRegex: RegExp;
     /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
     ocrConfusedReferenceRegex: RegExp;
-    /** Matches Arabic tatweel (kashida) character used for text stretching */
-    tatweel: RegExp;
     /** Matches one or more whitespace characters */
     whitespace: RegExp;
 };
-/**
- * Normalizes Arabic text by removing diacritics, and tatweel marks.
- * This normalization enables better text comparison by focusing on core characters
- * while ignoring decorative elements that don't affect meaning.
- *
- * @param text - Arabic text to normalize
- * @returns Normalized text with diacritics, tatweel, and basic tags removed
- * @example
- * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
- */
-declare const normalizeArabicText: (text: string) => string;
 /**
  * Extracts the first sequence of Arabic or Western digits from text.
  * Used primarily for footnote number comparison to match related footnote elements.
@@ -527,8 +660,8 @@ declare const normalizeArabicText: (text: string) => string;
 declare const extractDigits: (text: string) => string;
 /**
  * Tokenizes text into individual words while preserving special symbols.
- * Removes HTML tags, adds spacing around preserved symbols to ensure they
- * are tokenized separately, then splits on whitespace.
+ * Adds spacing around preserved symbols to ensure they are tokenized separately,
+ * then splits on whitespace.
  *
  * @param text - Text to tokenize
  * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
@@ -577,18 +710,17 @@ declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null
  * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
  */
 declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
 /**
- * Processes text alignment between original and alternate OCR results to fix typos.
- * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
- * then selects the best tokens and performs post-processing.
- *
- * @param originalText - Original OCR text that may contain typos
- * @param altText - Reference text from alternate OCR for comparison
- * @param options - Configuration options for alignment and selection
- * @returns Corrected text with typos fixed
+ * Standardizes standalone Hijri symbol ه to هـ when following Arabic-Indic or Western digits
+ * @param text - Input text to process
+ * @returns Text with standardized Hijri symbols
  */
-declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
-declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
+declare const standardizeHijriSymbol: (text: string) => string;
+/**
+ * Standardizes standalone اه to اهـ when appearing as whole word
+ * @param text - Input text to process
+ * @returns Text with standardized AH Hijri symbols
+ */
+declare const standardizeIntahaSymbol: (text: string) => string;
-export { BRACKETS, CLOSE_BRACKETS, type CharacterError, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, normalizeArabicText, processTextAlignment, tokenizeText };
+export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };