npm - baburchi - Versions diffs - 1.7.2 → 1.8.0 - Mend

baburchi 1.7.2 → 1.8.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -14,6 +14,10 @@
 A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms and comprehensive noise detection.
+## Demo
+Explore the interactive demo at <https://baburchi.surge.sh> to browse each exported helper, try Arabic-aware examples, and see formatting results in real time. The demo build ships with a `public/CNAME` file to keep the Surge domain in sync with deployments.
 ## Features
 - 🧠 **Sequence-Aware Typo Repair** &mdash; Needleman–Wunsch alignment with typo symbol preservation and duplicate pruning.
@@ -206,6 +210,27 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
 - `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
 - `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
+#### `sanitizeQuranForSearch(input)`
+Qur'an-oriented search normalization for cases where generic Arabic FTS cleanup is too destructive.
+This helper is intentionally narrower than the generic `"search"` preset:
+- normalizes alif wasla (`ٱ`) to bare alif
+- expands dagger alif where the imla'i form needs a real alif
+- preserves standard hamza letters such as `أ`, `إ`, and `ء`
+- preserves alif maqsurah unless a Qur'anic orthography fix requires otherwise
+- strips diacritics, tatweel, zero-width controls, and non-letter noise
+Example:
+```typescript
+import { sanitizeQuranForSearch } from 'baburchi';
+sanitizeQuranForSearch('ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى لِّلْمُتَّقِينَ');
+// → 'ذلك الكتاب لا ريب فيه هدى للمتقين'
+```
 **Batch processing / factory:**
 - Pass an array to resolve options once and sanitize many strings efficiently.

package/dist/index.d.ts CHANGED Viewed

@@ -18,30 +18,24 @@ declare const alignTextSegments: (targetLines: string[], segmentLines: string[])
  * Represents an error found when checking balance of quotes or brackets in text.
  */
 type BalanceError = {
-  /** The character that caused the error */
-  char: string;
-  /** The position of the character in the string */
-  index: number;
-  /** The reason for the error */
-  reason: 'mismatched' | 'unclosed' | 'unmatched';
-  /** The type of character that caused the error */
+  /** The character that caused the error */char: string; /** The position of the character in the string */
+  index: number; /** The reason for the error */
+  reason: 'mismatched' | 'unclosed' | 'unmatched'; /** The type of character that caused the error */
   type: 'bracket' | 'quote';
 };
 /**
  * Result of a balance check operation.
  */
 type BalanceResult = {
-  /** Array of errors found during balance checking */
-  errors: BalanceError[];
-  /** Whether the text is properly balanced */
+  /** Array of errors found during balance checking */errors: BalanceError[]; /** Whether the text is properly balanced */
   isBalanced: boolean;
 };
 /** Mapping of opening brackets to their corresponding closing brackets */
 declare const BRACKETS: {
-  '\u00AB': string;
   '(': string;
   '[': string;
   '{': string;
+  '\u00AB': string;
 };
 /** Set of all opening bracket characters */
 declare const OPEN_BRACKETS: Set<string>;
@@ -218,19 +212,12 @@ type FixTypoOptions = {
   readonly typoSymbols: string[];
 };
 type MatchPolicy = {
-  /** Try approximate matches for leftovers (default true). */
-  enableFuzzy?: boolean;
-  /** Max absolute edit distance accepted in fuzzy (default 3). */
-  maxEditAbs?: number;
-  /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
-  maxEditRel?: number;
-  /** q-gram length for candidate generation (default 4). */
-  q?: number;
-  /** Max rare grams to seed candidates per excerpt (default 5). */
-  gramsPerExcerpt?: number;
-  /** Max candidate windows verified per excerpt (default 40). */
-  maxCandidatesPerExcerpt?: number;
-  /** Seam length for bleed windows (default 512). */
+  /** Try approximate matches for leftovers (default true). */enableFuzzy?: boolean; /** Max absolute edit distance accepted in fuzzy (default 3). */
+  maxEditAbs?: number; /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
+  maxEditRel?: number; /** q-gram length for candidate generation (default 4). */
+  q?: number; /** Max rare grams to seed candidates per excerpt (default 5). */
+  gramsPerExcerpt?: number; /** Max candidate windows verified per excerpt (default 40). */
+  maxCandidatesPerExcerpt?: number; /** Seam length for bleed windows (default 512). */
   seamLen?: number;
   /**
    * Optional logging function for debugging.
@@ -281,19 +268,12 @@ declare function findMatchesAll(pages: string[], excerpts: string[], policy?: Ma
  * Character statistics for analyzing text content and patterns
  */
 type CharacterStats = {
-  /** Number of Arabic script characters in the text */
-  arabicCount: number;
-  /** Map of character frequencies for repetition analysis */
-  charFreq: Map<string, number>;
-  /** Number of digit characters (0-9) in the text */
-  digitCount: number;
-  /** Number of Latin alphabet characters (a-z, A-Z) in the text */
-  latinCount: number;
-  /** Number of punctuation characters in the text */
-  punctuationCount: number;
-  /** Number of whitespace characters in the text */
-  spaceCount: number;
-  /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
+  /** Number of Arabic script characters in the text */arabicCount: number; /** Map of character frequencies for repetition analysis */
+  charFreq: Map<string, number>; /** Number of digit characters (0-9) in the text */
+  digitCount: number; /** Number of Latin alphabet characters (a-z, A-Z) in the text */
+  latinCount: number; /** Number of punctuation characters in the text */
+  punctuationCount: number; /** Number of whitespace characters in the text */
+  spaceCount: number; /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
   symbolCount: number;
 };
 /**
@@ -506,8 +486,7 @@ type SanitizeBase = 'none' | SanitizePreset;
  * directly into local booleans for speed.
  */
 type SanitizeOptions = {
-  /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
-  base?: SanitizeBase;
+  /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */base?: SanitizeBase;
   /**
    * NFC normalization (fast-path).
    *
@@ -517,14 +496,10 @@ type SanitizeOptions = {
    *
    * Default: `true` in all presets.
    */
-  nfc?: boolean;
-  /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
-  stripZeroWidth?: boolean;
-  /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
-  zeroWidthToSpace?: boolean;
-  /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
-  stripDiacritics?: boolean;
-  /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
+  nfc?: boolean; /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
+  stripZeroWidth?: boolean; /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
+  zeroWidthToSpace?: boolean; /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
+  stripDiacritics?: boolean; /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
   stripFootnotes?: boolean;
   /**
    * Remove tatweel (ـ).
@@ -533,22 +508,14 @@ type SanitizeOptions = {
    * - `false` to keep tatweel
    * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
    */
-  stripTatweel?: boolean | 'safe' | 'all';
-  /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
-  normalizeAlif?: boolean;
-  /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
-  replaceAlifMaqsurah?: boolean;
-  /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
-  replaceTaMarbutahWithHa?: boolean;
-  /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
-  stripLatinAndSymbols?: boolean;
-  /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
-  keepOnlyArabicLetters?: boolean;
-  /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
-  lettersAndSpacesOnly?: boolean;
-  /** Collapse runs of whitespace to a single space. Default: `true`. */
-  collapseWhitespace?: boolean;
-  /** Trim leading/trailing whitespace. Default: `true`. */
+  stripTatweel?: boolean | 'safe' | 'all'; /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
+  normalizeAlif?: boolean; /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
+  replaceAlifMaqsurah?: boolean; /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
+  replaceTaMarbutahWithHa?: boolean; /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
+  stripLatinAndSymbols?: boolean; /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
+  keepOnlyArabicLetters?: boolean; /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
+  lettersAndSpacesOnly?: boolean; /** Collapse runs of whitespace to a single space. Default: `true`. */
+  collapseWhitespace?: boolean; /** Trim leading/trailing whitespace. Default: `true`. */
   trim?: boolean;
   /**
    * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
@@ -597,6 +564,20 @@ declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | Sanitiz
  */
 declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
 declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
+/**
+ * Produces a conservative Qur'an-specific search surface.
+ *
+ * This helper is intentionally narrower than the generic `search` preset:
+ * it preserves standard hamza forms and alif maqsurah while normalizing
+ * Qur'anic orthography that would otherwise damage lexical identity in FTS.
+ *
+ * Current behavior:
+ * - maps alif wasla (`ٱ`) to bare alif (`ا`)
+ * - expands dagger alif (`ٰ`) only in contexts where the imla'i form needs an alif
+ * - strips tashkeel, tatweel, footnotes, zero-width chars, and non-letter noise
+ * - keeps only Arabic letters and spaces
+ */
+declare const sanitizeQuranForSearch: (input: string) => string;
 //#endregion
 //#region src/utils/similarity.d.ts
 /**
@@ -679,29 +660,17 @@ declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
  * Collection of regex patterns used throughout the library for text processing
  */
 declare const PATTERNS: {
-  /** Matches Arabic characters across all Unicode blocks */
-  arabicCharacters: RegExp;
-  /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
-  arabicDigits: RegExp;
-  /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
-  arabicFootnoteReferenceRegex: RegExp;
-  /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
-  arabicLettersAndDigits: RegExp;
-  /** Matches Arabic punctuation marks and whitespace characters */
-  arabicPunctuationAndWhitespace: RegExp;
-  /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
-  arabicReferenceRegex: RegExp;
-  /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
-  footnoteEmbedded: RegExp;
-  /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
-  footnoteStandalone: RegExp;
-  /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
-  invalidReferenceRegex: RegExp;
-  /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
-  ocrConfusedFootnoteReferenceRegex: RegExp;
-  /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
-  ocrConfusedReferenceRegex: RegExp;
-  /** Matches one or more whitespace characters */
+  /** Matches Arabic characters across all Unicode blocks */arabicCharacters: RegExp; /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
+  arabicDigits: RegExp; /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
+  arabicFootnoteReferenceRegex: RegExp; /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
+  arabicLettersAndDigits: RegExp; /** Matches Arabic punctuation marks and whitespace characters */
+  arabicPunctuationAndWhitespace: RegExp; /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
+  arabicReferenceRegex: RegExp; /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
+  footnoteEmbedded: RegExp; /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
+  footnoteStandalone: RegExp; /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
+  invalidReferenceRegex: RegExp; /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
+  ocrConfusedFootnoteReferenceRegex: RegExp; /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
+  ocrConfusedReferenceRegex: RegExp; /** Matches one or more whitespace characters */
   whitespace: RegExp;
 };
 /**
@@ -810,5 +779,5 @@ declare const standardizeHijriSymbol: (text: string) => string;
  */
 declare const standardizeIntahaSymbol: (text: string) => string;
 //#endregion
-export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
+export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, sanitizeQuranForSearch, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
 //# sourceMappingURL=index.d.ts.map