baburchi 1.7.2 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/index.d.ts +56 -87
- package/dist/index.js +331 -263
- package/dist/index.js.map +1 -1
- package/package.json +9 -7
package/README.md
CHANGED
|
@@ -14,6 +14,10 @@
|
|
|
14
14
|
|
|
15
15
|
A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms and comprehensive noise detection.
|
|
16
16
|
|
|
17
|
+
## Demo
|
|
18
|
+
|
|
19
|
+
Explore the interactive demo at <https://baburchi.surge.sh> to browse each exported helper, try Arabic-aware examples, and see formatting results in real time. The demo build ships with a `public/CNAME` file to keep the Surge domain in sync with deployments.
|
|
20
|
+
|
|
17
21
|
## Features
|
|
18
22
|
|
|
19
23
|
- 🧠 **Sequence-Aware Typo Repair** — Needleman–Wunsch alignment with typo symbol preservation and duplicate pruning.
|
|
@@ -206,6 +210,27 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
|
|
|
206
210
|
- `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
|
|
207
211
|
- `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
|
|
208
212
|
|
|
213
|
+
#### `sanitizeQuranForSearch(input)`
|
|
214
|
+
|
|
215
|
+
Qur'an-oriented search normalization for cases where generic Arabic FTS cleanup is too destructive.
|
|
216
|
+
|
|
217
|
+
This helper is intentionally narrower than the generic `"search"` preset:
|
|
218
|
+
|
|
219
|
+
- normalizes alif wasla (`ٱ`) to bare alif
|
|
220
|
+
- expands dagger alif where the imla'i form needs a real alif
|
|
221
|
+
- preserves standard hamza letters such as `أ`, `إ`, and `ء`
|
|
222
|
+
- preserves alif maqsurah unless a Qur'anic orthography fix requires otherwise
|
|
223
|
+
- strips diacritics, tatweel, zero-width controls, and non-letter noise
|
|
224
|
+
|
|
225
|
+
Example:
|
|
226
|
+
|
|
227
|
+
```typescript
|
|
228
|
+
import { sanitizeQuranForSearch } from 'baburchi';
|
|
229
|
+
|
|
230
|
+
sanitizeQuranForSearch('ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى لِّلْمُتَّقِينَ');
|
|
231
|
+
// → 'ذلك الكتاب لا ريب فيه هدى للمتقين'
|
|
232
|
+
```
|
|
233
|
+
|
|
209
234
|
**Batch processing / factory:**
|
|
210
235
|
|
|
211
236
|
- Pass an array to resolve options once and sanitize many strings efficiently.
|
package/dist/index.d.ts
CHANGED
|
@@ -18,30 +18,24 @@ declare const alignTextSegments: (targetLines: string[], segmentLines: string[])
|
|
|
18
18
|
* Represents an error found when checking balance of quotes or brackets in text.
|
|
19
19
|
*/
|
|
20
20
|
type BalanceError = {
|
|
21
|
-
/** The character that caused the error */
|
|
22
|
-
|
|
23
|
-
/** The position of the character in the string */
|
|
24
|
-
index: number;
|
|
25
|
-
/** The reason for the error */
|
|
26
|
-
reason: 'mismatched' | 'unclosed' | 'unmatched';
|
|
27
|
-
/** The type of character that caused the error */
|
|
21
|
+
/** The character that caused the error */char: string; /** The position of the character in the string */
|
|
22
|
+
index: number; /** The reason for the error */
|
|
23
|
+
reason: 'mismatched' | 'unclosed' | 'unmatched'; /** The type of character that caused the error */
|
|
28
24
|
type: 'bracket' | 'quote';
|
|
29
25
|
};
|
|
30
26
|
/**
|
|
31
27
|
* Result of a balance check operation.
|
|
32
28
|
*/
|
|
33
29
|
type BalanceResult = {
|
|
34
|
-
/** Array of errors found during balance checking */
|
|
35
|
-
errors: BalanceError[];
|
|
36
|
-
/** Whether the text is properly balanced */
|
|
30
|
+
/** Array of errors found during balance checking */errors: BalanceError[]; /** Whether the text is properly balanced */
|
|
37
31
|
isBalanced: boolean;
|
|
38
32
|
};
|
|
39
33
|
/** Mapping of opening brackets to their corresponding closing brackets */
|
|
40
34
|
declare const BRACKETS: {
|
|
41
|
-
'\u00AB': string;
|
|
42
35
|
'(': string;
|
|
43
36
|
'[': string;
|
|
44
37
|
'{': string;
|
|
38
|
+
'\u00AB': string;
|
|
45
39
|
};
|
|
46
40
|
/** Set of all opening bracket characters */
|
|
47
41
|
declare const OPEN_BRACKETS: Set<string>;
|
|
@@ -218,19 +212,12 @@ type FixTypoOptions = {
|
|
|
218
212
|
readonly typoSymbols: string[];
|
|
219
213
|
};
|
|
220
214
|
type MatchPolicy = {
|
|
221
|
-
/** Try approximate matches for leftovers (default true). */
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
/** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
|
|
226
|
-
|
|
227
|
-
/** q-gram length for candidate generation (default 4). */
|
|
228
|
-
q?: number;
|
|
229
|
-
/** Max rare grams to seed candidates per excerpt (default 5). */
|
|
230
|
-
gramsPerExcerpt?: number;
|
|
231
|
-
/** Max candidate windows verified per excerpt (default 40). */
|
|
232
|
-
maxCandidatesPerExcerpt?: number;
|
|
233
|
-
/** Seam length for bleed windows (default 512). */
|
|
215
|
+
/** Try approximate matches for leftovers (default true). */enableFuzzy?: boolean; /** Max absolute edit distance accepted in fuzzy (default 3). */
|
|
216
|
+
maxEditAbs?: number; /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
|
|
217
|
+
maxEditRel?: number; /** q-gram length for candidate generation (default 4). */
|
|
218
|
+
q?: number; /** Max rare grams to seed candidates per excerpt (default 5). */
|
|
219
|
+
gramsPerExcerpt?: number; /** Max candidate windows verified per excerpt (default 40). */
|
|
220
|
+
maxCandidatesPerExcerpt?: number; /** Seam length for bleed windows (default 512). */
|
|
234
221
|
seamLen?: number;
|
|
235
222
|
/**
|
|
236
223
|
* Optional logging function for debugging.
|
|
@@ -281,19 +268,12 @@ declare function findMatchesAll(pages: string[], excerpts: string[], policy?: Ma
|
|
|
281
268
|
* Character statistics for analyzing text content and patterns
|
|
282
269
|
*/
|
|
283
270
|
type CharacterStats = {
|
|
284
|
-
/** Number of Arabic script characters in the text */
|
|
285
|
-
|
|
286
|
-
/** Map of character frequencies for repetition analysis */
|
|
287
|
-
|
|
288
|
-
/** Number of digit characters (0-9) in the text */
|
|
289
|
-
|
|
290
|
-
/** Number of Latin alphabet characters (a-z, A-Z) in the text */
|
|
291
|
-
latinCount: number;
|
|
292
|
-
/** Number of punctuation characters in the text */
|
|
293
|
-
punctuationCount: number;
|
|
294
|
-
/** Number of whitespace characters in the text */
|
|
295
|
-
spaceCount: number;
|
|
296
|
-
/** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
|
|
271
|
+
/** Number of Arabic script characters in the text */arabicCount: number; /** Map of character frequencies for repetition analysis */
|
|
272
|
+
charFreq: Map<string, number>; /** Number of digit characters (0-9) in the text */
|
|
273
|
+
digitCount: number; /** Number of Latin alphabet characters (a-z, A-Z) in the text */
|
|
274
|
+
latinCount: number; /** Number of punctuation characters in the text */
|
|
275
|
+
punctuationCount: number; /** Number of whitespace characters in the text */
|
|
276
|
+
spaceCount: number; /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
|
|
297
277
|
symbolCount: number;
|
|
298
278
|
};
|
|
299
279
|
/**
|
|
@@ -506,8 +486,7 @@ type SanitizeBase = 'none' | SanitizePreset;
|
|
|
506
486
|
* directly into local booleans for speed.
|
|
507
487
|
*/
|
|
508
488
|
type SanitizeOptions = {
|
|
509
|
-
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
|
|
510
|
-
base?: SanitizeBase;
|
|
489
|
+
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */base?: SanitizeBase;
|
|
511
490
|
/**
|
|
512
491
|
* NFC normalization (fast-path).
|
|
513
492
|
*
|
|
@@ -517,14 +496,10 @@ type SanitizeOptions = {
|
|
|
517
496
|
*
|
|
518
497
|
* Default: `true` in all presets.
|
|
519
498
|
*/
|
|
520
|
-
nfc?: boolean;
|
|
521
|
-
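/** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */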
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
zeroWidthToSpace?: boolean;
|
|
525
|
-
/** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
|
|
526
|
-
stripDiacritics?: boolean;
|
|
527
|
-
/** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
|
|
499
|
+
nfc?: boolean; /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
|
|
500
|
+
stripZeroWidth?: boolean; /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
|
|
501
|
+
zeroWidthToSpace?: boolean; /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
|
|
502
|
+
stripDiacritics?: boolean; /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
|
|
528
503
|
stripFootnotes?: boolean;
|
|
529
504
|
/**
|
|
530
505
|
* Remove tatweel (ـ).
|
|
@@ -533,22 +508,14 @@ type SanitizeOptions = {
|
|
|
533
508
|
* - `false` to keep tatweel
|
|
534
509
|
* Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
535
510
|
*/
|
|
536
|
-
stripTatweel?: boolean | 'safe' | 'all';
|
|
537
|
-
/** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
|
|
538
|
-
|
|
539
|
-
/** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
|
|
540
|
-
|
|
541
|
-
/** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
|
|
542
|
-
|
|
543
|
-
/** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
|
|
544
|
-
stripLatinAndSymbols?: boolean;
|
|
545
|
-
/** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
|
|
546
|
-
keepOnlyArabicLetters?: boolean;
|
|
547
|
-
/** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
|
|
548
|
-
lettersAndSpacesOnly?: boolean;
|
|
549
|
-
/** Collapse runs of whitespace to a single space. Default: `true`. */
|
|
550
|
-
collapseWhitespace?: boolean;
|
|
551
|
-
/** Trim leading/trailing whitespace. Default: `true`. */
|
|
511
|
+
stripTatweel?: boolean | 'safe' | 'all'; /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
|
|
512
|
+
normalizeAlif?: boolean; /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
|
|
513
|
+
replaceAlifMaqsurah?: boolean; /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
|
|
514
|
+
replaceTaMarbutahWithHa?: boolean; /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
|
|
515
|
+
stripLatinAndSymbols?: boolean; /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
|
|
516
|
+
keepOnlyArabicLetters?: boolean; /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
|
|
517
|
+
lettersAndSpacesOnly?: boolean; /** Collapse runs of whitespace to a single space. Default: `true`. */
|
|
518
|
+
collapseWhitespace?: boolean; /** Trim leading/trailing whitespace. Default: `true`. */
|
|
552
519
|
trim?: boolean;
|
|
553
520
|
/**
|
|
554
521
|
* Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
|
|
@@ -597,6 +564,20 @@ declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | Sanitiz
|
|
|
597
564
|
*/
|
|
598
565
|
declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
|
|
599
566
|
declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
|
|
567
|
+
/**
|
|
568
|
+
* Produces a conservative Qur'an-specific search surface.
|
|
569
|
+
*
|
|
570
|
+
* This helper is intentionally narrower than the generic `search` preset:
|
|
571
|
+
* it preserves standard hamza forms and alif maqsurah while normalizing
|
|
572
|
+
* Qur'anic orthography that would otherwise damage lexical identity in FTS.
|
|
573
|
+
*
|
|
574
|
+
* Current behavior:
|
|
575
|
+
* - maps alif wasla (`ٱ`) to bare alif (`ا`)
|
|
576
|
+
* - expands dagger alif (`ٰ`) only in contexts where the imla'i form needs an alif
|
|
577
|
+
* - strips tashkeel, tatweel, footnotes, zero-width chars, and non-letter noise
|
|
578
|
+
* - keeps only Arabic letters and spaces
|
|
579
|
+
*/
|
|
580
|
+
declare const sanitizeQuranForSearch: (input: string) => string;
|
|
600
581
|
//#endregion
|
|
601
582
|
//#region src/utils/similarity.d.ts
|
|
602
583
|
/**
|
|
@@ -679,29 +660,17 @@ declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
|
|
|
679
660
|
* Collection of regex patterns used throughout the library for text processing
|
|
680
661
|
*/
|
|
681
662
|
declare const PATTERNS: {
|
|
682
|
-
/** Matches Arabic characters across all Unicode blocks */
|
|
683
|
-
|
|
684
|
-
/** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
|
|
685
|
-
|
|
686
|
-
/** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
|
|
687
|
-
|
|
688
|
-
/** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
|
|
689
|
-
|
|
690
|
-
/** Matches Arabic punctuation marks and whitespace characters */
|
|
691
|
-
|
|
692
|
-
/** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
|
|
693
|
-
arabicReferenceRegex: RegExp;
|
|
694
|
-
/** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
|
|
695
|
-
footnoteEmbedded: RegExp;
|
|
696
|
-
/** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
|
|
697
|
-
footnoteStandalone: RegExp;
|
|
698
|
-
/** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
|
|
699
|
-
invalidReferenceRegex: RegExp;
|
|
700
|
-
/** Matches OCR-confused footnote references at line start with characters like .1OV9 */
|
|
701
|
-
ocrConfusedFootnoteReferenceRegex: RegExp;
|
|
702
|
-
/** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
|
|
703
|
-
ocrConfusedReferenceRegex: RegExp;
|
|
704
|
-
/** Matches one or more whitespace characters */
|
|
663
|
+
/** Matches Arabic characters across all Unicode blocks */arabicCharacters: RegExp; /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
|
|
664
|
+
arabicDigits: RegExp; /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
|
|
665
|
+
arabicFootnoteReferenceRegex: RegExp; /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
|
|
666
|
+
arabicLettersAndDigits: RegExp; /** Matches Arabic punctuation marks and whitespace characters */
|
|
667
|
+
arabicPunctuationAndWhitespace: RegExp; /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
|
|
668
|
+
arabicReferenceRegex: RegExp; /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
|
|
669
|
+
footnoteEmbedded: RegExp; /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
|
|
670
|
+
footnoteStandalone: RegExp; /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
|
|
671
|
+
invalidReferenceRegex: RegExp; /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
|
|
672
|
+
ocrConfusedFootnoteReferenceRegex: RegExp; /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
|
|
673
|
+
ocrConfusedReferenceRegex: RegExp; /** Matches one or more whitespace characters */
|
|
705
674
|
whitespace: RegExp;
|
|
706
675
|
};
|
|
707
676
|
/**
|
|
@@ -810,5 +779,5 @@ declare const standardizeHijriSymbol: (text: string) => string;
|
|
|
810
779
|
*/
|
|
811
780
|
declare const standardizeIntahaSymbol: (text: string) => string;
|
|
812
781
|
//#endregion
|
|
813
|
-
export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|
|
782
|
+
export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, sanitizeQuranForSearch, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|
|
814
783
|
//# sourceMappingURL=index.d.ts.map
|