baburchi 1.7.1 → 1.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -15
- package/dist/index.d.ts +214 -162
- package/dist/index.js +2649 -2
- package/dist/index.js.map +1 -1
- package/package.json +12 -10
package/README.md
CHANGED
|
@@ -14,17 +14,21 @@
|
|
|
14
14
|
|
|
15
15
|
A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms and comprehensive noise detection.
|
|
16
16
|
|
|
17
|
+
## Demo
|
|
18
|
+
|
|
19
|
+
Explore the interactive demo at <https://baburchi.surge.sh> to browse each exported helper, try Arabic-aware examples, and see formatting results in real time. The demo build ships with a `public/CNAME` file to keep the Surge domain in sync with deployments.
|
|
20
|
+
|
|
17
21
|
## Features
|
|
18
22
|
|
|
19
|
-
- ๐ง **
|
|
20
|
-
-
|
|
21
|
-
-
|
|
22
|
-
-
|
|
23
|
-
-
|
|
24
|
-
-
|
|
25
|
-
-
|
|
26
|
-
-
|
|
27
|
-
-
|
|
23
|
+
- 🧠 **Sequence-Aware Typo Repair** — Needleman–Wunsch alignment with typo symbol preservation and duplicate pruning.
|
|
24
|
+
- ๐ **Multi-Page Fuzzy Search** — Hybrid exact/fuzzy matching with q-gram seeding and cross-page seam handling.
|
|
25
|
+
- ๐ **Footnote Normalisation** — Converts OCR-confused numerals, fills empty references, and keeps body/footnote sets in sync.
|
|
26
|
+
- 🧮 **Bracket & Quote Balancing** — Detects mismatched punctuation with positional metadata for editor highlighting.
|
|
27
|
+
- 🧹 **Noise Classification** — Arabic-aware heuristics for punctuation spam, spacing artefacts, and mixed-script clutter.
|
|
28
|
+
- 🧾 **Comprehensive Typings** — Fully documented API surface with rich JSDoc coverage and generated declaration files.
|
|
29
|
+
- ⚙️ **Configurable Pipelines** — Fine-grained match policies, sanitisation presets, and typo symbol lists.
|
|
30
|
+
- 🧪 **High Test Coverage** — Extensive Bun test suite covering alignment, matching, sanitisation, and utility helpers.
|
|
31
|
+
- 🧳 **Lightweight Tooling** — Ships with the upstream `tsdown` bundler for fast Bun/Node builds and typed outputs.
|
|
28
32
|
|
|
29
33
|
## Installation
|
|
30
34
|
|
|
@@ -53,7 +57,7 @@ const correctedText = 'محمد ﷺ رسول الله';
|
|
|
53
57
|
const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
|
|
54
58
|
|
|
55
59
|
const result = fixTypo(originalText, correctedText, { typoSymbols });
|
|
56
|
-
console.log(result); // 'ู
ุญู
ุฏ
|
|
60
|
+
console.log(result); // 'محمد ﷺ رسول الله عليه وسلم'
|
|
57
61
|
|
|
58
62
|
// Noise detection for OCR cleanup
|
|
59
63
|
const cleanText = isArabicTextNoise('السلام عليكم'); // false
|
|
@@ -197,7 +201,7 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
|
|
|
197
201
|
|
|
198
202
|
**Parameters:**
|
|
199
203
|
|
|
200
|
-
- `input` (string): The Arabic text to sanitize
|
|
204
|
+
- `input` (string | string[]): The Arabic text to sanitize (or an array for optimized batch processing)
|
|
201
205
|
- `optionsOrPreset` (string | object): Either a preset name or custom options
|
|
202
206
|
|
|
203
207
|
**Presets:**
|
|
@@ -206,13 +210,20 @@ Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabi
|
|
|
206
210
|
- `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
|
|
207
211
|
- `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
|
|
208
212
|
|
|
213
|
+
**Batch processing / factory:**
|
|
214
|
+
|
|
215
|
+
- Pass an array to resolve options once and sanitize many strings efficiently.
|
|
216
|
+
- Or pre-resolve options with `createArabicSanitizer(...)` and reuse the returned function.
|
|
217
|
+
|
|
209
218
|
**Custom Options:**
|
|
210
219
|
|
|
211
220
|
```typescript
|
|
212
221
|
interface SanitizeOptions {
|
|
213
222
|
base?: 'light' | 'search' | 'aggressive' | 'none';
|
|
223
|
+
nfc?: boolean;
|
|
214
224
|
stripDiacritics?: boolean;
|
|
215
|
-
|
|
225
|
+
stripFootnotes?: boolean;
|
|
226
|
+
stripTatweel?: boolean | 'safe' | 'all';
|
|
216
227
|
normalizeAlif?: boolean;
|
|
217
228
|
replaceAlifMaqsurah?: boolean;
|
|
218
229
|
replaceTaMarbutahWithHa?: boolean;
|
|
@@ -227,10 +238,12 @@ interface SanitizeOptions {
|
|
|
227
238
|
}
|
|
228
239
|
```
|
|
229
240
|
|
|
241
|
+
**Note on `nfc`**: NFC normalization does **not** remove diacritics; it canonicalizes equivalent sequences. This library applies an Arabic-focused NFC fast-path for common OCR compositions (e.g., Alif + combining hamza/madda), while `stripDiacritics` controls tashkīl removal.
|
|
242
|
+
|
|
230
243
|
**Examples:**
|
|
231
244
|
|
|
232
245
|
```typescript
|
|
233
|
-
import { sanitizeArabic } from 'baburchi';
|
|
246
|
+
import { createArabicSanitizer, sanitizeArabic } from 'baburchi';
|
|
234
247
|
|
|
235
248
|
// Light display cleanup
|
|
236
249
|
sanitizeArabic(' مرحبا\u200C\u200D بالعالم', 'light'); // → 'مرحبا بالعالم'
|
|
@@ -244,6 +257,13 @@ sanitizeArabic('ุงููุณููููุงู
ู 1435/3/29 ูู โ www', 'aggressive'); /
|
|
|
244
257
|
// Custom: Tatweel-only, preserving dates/list markers
|
|
245
258
|
sanitizeArabic('ุฃุจูููุชููููููุฉู', { base: 'none', stripTatweel: true }); // โ 'ุฃุจุชูููุฉู'
|
|
246
259
|
|
|
260
|
+
// Batch processing (optimized)
|
|
261
|
+
sanitizeArabic(['ุงููุณููููุงู
ู ุนูููููููู
ู', 'ุฃุจูููุชููููููุฉู'], 'search'); // โ ['ุงูุณูุงู
ุนูููู
', 'ุฃุจุชูููุฉู']
|
|
262
|
+
|
|
263
|
+
// Factory (pre-resolved options)
|
|
264
|
+
const sanitizeSearch = createArabicSanitizer('search');
|
|
265
|
+
['ุงููุณููููุงู
ู ุนูููููููู
ู', 'ุฃุจูููุชููููููุฉู'].map(sanitizeSearch);
|
|
266
|
+
|
|
247
267
|
// Zero-width controls → spaces
|
|
248
268
|
sanitizeArabic('ููุฎููููู โ. โ ููุงูู ุบูุฑููุจู โ. โ', {
|
|
249
269
|
base: 'none',
|
|
@@ -953,8 +973,9 @@ Contributions are welcome. Please ensure your contributions adhere to the coding
|
|
|
953
973
|
2. Install dependencies: `bun install` (requires [Bun](https://bun.sh/))
|
|
954
974
|
3. Make your changes
|
|
955
975
|
4. Run tests: `bun test`
|
|
956
|
-
5.
|
|
957
|
-
6.
|
|
976
|
+
5. Build artefacts (optional verification): `bun run build`
|
|
977
|
+
6. Run linting: `bun run lint`
|
|
978
|
+
7. Submit a pull request
|
|
958
979
|
|
|
959
980
|
### Running Tests
|
|
960
981
|
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
//#region src/alignment.d.ts
|
|
1
2
|
/**
|
|
2
3
|
* Aligns split text segments to match target lines by finding the best order.
|
|
3
4
|
*
|
|
@@ -11,35 +12,36 @@
|
|
|
11
12
|
* @returns Array of aligned text lines
|
|
12
13
|
*/
|
|
13
14
|
declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
|
|
14
|
-
|
|
15
|
+
//#endregion
|
|
16
|
+
//#region src/balance.d.ts
|
|
15
17
|
/**
|
|
16
18
|
* Represents an error found when checking balance of quotes or brackets in text.
|
|
17
19
|
*/
|
|
18
20
|
type BalanceError = {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
21
|
+
/** The character that caused the error */
|
|
22
|
+
char: string;
|
|
23
|
+
/** The position of the character in the string */
|
|
24
|
+
index: number;
|
|
25
|
+
/** The reason for the error */
|
|
26
|
+
reason: 'mismatched' | 'unclosed' | 'unmatched';
|
|
27
|
+
/** The type of character that caused the error */
|
|
28
|
+
type: 'bracket' | 'quote';
|
|
27
29
|
};
|
|
28
30
|
/**
|
|
29
31
|
* Result of a balance check operation.
|
|
30
32
|
*/
|
|
31
33
|
type BalanceResult = {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
/** Array of errors found during balance checking */
|
|
35
|
+
errors: BalanceError[];
|
|
36
|
+
/** Whether the text is properly balanced */
|
|
37
|
+
isBalanced: boolean;
|
|
36
38
|
};
|
|
37
39
|
/** Mapping of opening brackets to their corresponding closing brackets */
|
|
38
40
|
declare const BRACKETS: {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
'\u00AB': string;
|
|
42
|
+
'(': string;
|
|
43
|
+
'[': string;
|
|
44
|
+
'{': string;
|
|
43
45
|
};
|
|
44
46
|
/** Set of all opening bracket characters */
|
|
45
47
|
declare const OPEN_BRACKETS: Set<string>;
|
|
@@ -70,14 +72,14 @@ declare const checkBalance: (str: string) => BalanceResult;
|
|
|
70
72
|
* syntax highlighters that need precise character positioning.
|
|
71
73
|
*/
|
|
72
74
|
interface CharacterError {
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
75
|
+
/** Absolute character position from the start of the entire text */
|
|
76
|
+
absoluteIndex: number;
|
|
77
|
+
/** The character that caused the error */
|
|
78
|
+
char: string;
|
|
79
|
+
/** The reason for the error */
|
|
80
|
+
reason: 'mismatched' | 'unclosed' | 'unmatched';
|
|
81
|
+
/** The type of character that caused the error */
|
|
82
|
+
type: 'bracket' | 'quote';
|
|
81
83
|
}
|
|
82
84
|
/**
|
|
83
85
|
* Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
|
|
@@ -148,7 +150,8 @@ declare const areBracketsBalanced: (str: string) => boolean;
|
|
|
148
150
|
* ```
|
|
149
151
|
*/
|
|
150
152
|
declare const isBalanced: (str: string) => boolean;
|
|
151
|
-
|
|
153
|
+
//#endregion
|
|
154
|
+
//#region src/footnotes.d.ts
|
|
152
155
|
/**
|
|
153
156
|
* Checks if the given text contains invalid footnote references.
|
|
154
157
|
* Invalid footnotes include empty parentheses "()" or OCR-confused characters
|
|
@@ -163,8 +166,8 @@ declare const isBalanced: (str: string) => boolean;
|
|
|
163
166
|
*/
|
|
164
167
|
declare const hasInvalidFootnotes: (text: string) => boolean;
|
|
165
168
|
type TextLine = {
|
|
166
|
-
|
|
167
|
-
|
|
169
|
+
isFootnote?: boolean;
|
|
170
|
+
text: string;
|
|
168
171
|
};
|
|
169
172
|
/**
|
|
170
173
|
* Corrects footnote references in an array of text lines by:
|
|
@@ -184,56 +187,58 @@ type TextLine = {
|
|
|
184
187
|
* // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
|
|
185
188
|
*/
|
|
186
189
|
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
|
|
187
|
-
|
|
190
|
+
//#endregion
|
|
191
|
+
//#region src/types.d.ts
|
|
188
192
|
/**
|
|
189
193
|
* Configuration options for fixing typos in OCR text using alignment algorithms.
|
|
190
194
|
* These options control how text tokens are compared, aligned, and merged during typo correction.
|
|
191
195
|
*/
|
|
192
196
|
type FixTypoOptions = {
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
197
|
+
/**
|
|
198
|
+
* High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
|
|
199
|
+
* Used in post-processing to eliminate redundant tokens that are nearly identical.
|
|
200
|
+
* Should typically be higher than similarityThreshold to catch only very similar duplicates.
|
|
201
|
+
* @default 0.9
|
|
202
|
+
* @example 0.95 // Removes tokens that are 95% or more similar
|
|
203
|
+
*/
|
|
204
|
+
readonly highSimilarityThreshold: number;
|
|
205
|
+
/**
|
|
206
|
+
* Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
|
|
207
|
+
* Higher values require closer matches, lower values are more permissive.
|
|
208
|
+
* Used in the Needleman-Wunsch alignment algorithm for token matching.
|
|
209
|
+
* @default 0.7
|
|
210
|
+
* @example 0.8 // Requires 80% similarity for token alignment
|
|
211
|
+
*/
|
|
212
|
+
readonly similarityThreshold: number;
|
|
213
|
+
/**
|
|
214
|
+
* Array of special symbols that should be preserved during typo correction.
|
|
215
|
+
* These symbols (like honorifics or religious markers) take precedence in token selection.
|
|
216
|
+
* @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
|
|
217
|
+
*/
|
|
218
|
+
readonly typoSymbols: string[];
|
|
215
219
|
};
|
|
216
220
|
type MatchPolicy = {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
221
|
+
/** Try approximate matches for leftovers (default true). */
|
|
222
|
+
enableFuzzy?: boolean;
|
|
223
|
+
/** Max absolute edit distance accepted in fuzzy (default 3). */
|
|
224
|
+
maxEditAbs?: number;
|
|
225
|
+
/** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
|
|
226
|
+
maxEditRel?: number;
|
|
227
|
+
/** q-gram length for candidate generation (default 4). */
|
|
228
|
+
q?: number;
|
|
229
|
+
/** Max rare grams to seed candidates per excerpt (default 5). */
|
|
230
|
+
gramsPerExcerpt?: number;
|
|
231
|
+
/** Max candidate windows verified per excerpt (default 40). */
|
|
232
|
+
maxCandidatesPerExcerpt?: number;
|
|
233
|
+
/** Seam length for bleed windows (default 512). */
|
|
234
|
+
seamLen?: number;
|
|
235
|
+
/**
|
|
236
|
+
* Optional logging function for debugging.
|
|
237
|
+
*/
|
|
238
|
+
log?(message?: any, ...optionalParams: any[]): void;
|
|
235
239
|
};
|
|
236
|
-
|
|
240
|
+
//#endregion
|
|
241
|
+
//#region src/fuzzy.d.ts
|
|
237
242
|
/**
|
|
238
243
|
* Main function to find the single best match per excerpt.
|
|
239
244
|
* Combines exact matching with fuzzy matching for comprehensive text search.
|
|
@@ -270,25 +275,26 @@ declare function findMatches(pages: string[], excerpts: string[], policy?: Match
|
|
|
270
275
|
* ```
|
|
271
276
|
*/
|
|
272
277
|
declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
|
|
273
|
-
|
|
278
|
+
//#endregion
|
|
279
|
+
//#region src/noise.d.ts
|
|
274
280
|
/**
|
|
275
281
|
* Character statistics for analyzing text content and patterns
|
|
276
282
|
*/
|
|
277
283
|
type CharacterStats = {
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
284
|
+
/** Number of Arabic script characters in the text */
|
|
285
|
+
arabicCount: number;
|
|
286
|
+
/** Map of character frequencies for repetition analysis */
|
|
287
|
+
charFreq: Map<string, number>;
|
|
288
|
+
/** Number of digit characters (0-9) in the text */
|
|
289
|
+
digitCount: number;
|
|
290
|
+
/** Number of Latin alphabet characters (a-z, A-Z) in the text */
|
|
291
|
+
latinCount: number;
|
|
292
|
+
/** Number of punctuation characters in the text */
|
|
293
|
+
punctuationCount: number;
|
|
294
|
+
/** Number of whitespace characters in the text */
|
|
295
|
+
spaceCount: number;
|
|
296
|
+
/** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
|
|
297
|
+
symbolCount: number;
|
|
292
298
|
};
|
|
293
299
|
/**
|
|
294
300
|
* Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
|
|
@@ -438,7 +444,8 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
|
|
|
438
444
|
* ```
|
|
439
445
|
*/
|
|
440
446
|
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
|
|
441
|
-
|
|
447
|
+
//#endregion
|
|
448
|
+
//#region src/typos.d.ts
|
|
442
449
|
/**
|
|
443
450
|
* Processes text alignment between original and alternate OCR results to fix typos.
|
|
444
451
|
* Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
|
|
@@ -450,8 +457,21 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
|
|
|
450
457
|
* @returns Corrected text with typos fixed
|
|
451
458
|
*/
|
|
452
459
|
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
|
|
453
|
-
|
|
454
|
-
|
|
460
|
+
/**
|
|
461
|
+
* Convenience wrapper around {@link processTextAlignment} that accepts partial options.
|
|
462
|
+
*
|
|
463
|
+
* @param original - The source text that may contain typographical errors.
|
|
464
|
+
* @param correction - The reference text used to correct the {@link original} text.
|
|
465
|
+
* @param options - Partial typo correction options combined with required typo symbols.
|
|
466
|
+
* @returns The corrected text generated from the alignment process.
|
|
467
|
+
*/
|
|
468
|
+
declare const fixTypo: (original: string, correction: string, {
|
|
469
|
+
highSimilarityThreshold,
|
|
470
|
+
similarityThreshold,
|
|
471
|
+
typoSymbols
|
|
472
|
+
}: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
|
|
473
|
+
//#endregion
|
|
474
|
+
//#region src/utils/levenshthein.d.ts
|
|
455
475
|
/**
|
|
456
476
|
* Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
|
|
457
477
|
* The Levenshtein distance is the minimum number of single-character edits (insertions,
|
|
@@ -471,7 +491,8 @@ declare const calculateLevenshteinDistance: (textA: string, textB: string) => nu
|
|
|
471
491
|
* More efficient when you only care about distances up to a threshold.
|
|
472
492
|
*/
|
|
473
493
|
declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
|
|
474
|
-
|
|
494
|
+
//#endregion
|
|
495
|
+
//#region src/utils/sanitize.d.ts
|
|
475
496
|
/**
|
|
476
497
|
* Ultra-fast Arabic text sanitizer for search/indexing/display.
|
|
477
498
|
* Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
|
|
@@ -485,49 +506,69 @@ type SanitizeBase = 'none' | SanitizePreset;
|
|
|
485
506
|
* directly into local booleans for speed.
|
|
486
507
|
*/
|
|
487
508
|
type SanitizeOptions = {
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
509
|
+
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
|
|
510
|
+
base?: SanitizeBase;
|
|
511
|
+
/**
|
|
512
|
+
* NFC normalization (fast-path).
|
|
513
|
+
*
|
|
514
|
+
* For performance, this sanitizer avoids calling `String.prototype.normalize('NFC')` and instead
|
|
515
|
+
* applies the key Arabic canonical compositions inline (hamza/madda combining marks).
|
|
516
|
+
* This preserves the NFC behavior that matters for typical Arabic OCR text while keeping throughput high.
|
|
517
|
+
*
|
|
518
|
+
* Default: `true` in all presets.
|
|
519
|
+
*/
|
|
520
|
+
nfc?: boolean;
|
|
521
|
+
/** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
|
|
522
|
+
stripZeroWidth?: boolean;
|
|
523
|
+
/** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
|
|
524
|
+
zeroWidthToSpace?: boolean;
|
|
525
|
+
/** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
|
|
526
|
+
stripDiacritics?: boolean;
|
|
527
|
+
/** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
|
|
528
|
+
stripFootnotes?: boolean;
|
|
529
|
+
/**
|
|
530
|
+
* Remove tatweel (ـ).
|
|
531
|
+
* - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
|
|
532
|
+
* - `'safe'` or `'all'` explicitly
|
|
533
|
+
* - `false` to keep tatweel
|
|
534
|
+
* Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
535
|
+
*/
|
|
536
|
+
stripTatweel?: boolean | 'safe' | 'all';
|
|
537
|
+
/** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
|
|
538
|
+
normalizeAlif?: boolean;
|
|
539
|
+
/** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
|
|
540
|
+
replaceAlifMaqsurah?: boolean;
|
|
541
|
+
/** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
|
|
542
|
+
replaceTaMarbutahWithHa?: boolean;
|
|
543
|
+
/** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
|
|
544
|
+
stripLatinAndSymbols?: boolean;
|
|
545
|
+
/** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
|
|
546
|
+
keepOnlyArabicLetters?: boolean;
|
|
547
|
+
/** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
|
|
548
|
+
lettersAndSpacesOnly?: boolean;
|
|
549
|
+
/** Collapse runs of whitespace to a single space. Default: `true`. */
|
|
550
|
+
collapseWhitespace?: boolean;
|
|
551
|
+
/** Trim leading/trailing whitespace. Default: `true`. */
|
|
552
|
+
trim?: boolean;
|
|
553
|
+
/**
|
|
554
|
+
* Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
|
|
555
|
+
* (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
|
|
556
|
+
* Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
557
|
+
*/
|
|
558
|
+
removeHijriMarker?: boolean;
|
|
530
559
|
};
|
|
560
|
+
/**
|
|
561
|
+
* Creates a reusable sanitizer function with pre-resolved options.
|
|
562
|
+
* Use this when you need to sanitize many strings with the same options
|
|
563
|
+
* for maximum performance.
|
|
564
|
+
*
|
|
565
|
+
* @example
|
|
566
|
+
* ```ts
|
|
567
|
+
* const sanitize = createArabicSanitizer('search');
|
|
568
|
+
* const results = texts.map(sanitize);
|
|
569
|
+
* ```
|
|
570
|
+
*/
|
|
571
|
+
declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | SanitizeOptions) => ((input: string) => string);
|
|
531
572
|
/**
|
|
532
573
|
* Sanitizes Arabic text according to a preset or custom options.
|
|
533
574
|
*
|
|
@@ -540,15 +581,24 @@ type SanitizeOptions = {
|
|
|
540
581
|
* - Passing an options object overlays the selected `base` preset (default `'light'`).
|
|
541
582
|
* - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
|
|
542
583
|
*
|
|
584
|
+
* **Batch processing**: Pass an array of strings for optimized batch processing.
|
|
585
|
+
* Options are resolved once and applied to all strings, providing significant
|
|
586
|
+
* performance gains over calling the function in a loop.
|
|
587
|
+
*
|
|
543
588
|
* Examples:
|
|
544
589
|
* ```ts
|
|
545
590
|
* sanitizeArabic('ุฃุจูููุชููููููุฉู', { base: 'none', stripTatweel: true }); // 'ุฃุจุชูููุฉู'
|
|
546
591
|
* sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
|
|
547
592
|
* sanitizeArabic('ุงููุณููููุงู
ู ุนูููููููู
ู', 'search'); // 'ุงูุณูุงู
ุนูููู
'
|
|
593
|
+
*
|
|
594
|
+
* // Batch processing (optimized):
|
|
595
|
+
* sanitizeArabic(['text1', 'text2', 'text3'], 'search'); // ['result1', 'result2', 'result3']
|
|
548
596
|
* ```
|
|
549
597
|
*/
|
|
550
|
-
declare
|
|
551
|
-
|
|
598
|
+
declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
|
|
599
|
+
declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
|
|
600
|
+
//#endregion
|
|
601
|
+
//#region src/utils/similarity.d.ts
|
|
552
602
|
/**
|
|
553
603
|
* Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
|
|
554
604
|
* Uses Levenshtein distance normalized by the length of the longer string.
|
|
@@ -592,8 +642,8 @@ declare const areSimilarAfterNormalization: (textA: string, textB: string, thres
|
|
|
592
642
|
declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
|
|
593
643
|
type AlignedTokenPair = [null | string, null | string];
|
|
594
644
|
type AlignmentCell = {
|
|
595
|
-
|
|
596
|
-
|
|
645
|
+
direction: 'diagonal' | 'left' | 'up' | null;
|
|
646
|
+
score: number;
|
|
597
647
|
};
|
|
598
648
|
/**
|
|
599
649
|
* Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
|
|
@@ -622,36 +672,37 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
|
|
|
622
672
|
* // Returns [['a', 'a'], ['b', 'c']]
|
|
623
673
|
*/
|
|
624
674
|
declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
|
|
625
|
-
|
|
675
|
+
//#endregion
|
|
676
|
+
//#region src/utils/textUtils.d.ts
|
|
626
677
|
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
|
|
627
678
|
/**
|
|
628
679
|
* Collection of regex patterns used throughout the library for text processing
|
|
629
680
|
*/
|
|
630
681
|
declare const PATTERNS: {
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
682
|
+
/** Matches Arabic characters across all Unicode blocks */
|
|
683
|
+
arabicCharacters: RegExp;
|
|
684
|
+
/** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
|
|
685
|
+
arabicDigits: RegExp;
|
|
686
|
+
/** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
|
|
687
|
+
arabicFootnoteReferenceRegex: RegExp;
|
|
688
|
+
/** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
|
|
689
|
+
arabicLettersAndDigits: RegExp;
|
|
690
|
+
/** Matches Arabic punctuation marks and whitespace characters */
|
|
691
|
+
arabicPunctuationAndWhitespace: RegExp;
|
|
692
|
+
/** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
|
|
693
|
+
arabicReferenceRegex: RegExp;
|
|
694
|
+
/** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
|
|
695
|
+
footnoteEmbedded: RegExp;
|
|
696
|
+
/** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
|
|
697
|
+
footnoteStandalone: RegExp;
|
|
698
|
+
/** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
|
|
699
|
+
invalidReferenceRegex: RegExp;
|
|
700
|
+
/** Matches OCR-confused footnote references at line start with characters like .1OV9 */
|
|
701
|
+
ocrConfusedFootnoteReferenceRegex: RegExp;
|
|
702
|
+
/** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
|
|
703
|
+
ocrConfusedReferenceRegex: RegExp;
|
|
704
|
+
/** Matches one or more whitespace characters */
|
|
705
|
+
whitespace: RegExp;
|
|
655
706
|
};
|
|
656
707
|
/**
|
|
657
708
|
* Extracts the first sequence of Arabic or Western digits from text.
|
|
@@ -758,5 +809,6 @@ declare const standardizeHijriSymbol: (text: string) => string;
|
|
|
758
809
|
* @returns Text with standardized AH Hijri symbols
|
|
759
810
|
*/
|
|
760
811
|
declare const standardizeIntahaSymbol: (text: string) => string;
|
|
761
|
-
|
|
762
|
-
export { BRACKETS, CLOSE_BRACKETS,
|
|
812
|
+
//#endregion
|
|
813
|
+
export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|
|
814
|
+
//# sourceMappingURL=index.d.ts.map
|