baburchi 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +226 -9
- package/dist/index.d.ts +190 -58
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/package.json +11 -21
package/README.md
CHANGED
|
@@ -62,7 +62,9 @@ const noiseText = isArabicTextNoise('---'); // true
|
|
|
62
62
|
|
|
63
63
|
## API Reference
|
|
64
64
|
|
|
65
|
-
###
|
|
65
|
+
### Core Text Processing
|
|
66
|
+
|
|
67
|
+
#### `fixTypo(original, correction, options)`
|
|
66
68
|
|
|
67
69
|
The main function for correcting typos using text alignment.
|
|
68
70
|
|
|
@@ -80,7 +82,7 @@ The main function for correcting typos using text alignment.
|
|
|
80
82
|
|
|
81
83
|
**Returns:** Corrected text string
|
|
82
84
|
|
|
83
|
-
|
|
85
|
+
#### `processTextAlignment(originalText, altText, options)`
|
|
84
86
|
|
|
85
87
|
Low-level function for advanced text processing with full configuration control.
|
|
86
88
|
|
|
@@ -90,6 +92,167 @@ Low-level function for advanced text processing with full configuration control.
|
|
|
90
92
|
- `altText` (string): Reference text for alignment
|
|
91
93
|
- `options` (FixTypoOptions): Complete configuration object
|
|
92
94
|
|
|
95
|
+
### Fuzzy Text Matching
|
|
96
|
+
|
|
97
|
+
#### `findMatches(pages, excerpts, policy?)`
|
|
98
|
+
|
|
99
|
+
Finds the best matching page for each excerpt using exact and fuzzy matching algorithms.
|
|
100
|
+
|
|
101
|
+
**Parameters:**
|
|
102
|
+
|
|
103
|
+
- `pages` (string[]): Array of page texts to search within
|
|
104
|
+
- `excerpts` (string[]): Array of text excerpts to find
|
|
105
|
+
- `policy` (MatchPolicy, optional): Matching configuration
|
|
106
|
+
|
|
107
|
+
**Returns:** `number[]` - Array of page indices (0-based) where each excerpt was found, or -1 if not found
|
|
108
|
+
|
|
109
|
+
**Example:**
|
|
110
|
+
|
|
111
|
+
```typescript
|
|
112
|
+
import { findMatches } from 'baburchi';
|
|
113
|
+
|
|
114
|
+
const pages = [
|
|
115
|
+
'هذا النص في الصفحة الأولى مع محتوى إضافي',
|
|
116
|
+
'النص الثاني يظهر هنا في الصفحة الثانية',
|
|
117
|
+
'الصفحة الثالثة تحتوي على نص مختلف'
|
|
118
|
+
];
|
|
119
|
+
|
|
120
|
+
const excerpts = [
|
|
121
|
+
'النص في الصفحة الأولى',
|
|
122
|
+
'النص الثاني يظهر',
|
|
123
|
+
'نص غير موجود'
|
|
124
|
+
];
|
|
125
|
+
|
|
126
|
+
const matches = findMatches(pages, excerpts);
|
|
127
|
+
console.log(matches); // [0, 1, -1]
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
#### `findMatchesAll(pages, excerpts, policy?)`
|
|
131
|
+
|
|
132
|
+
Finds all potential matches for each excerpt, ranked by match quality.
|
|
133
|
+
|
|
134
|
+
**Parameters:**
|
|
135
|
+
|
|
136
|
+
- `pages` (string[]): Array of page texts to search within
|
|
137
|
+
- `excerpts` (string[]): Array of text excerpts to find
|
|
138
|
+
- `policy` (MatchPolicy, optional): Matching configuration
|
|
139
|
+
|
|
140
|
+
**Returns:** `number[][]` - Array where each element is an array of page indices ranked by match quality (exact matches first, then fuzzy matches by score)
|
|
141
|
+
|
|
142
|
+
**Example:**
|
|
143
|
+
|
|
144
|
+
```typescript
|
|
145
|
+
import { findMatchesAll } from 'baburchi';
|
|
146
|
+
|
|
147
|
+
const pages = [
|
|
148
|
+
'النص الأول مع محتوى مشابه',
|
|
149
|
+
'محتوى مشابه في النص الثاني',
|
|
150
|
+
'النص الأول بصيغة مختلفة قليلاً'
|
|
151
|
+
];
|
|
152
|
+
|
|
153
|
+
const excerpts = ['النص الأول'];
|
|
154
|
+
|
|
155
|
+
const allMatches = findMatchesAll(pages, excerpts);
|
|
156
|
+
console.log(allMatches); // [[0, 2]] - the excerpt appears verbatim on pages 0 and 2; page 1 does not match
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
#### Match Policy Configuration
|
|
160
|
+
|
|
161
|
+
The `MatchPolicy` interface allows fine-tuning of the matching algorithm:
|
|
162
|
+
|
|
163
|
+
```typescript
|
|
164
|
+
interface MatchPolicy {
|
|
165
|
+
enableFuzzy?: boolean; // Enable fuzzy matching (default: true)
|
|
166
|
+
maxEditAbs?: number; // Max absolute edit distance (default: 3)
|
|
167
|
+
maxEditRel?: number; // Max relative edit distance (default: 0.1)
|
|
168
|
+
q?: number; // Q-gram size for indexing (default: 4)
|
|
169
|
+
gramsPerExcerpt?: number; // Q-grams to sample per excerpt (default: 5)
|
|
170
|
+
maxCandidatesPerExcerpt?: number; // Max candidates to evaluate (default: 40)
|
|
171
|
+
seamLen?: number; // Cross-page seam length (default: 512)
|
|
172
|
+
}
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Example with custom policy:**
|
|
176
|
+
|
|
177
|
+
```typescript
|
|
178
|
+
import { findMatches } from 'baburchi';
|
|
179
|
+
|
|
180
|
+
const customPolicy: MatchPolicy = {
|
|
181
|
+
enableFuzzy: true,
|
|
182
|
+
maxEditAbs: 6, // Allow more character differences
|
|
183
|
+
maxEditRel: 0.3, // Allow 30% character differences
|
|
184
|
+
q: 4, // Use 4-grams for better precision
|
|
185
|
+
gramsPerExcerpt: 30, // Sample more Q-grams
|
|
186
|
+
maxCandidatesPerExcerpt: 150
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
const matches = findMatches(pages, excerpts, customPolicy);
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Arabic Text Normalization
|
|
193
|
+
|
|
194
|
+
#### `sanitizeArabic(input, optionsOrPreset)`
|
|
195
|
+
|
|
196
|
+
Unified Arabic text sanitizer that provides fast, configurable cleanup for Arabic text.
|
|
197
|
+
|
|
198
|
+
**Parameters:**
|
|
199
|
+
|
|
200
|
+
- `input` (string): The Arabic text to sanitize
|
|
201
|
+
- `optionsOrPreset` (string | object): Either a preset name or custom options
|
|
202
|
+
|
|
203
|
+
**Presets:**
|
|
204
|
+
|
|
205
|
+
- `"light"`: Basic cleanup for display (strips zero-width chars, collapses whitespace)
|
|
206
|
+
- `"search"`: Tolerant search normalization (removes diacritics, normalizes letters)
|
|
207
|
+
- `"aggressive"`: Indexing-friendly (letters and spaces only, removes everything else)
|
|
208
|
+
|
|
209
|
+
**Custom Options:**
|
|
210
|
+
|
|
211
|
+
```typescript
|
|
212
|
+
interface SanitizeOptions {
|
|
213
|
+
base?: 'light' | 'search' | 'aggressive' | 'none';
|
|
214
|
+
stripDiacritics?: boolean;
|
|
215
|
+
stripTatweel?: boolean | 'safe' | 'all';
|
|
216
|
+
normalizeAlif?: boolean;
|
|
217
|
+
replaceAlifMaqsurah?: boolean;
|
|
218
|
+
replaceTaMarbutahWithHa?: boolean;
|
|
219
|
+
stripZeroWidth?: boolean;
|
|
220
|
+
zeroWidthToSpace?: boolean;
|
|
221
|
+
stripLatinAndSymbols?: boolean;
|
|
222
|
+
lettersAndSpacesOnly?: boolean;
|
|
223
|
+
keepOnlyArabicLetters?: boolean;
|
|
224
|
+
collapseWhitespace?: boolean;
|
|
225
|
+
trim?: boolean;
|
|
226
|
+
removeHijriMarker?: boolean;
|
|
227
|
+
}
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
**Examples:**
|
|
231
|
+
|
|
232
|
+
```typescript
|
|
233
|
+
import { sanitizeArabic } from 'baburchi';
|
|
234
|
+
|
|
235
|
+
// Light display cleanup
|
|
236
|
+
sanitizeArabic(' مرحبا\u200C\u200D بالعالم ', 'light'); // → 'مرحبا بالعالم'
|
|
237
|
+
|
|
238
|
+
// Tolerant search normalization
|
|
239
|
+
sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // → 'السلام عليكم'
|
|
240
|
+
|
|
241
|
+
// Indexing-friendly text (letters + spaces only)
|
|
242
|
+
sanitizeArabic('اَلسَّلَامُ 1435/3/29 هـ — www', 'aggressive'); // → 'السلام'
|
|
243
|
+
|
|
244
|
+
// Custom: Tatweel-only, preserving dates/list markers
|
|
245
|
+
sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // → 'أبتِكَةُ'
|
|
246
|
+
|
|
247
|
+
// Zero-width controls → spaces
|
|
248
|
+
sanitizeArabic('يَخْلُوَ . قَالَ غَرِيبٌ . ', {
|
|
249
|
+
base: 'none',
|
|
250
|
+
stripZeroWidth: true,
|
|
251
|
+
zeroWidthToSpace: true
|
|
252
|
+
});
|
|
253
|
+
// → 'يَخْلُوَ . قَالَ غَرِيبٌ . '
|
|
254
|
+
```
|
|
255
|
+
|
|
93
256
|
## Usage Examples
|
|
94
257
|
|
|
95
258
|
### Basic Arabic Text Correction
|
|
@@ -189,9 +352,9 @@ Baburchi uses the **Needleman-Wunsch global sequence alignment algorithm** to op
|
|
|
189
352
|
|
|
190
353
|
Baburchi works in all modern environments:
|
|
191
354
|
|
|
192
|
-
- ✅ Node.js
|
|
193
|
-
- ✅ Bun 1.
|
|
194
|
-
- ✅ Modern browsers (
|
|
355
|
+
- ✅ Node.js 22+
|
|
356
|
+
- ✅ Bun 1.2.21+
|
|
357
|
+
- ✅ Modern browsers (ES2023+)
|
|
195
358
|
- ✅ Deno (with npm compatibility)
|
|
196
359
|
|
|
197
360
|
## TypeScript Support
|
|
@@ -308,6 +471,58 @@ This function is particularly useful for:
|
|
|
308
471
|
- Handling cases where text layout affects line ordering
|
|
309
472
|
- Processing documents where content has been split across multiple detection regions
|
|
310
473
|
|
|
474
|
+
## Hijri Date Standardization
|
|
475
|
+
|
|
476
|
+
Baburchi includes specialized functions for standardizing Hijri date symbols commonly found in Arabic historical and religious texts. These functions help normalize OCR inconsistencies in Hijri date notation.
|
|
477
|
+
|
|
478
|
+
### `standardizeHijriSymbol(text)`
|
|
479
|
+
|
|
480
|
+
Standardizes standalone ه to هـ when following Arabic-Indic or Western digits, ensuring proper Hijri date notation.
|
|
481
|
+
|
|
482
|
+
```typescript
|
|
483
|
+
import { standardizeHijriSymbol } from 'baburchi';
|
|
484
|
+
|
|
485
|
+
// Standardize after Arabic-Indic digits
|
|
486
|
+
const text1 = standardizeHijriSymbol('سنة ١٤٤٥ ه'); // 'سنة ١٤٤٥ هـ'
|
|
487
|
+
const text2 = standardizeHijriSymbol('عام ٧٥٠ه'); // 'عام ٧٥٠ هـ'
|
|
488
|
+
|
|
489
|
+
// Standardize after Western digits
|
|
490
|
+
const text3 = standardizeHijriSymbol('في عام 1445 ه'); // 'في عام 1445 هـ'
|
|
491
|
+
const text4 = standardizeHijriSymbol('توفي 632ه'); // 'توفي 632 هـ'
|
|
492
|
+
|
|
493
|
+
// Does not affect ه when part of other words
|
|
494
|
+
const text5 = standardizeHijriSymbol('هذا كتاب'); // 'هذا كتاب' (unchanged)
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
### `standardizeIntahaSymbol(text)`
|
|
498
|
+
|
|
499
|
+
Standardizes standalone اه to اهـ when appearing as a whole word, typically used in academic and historical texts.
|
|
500
|
+
|
|
501
|
+
```typescript
|
|
502
|
+
import { standardizeIntahaSymbol } from 'baburchi';
|
|
503
|
+
|
|
504
|
+
// Standardize standalone AH abbreviation
|
|
505
|
+
const text1 = standardizeIntahaSymbol('سنة 1445 اه'); // 'سنة 1445 اهـ'
|
|
506
|
+
const text2 = standardizeIntahaSymbol('في العام اه'); // 'في العام اهـ'
|
|
507
|
+
|
|
508
|
+
// Does not affect اه when part of other words
|
|
509
|
+
const text3 = standardizeIntahaSymbol('الاهتمام بالتاريخ'); // 'الاهتمام بالتاريخ' (unchanged)
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### Combined Hijri Standardization
|
|
513
|
+
|
|
514
|
+
```typescript
|
|
515
|
+
import { standardizeHijriSymbol, standardizeIntahaSymbol } from 'baburchi';
|
|
516
|
+
|
|
517
|
+
function standardizeAllHijriNotations(text: string): string {
|
|
518
|
+
return standardizeIntahaSymbol(standardizeHijriSymbol(text));
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
const mixedText = 'وُلد سنة 570 ه وتوفي عام 632 اه';
|
|
522
|
+
const standardized = standardizeAllHijriNotations(mixedText);
|
|
523
|
+
console.log(standardized); // 'وُلد سنة 570 هـ وتوفي عام 632 اهـ'
|
|
524
|
+
```
|
|
525
|
+
|
|
311
526
|
## Utilities
|
|
312
527
|
|
|
313
528
|
The library also exports utility functions for advanced use cases:
|
|
@@ -315,20 +530,18 @@ The library also exports utility functions for advanced use cases:
|
|
|
315
530
|
```typescript
|
|
316
531
|
import {
|
|
317
532
|
calculateSimilarity,
|
|
318
|
-
normalizeArabicText,
|
|
319
533
|
tokenizeText,
|
|
320
534
|
alignTokenSequences,
|
|
321
535
|
hasInvalidFootnotes,
|
|
322
536
|
correctReferences,
|
|
323
537
|
alignTextSegments,
|
|
538
|
+
standardizeHijriSymbol,
|
|
539
|
+
standardizeIntahaSymbol,
|
|
324
540
|
} from 'baburchi';
|
|
325
541
|
|
|
326
542
|
// Calculate similarity between two strings
|
|
327
543
|
const similarity = calculateSimilarity('hello', 'helo'); // 0.8
|
|
328
544
|
|
|
329
|
-
// Normalize Arabic text
|
|
330
|
-
const normalized = normalizeArabicText('اَلسَّلَامُ'); // 'السلام'
|
|
331
|
-
|
|
332
545
|
// Tokenize with symbol preservation
|
|
333
546
|
const tokens = tokenizeText('محمد ﷺ رسول', ['ﷺ']); // ['محمد', 'ﷺ', 'رسول']
|
|
334
547
|
|
|
@@ -347,6 +560,10 @@ const aligned = alignTextSegments(
|
|
|
347
560
|
['target line one', '', 'target line three'],
|
|
348
561
|
['segment1', 'segment2', 'segment3', 'segment4'],
|
|
349
562
|
);
|
|
563
|
+
|
|
564
|
+
// Standardize Hijri date symbols
|
|
565
|
+
const hijriText = standardizeHijriSymbol('سنة 1445 ه'); // 'سنة 1445 هـ'
|
|
566
|
+
const ahText = standardizeIntahaSymbol('عام 632 اه'); // 'عام 632 اهـ'
|
|
350
567
|
```
|
|
351
568
|
|
|
352
569
|
## Noise Detection
|
package/dist/index.d.ts
CHANGED
|
@@ -1,32 +1,3 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Configuration options for fixing typos in OCR text using alignment algorithms.
|
|
3
|
-
* These options control how text tokens are compared, aligned, and merged during typo correction.
|
|
4
|
-
*/
|
|
5
|
-
type FixTypoOptions = {
|
|
6
|
-
/**
|
|
7
|
-
* High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
|
|
8
|
-
* Used in post-processing to eliminate redundant tokens that are nearly identical.
|
|
9
|
-
* Should typically be higher than similarityThreshold to catch only very similar duplicates.
|
|
10
|
-
* @default 0.9
|
|
11
|
-
* @example 0.95 // Removes tokens that are 95% or more similar
|
|
12
|
-
*/
|
|
13
|
-
readonly highSimilarityThreshold: number;
|
|
14
|
-
/**
|
|
15
|
-
* Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
|
|
16
|
-
* Higher values require closer matches, lower values are more permissive.
|
|
17
|
-
* Used in the Needleman-Wunsch alignment algorithm for token matching.
|
|
18
|
-
* @default 0.7
|
|
19
|
-
* @example 0.8 // Requires 80% similarity for token alignment
|
|
20
|
-
*/
|
|
21
|
-
readonly similarityThreshold: number;
|
|
22
|
-
/**
|
|
23
|
-
* Array of special symbols that should be preserved during typo correction.
|
|
24
|
-
* These symbols (like honorifics or religious markers) take precedence in token selection.
|
|
25
|
-
* @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
|
|
26
|
-
*/
|
|
27
|
-
readonly typoSymbols: string[];
|
|
28
|
-
};
|
|
29
|
-
|
|
30
1
|
/**
|
|
31
2
|
* Aligns split text segments to match target lines by finding the best order.
|
|
32
3
|
*
|
|
@@ -214,6 +185,88 @@ type TextLine = {
|
|
|
214
185
|
*/
|
|
215
186
|
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
|
|
216
187
|
|
|
188
|
+
/**
|
|
189
|
+
* Configuration options for fixing typos in OCR text using alignment algorithms.
|
|
190
|
+
* These options control how text tokens are compared, aligned, and merged during typo correction.
|
|
191
|
+
*/
|
|
192
|
+
type FixTypoOptions = {
|
|
193
|
+
/**
|
|
194
|
+
* High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
|
|
195
|
+
* Used in post-processing to eliminate redundant tokens that are nearly identical.
|
|
196
|
+
* Should typically be higher than similarityThreshold to catch only very similar duplicates.
|
|
197
|
+
* @default 0.9
|
|
198
|
+
* @example 0.95 // Removes tokens that are 95% or more similar
|
|
199
|
+
*/
|
|
200
|
+
readonly highSimilarityThreshold: number;
|
|
201
|
+
/**
|
|
202
|
+
* Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
|
|
203
|
+
* Higher values require closer matches, lower values are more permissive.
|
|
204
|
+
* Used in the Needleman-Wunsch alignment algorithm for token matching.
|
|
205
|
+
* @default 0.7
|
|
206
|
+
* @example 0.8 // Requires 80% similarity for token alignment
|
|
207
|
+
*/
|
|
208
|
+
readonly similarityThreshold: number;
|
|
209
|
+
/**
|
|
210
|
+
* Array of special symbols that should be preserved during typo correction.
|
|
211
|
+
* These symbols (like honorifics or religious markers) take precedence in token selection.
|
|
212
|
+
* @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
|
|
213
|
+
*/
|
|
214
|
+
readonly typoSymbols: string[];
|
|
215
|
+
};
|
|
216
|
+
type MatchPolicy = {
|
|
217
|
+
/** Try approximate matches for leftovers (default true). */
|
|
218
|
+
enableFuzzy?: boolean;
|
|
219
|
+
/** Max absolute edit distance accepted in fuzzy (default 3). */
|
|
220
|
+
maxEditAbs?: number;
|
|
221
|
+
/** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
|
|
222
|
+
maxEditRel?: number;
|
|
223
|
+
/** q-gram length for candidate generation (default 4). */
|
|
224
|
+
q?: number;
|
|
225
|
+
/** Max rare grams to seed candidates per excerpt (default 5). */
|
|
226
|
+
gramsPerExcerpt?: number;
|
|
227
|
+
/** Max candidate windows verified per excerpt (default 40). */
|
|
228
|
+
maxCandidatesPerExcerpt?: number;
|
|
229
|
+
/** Seam length for bleed windows (default 512). */
|
|
230
|
+
seamLen?: number;
|
|
231
|
+
};
|
|
232
|
+
|
|
233
|
+
/**
|
|
234
|
+
* Main function to find the single best match per excerpt.
|
|
235
|
+
* Combines exact matching with fuzzy matching for comprehensive text search.
|
|
236
|
+
*
|
|
237
|
+
* @param pages - Array of page texts to search within
|
|
238
|
+
* @param excerpts - Array of text excerpts to find matches for
|
|
239
|
+
* @param policy - Optional matching policy configuration
|
|
240
|
+
* @returns Array of page indices (one per excerpt, -1 if no match found)
|
|
241
|
+
*
|
|
242
|
+
* @example
|
|
243
|
+
* ```typescript
|
|
244
|
+
* const pages = ['Hello world', 'Goodbye world'];
|
|
245
|
+
* const excerpts = ['Hello', 'Good bye']; // Note the typo
|
|
246
|
+
* const matches = findMatches(pages, excerpts, { enableFuzzy: true });
|
|
247
|
+
* // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
|
|
248
|
+
* ```
|
|
249
|
+
*/
|
|
250
|
+
declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
|
|
251
|
+
/**
|
|
252
|
+
* Main function to find all matches per excerpt, ranked by quality.
|
|
253
|
+
* Returns comprehensive results with both exact and fuzzy matches for each excerpt.
|
|
254
|
+
*
|
|
255
|
+
* @param pages - Array of page texts to search within
|
|
256
|
+
* @param excerpts - Array of text excerpts to find matches for
|
|
257
|
+
* @param policy - Optional matching policy configuration
|
|
258
|
+
* @returns Array of page index arrays (one array per excerpt, sorted by match quality)
|
|
259
|
+
*
|
|
260
|
+
* @example
|
|
261
|
+
* ```typescript
|
|
262
|
+
* const pages = ['Hello world', 'Hello there', 'Goodbye world'];
|
|
263
|
+
* const excerpts = ['Hello'];
|
|
264
|
+
* const matches = findMatchesAll(pages, excerpts);
|
|
265
|
+
* // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
|
|
266
|
+
* ```
|
|
267
|
+
*/
|
|
268
|
+
declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
|
|
269
|
+
|
|
217
270
|
/**
|
|
218
271
|
* Character statistics for analyzing text content and patterns
|
|
219
272
|
*/
|
|
@@ -382,6 +435,19 @@ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number,
|
|
|
382
435
|
*/
|
|
383
436
|
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
|
|
384
437
|
|
|
438
|
+
/**
|
|
439
|
+
* Processes text alignment between original and alternate OCR results to fix typos.
|
|
440
|
+
* Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
|
|
441
|
+
* then selects the best tokens and performs post-processing.
|
|
442
|
+
*
|
|
443
|
+
* @param originalText - Original OCR text that may contain typos
|
|
444
|
+
* @param altText - Reference text from alternate OCR for comparison
|
|
445
|
+
* @param options - Configuration options for alignment and selection
|
|
446
|
+
* @returns Corrected text with typos fixed
|
|
447
|
+
*/
|
|
448
|
+
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
|
|
449
|
+
declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
|
|
450
|
+
|
|
385
451
|
/**
|
|
386
452
|
* Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
|
|
387
453
|
* The Levenshtein distance is the minimum number of single-character edits (insertions,
|
|
@@ -396,6 +462,87 @@ declare function isValidArabicContent(charStats: CharacterStats, textLength: num
|
|
|
396
462
|
* calculateLevenshteinDistance('', 'hello') // Returns 5
|
|
397
463
|
*/
|
|
398
464
|
declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
|
|
465
|
+
/**
|
|
466
|
+
* Calculates bounded Levenshtein distance with early termination.
|
|
467
|
+
* More efficient when you only care about distances up to a threshold.
|
|
468
|
+
*/
|
|
469
|
+
declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
|
|
470
|
+
|
|
471
|
+
/**
|
|
472
|
+
* Ultra-fast Arabic text sanitizer for search/indexing/display.
|
|
473
|
+
* Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
|
|
474
|
+
* Options can merge over a base preset or `'none'` to apply exactly the rules you request.
|
|
475
|
+
*/
|
|
476
|
+
type SanitizePreset = 'light' | 'search' | 'aggressive';
|
|
477
|
+
type SanitizeBase = 'none' | SanitizePreset;
|
|
478
|
+
/**
|
|
479
|
+
* Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
|
|
480
|
+
* `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
|
|
481
|
+
* directly into local booleans for speed.
|
|
482
|
+
*/
|
|
483
|
+
type SanitizeOptions = {
|
|
484
|
+
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
|
|
485
|
+
base?: SanitizeBase;
|
|
486
|
+
/** Unicode NFC normalization. Default: `true` in all presets. */
|
|
487
|
+
nfc?: boolean;
|
|
488
|
+
/** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
|
|
489
|
+
stripZeroWidth?: boolean;
|
|
490
|
+
/** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
|
|
491
|
+
zeroWidthToSpace?: boolean;
|
|
492
|
+
/** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
|
|
493
|
+
stripDiacritics?: boolean;
|
|
494
|
+
/**
|
|
495
|
+
* Remove tatweel (ـ).
|
|
496
|
+
* - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
|
|
497
|
+
* - `'safe'` or `'all'` explicitly
|
|
498
|
+
* - `false` to keep tatweel
|
|
499
|
+
* Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
500
|
+
*/
|
|
501
|
+
stripTatweel?: boolean | 'safe' | 'all';
|
|
502
|
+
/** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
|
|
503
|
+
normalizeAlif?: boolean;
|
|
504
|
+
/** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
|
|
505
|
+
replaceAlifMaqsurah?: boolean;
|
|
506
|
+
/** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
|
|
507
|
+
replaceTaMarbutahWithHa?: boolean;
|
|
508
|
+
/** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
|
|
509
|
+
stripLatinAndSymbols?: boolean;
|
|
510
|
+
/** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
|
|
511
|
+
keepOnlyArabicLetters?: boolean;
|
|
512
|
+
/** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
|
|
513
|
+
lettersAndSpacesOnly?: boolean;
|
|
514
|
+
/** Collapse runs of whitespace to a single space. Default: `true`. */
|
|
515
|
+
collapseWhitespace?: boolean;
|
|
516
|
+
/** Trim leading/trailing whitespace. Default: `true`. */
|
|
517
|
+
trim?: boolean;
|
|
518
|
+
/**
|
|
519
|
+
* Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
|
|
520
|
+
* (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
|
|
521
|
+
* Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
522
|
+
*/
|
|
523
|
+
removeHijriMarker?: boolean;
|
|
524
|
+
};
|
|
525
|
+
/**
|
|
526
|
+
* Sanitizes Arabic text according to a preset or custom options.
|
|
527
|
+
*
|
|
528
|
+
* Presets:
|
|
529
|
+
* - `'light'`: NFC, zero-width removal, collapse/trim spaces.
|
|
530
|
+
* - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
|
|
531
|
+
* - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
|
|
532
|
+
*
|
|
533
|
+
* Custom options:
|
|
534
|
+
* - Passing an options object overlays the selected `base` preset (default `'light'`).
|
|
535
|
+
* - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
|
|
536
|
+
*
|
|
537
|
+
* Examples:
|
|
538
|
+
* ```ts
|
|
539
|
+
* sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
|
|
540
|
+
* sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
|
|
541
|
+
* sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
|
|
542
|
+
* ```
|
|
543
|
+
*/
|
|
544
|
+
declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
|
|
545
|
+
|
|
399
546
|
/**
|
|
400
547
|
* Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
|
|
401
548
|
* Uses Levenshtein distance normalized by the length of the longer string.
|
|
@@ -470,6 +617,7 @@ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[],
|
|
|
470
617
|
*/
|
|
471
618
|
declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
|
|
472
619
|
|
|
620
|
+
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
|
|
473
621
|
/**
|
|
474
622
|
* Collection of regex patterns used throughout the library for text processing
|
|
475
623
|
*/
|
|
@@ -486,8 +634,6 @@ declare const PATTERNS: {
|
|
|
486
634
|
arabicPunctuationAndWhitespace: RegExp;
|
|
487
635
|
/** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
|
|
488
636
|
arabicReferenceRegex: RegExp;
|
|
489
|
-
/** Matches Arabic diacritical marks (harakat, tanween, etc.) */
|
|
490
|
-
diacritics: RegExp;
|
|
491
637
|
/** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
|
|
492
638
|
footnoteEmbedded: RegExp;
|
|
493
639
|
/** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
|
|
@@ -498,22 +644,9 @@ declare const PATTERNS: {
|
|
|
498
644
|
ocrConfusedFootnoteReferenceRegex: RegExp;
|
|
499
645
|
/** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
|
|
500
646
|
ocrConfusedReferenceRegex: RegExp;
|
|
501
|
-
/** Matches Arabic tatweel (kashida) character used for text stretching */
|
|
502
|
-
tatweel: RegExp;
|
|
503
647
|
/** Matches one or more whitespace characters */
|
|
504
648
|
whitespace: RegExp;
|
|
505
649
|
};
|
|
506
|
-
/**
|
|
507
|
-
* Normalizes Arabic text by removing diacritics, and tatweel marks.
|
|
508
|
-
* This normalization enables better text comparison by focusing on core characters
|
|
509
|
-
* while ignoring decorative elements that don't affect meaning.
|
|
510
|
-
*
|
|
511
|
-
* @param text - Arabic text to normalize
|
|
512
|
-
* @returns Normalized text with diacritics, tatweel, and basic tags removed
|
|
513
|
-
* @example
|
|
514
|
-
* normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
|
|
515
|
-
*/
|
|
516
|
-
declare const normalizeArabicText: (text: string) => string;
|
|
517
650
|
/**
|
|
518
651
|
* Extracts the first sequence of Arabic or Western digits from text.
|
|
519
652
|
* Used primarily for footnote number comparison to match related footnote elements.
|
|
@@ -527,8 +660,8 @@ declare const normalizeArabicText: (text: string) => string;
|
|
|
527
660
|
declare const extractDigits: (text: string) => string;
|
|
528
661
|
/**
|
|
529
662
|
* Tokenizes text into individual words while preserving special symbols.
|
|
530
|
-
*
|
|
531
|
-
*
|
|
663
|
+
* Adds spacing around preserved symbols to ensure they are tokenized separately,
|
|
664
|
+
* then splits on whitespace.
|
|
532
665
|
*
|
|
533
666
|
* @param text - Text to tokenize
|
|
534
667
|
* @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
|
|
@@ -577,18 +710,17 @@ declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null
|
|
|
577
710
|
* handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
|
|
578
711
|
*/
|
|
579
712
|
declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
|
|
580
|
-
|
|
581
713
|
/**
|
|
582
|
-
*
|
|
583
|
-
*
|
|
584
|
-
*
|
|
585
|
-
*
|
|
586
|
-
* @param originalText - Original OCR text that may contain typos
|
|
587
|
-
* @param altText - Reference text from alternate OCR for comparison
|
|
588
|
-
* @param options - Configuration options for alignment and selection
|
|
589
|
-
* @returns Corrected text with typos fixed
|
|
714
|
+
* Standardizes standalone Hijri symbol ه to هـ when following Arabic-Indic or Western digits
|
|
715
|
+
* @param text - Input text to process
|
|
716
|
+
* @returns Text with standardized Hijri symbols
|
|
590
717
|
*/
|
|
591
|
-
declare const
|
|
592
|
-
|
|
718
|
+
declare const standardizeHijriSymbol: (text: string) => string;
|
|
719
|
+
/**
|
|
720
|
+
* Standardizes standalone اه to اهـ when appearing as a whole word
|
|
721
|
+
* @param text - Input text to process
|
|
722
|
+
* @returns Text with standardized intaha (اهـ) symbols
|
|
723
|
+
*/
|
|
724
|
+
declare const standardizeIntahaSymbol: (text: string) => string;
|
|
593
725
|
|
|
594
|
-
export { BRACKETS, CLOSE_BRACKETS, type CharacterError, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent,
|
|
726
|
+
export { BRACKETS, CLOSE_BRACKETS, type CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type SanitizeBase, type SanitizeOptions, type SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|