baburchi 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Ragaeeb Haq
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,262 @@
1
+ # baburchi
2
+
3
+ [![wakatime](https://wakatime.com/badge/user/a0b906ce-b8e7-4463-8bce-383238df6d4b/project/84c1b69b-fd5f-4e84-9c10-545c723a0fa9.svg)](https://wakatime.com/badge/user/a0b906ce-b8e7-4463-8bce-383238df6d4b/project/84c1b69b-fd5f-4e84-9c10-545c723a0fa9)
4
+ ![Bun](https://img.shields.io/badge/Bun-%23000000.svg?style=for-the-badge&logo=bun&logoColor=white)
5
+ [![Node.js CI](https://github.com/ragaeeb/baburchi/actions/workflows/build.yml/badge.svg)](https://github.com/ragaeeb/baburchi/actions/workflows/build.yml)
6
+ ![GitHub License](https://img.shields.io/github/license/ragaeeb/baburchi)
7
+ ![GitHub Release](https://img.shields.io/github/v/release/ragaeeb/baburchi)
8
+ [![codecov](https://codecov.io/gh/ragaeeb/baburchi/graph/badge.svg?token=R3BOH5KVXM)](https://codecov.io/gh/ragaeeb/baburchi)
9
+ [![Size](https://deno.bundlejs.com/badge?q=baburchi@latest&badge=detailed)](https://bundlejs.com/?q=baburchi%40latest)
10
+ ![typescript](https://badgen.net/badge/icon/typescript?icon=typescript&label&color=blue)
11
+ ![npm](https://img.shields.io/npm/dm/baburchi)
12
+ ![GitHub issues](https://img.shields.io/github/issues/ragaeeb/baburchi)
13
+ ![GitHub stars](https://img.shields.io/github/stars/ragaeeb/baburchi?style=social)
14
+
15
+ A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms.
16
+
17
+ ## Features
18
+
19
+ - 🧠 **Intelligent Text Alignment**: Uses the Needleman-Wunsch algorithm for optimal text sequence alignment
20
+ - 🔤 **Arabic Text Specialization**: Advanced normalization and diacritics handling for Arabic text
21
+ - 📝 **Footnote Management**: Smart handling of embedded and standalone footnotes
22
+ - ⚡ **High Performance**: Space-optimized Levenshtein distance with O(min(m,n)) space complexity
23
+ - 🎯 **Special Symbol Preservation**: Configurable preservation of religious symbols and honorifics
24
+ - 🔧 **Flexible Configuration**: Customizable similarity thresholds and typo symbols
25
+ - 📦 **Zero Dependencies**: Pure TypeScript implementation with no external dependencies
26
+ - 🌐 **Universal Compatibility**: Works in Node.js, Bun, and modern browsers
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ # Using npm
32
+ npm install baburchi
33
+
34
+ # Using yarn
35
+ yarn add baburchi
36
+
37
+ # Using pnpm
38
+ pnpm add baburchi
39
+
40
+ # Using bun
41
+ bun add baburchi
42
+ ```
43
+
44
+ ## Quick Start
45
+
46
+ ```typescript
47
+ import { fixTypo } from 'baburchi';
48
+
49
+ // Basic usage with Arabic text
50
+ const originalText = 'محمد صلى الله عليه وسلم رسول الله';
51
+ const correctedText = 'محمد ﷺ رسول الله';
52
+ const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
53
+
54
+ const result = fixTypo(originalText, correctedText, { typoSymbols });
55
+ console.log(result); // 'محمد صلى الله عليه ﷺ رسول الله'
56
+ ```
57
+
58
+ ## API Reference
59
+
60
+ ### `fixTypo(original, correction, options)`
61
+
62
+ The main function for correcting typos using text alignment.
63
+
64
+ **Parameters:**
65
+
66
+ - `original` (string): The original OCR text that may contain typos
67
+ - `correction` (string): The reference text for comparison
68
+ - `options` (object): Configuration options
69
+
70
+ **Options:**
71
+
72
+ - `typoSymbols` (string[], required): Array of special symbols to preserve
73
+ - `similarityThreshold` (number, optional): Threshold for token alignment (default: 0.6)
74
+ - `highSimilarityThreshold` (number, optional): Threshold for duplicate detection (default: 0.8)
75
+
76
+ **Returns:** Corrected text string
77
+
78
+ ### `processTextAlignment(originalText, altText, options)`
79
+
80
+ Low-level function for advanced text processing with full configuration control.
81
+
82
+ **Parameters:**
83
+
84
+ - `originalText` (string): Original text to process
85
+ - `altText` (string): Reference text for alignment
86
+ - `options` (FixTypoOptions): Complete configuration object
87
+
88
+ ## Usage Examples
89
+
90
+ ### Basic Arabic Text Correction
91
+
92
+ ```typescript
93
+ import { fixTypo } from 'baburchi';
94
+
95
+ const original = 'النص الأصلي مع أخطاء إملائية';
96
+ const reference = 'النص الأصلي مع أخطاء إملائية';
97
+ const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
98
+
99
+ const corrected = fixTypo(original, reference, { typoSymbols });
100
+ ```
101
+
102
+ ### Handling Religious Symbols
103
+
104
+ ```typescript
105
+ import { fixTypo } from 'baburchi';
106
+
107
+ // OCR might split religious phrases
108
+ const ocrText = 'محمد صلى الله عليه وسلم خير الأنام';
109
+ const referenceText = 'محمد ﷺ خير الأنام';
110
+
111
+ const result = fixTypo(ocrText, referenceText, {
112
+ typoSymbols: ['ﷺ', '﷽', 'ﷻ'],
113
+ similarityThreshold: 0.7,
114
+ });
115
+
116
+ console.log(result); // 'محمد صلى الله عليه ﷺ خير الأنام'
117
+ ```
118
+
119
+ ### Custom Similarity Thresholds
120
+
121
+ ```typescript
122
+ import { fixTypo } from 'baburchi';
123
+
124
+ const result = fixTypo(original, reference, {
125
+ typoSymbols: ['ﷺ'],
126
+ similarityThreshold: 0.8, // Stricter alignment
127
+ highSimilarityThreshold: 0.95, // Very strict duplicate detection
128
+ });
129
+ ```
130
+
131
+ ### Advanced Usage with Full Configuration
132
+
133
+ ```typescript
134
+ import { processTextAlignment } from 'baburchi';
135
+
136
+ const options = {
137
+ typoSymbols: ['ﷺ', '﷽', 'ﷻ'],
138
+ similarityThreshold: 0.7,
139
+ highSimilarityThreshold: 0.9,
140
+ };
141
+
142
+ const result = processTextAlignment('Original text with typos', 'Reference text for correction', options);
143
+ ```
144
+
145
+ ### Footnote Handling
146
+
147
+ ```typescript
148
+ import { fixTypo } from 'baburchi';
149
+
150
+ // Handles embedded and standalone footnotes intelligently
151
+ const textWithFootnotes = 'النص (١) مع الحواشي (٢)أخرجه البخاري';
152
+ const reference = 'النص (١) مع الحواشي (٢)';
153
+
154
+ const corrected = fixTypo(textWithFootnotes, reference, {
155
+ typoSymbols: [],
156
+ });
157
+ // Result preserves footnote formatting
158
+ ```
159
+
160
+ ## Algorithm Overview
161
+
162
+ Baburchi uses the **Needleman-Wunsch global sequence alignment algorithm** to optimally align text tokens:
163
+
164
+ 1. **Tokenization**: Text is split into tokens while preserving special symbols
165
+ 2. **Normalization**: Arabic text is normalized by removing diacritics and tatweel marks
166
+ 3. **Alignment**: Tokens are aligned using dynamic programming with custom scoring
167
+ 4. **Selection**: Best tokens are selected based on similarity and special rules
168
+ 5. **Post-processing**: Duplicates are removed and footnotes are fused
169
+
170
+ ### Scoring System
171
+
172
+ - **Perfect Match** (+2): Identical tokens after normalization
173
+ - **Soft Match** (+1): High similarity or contains typo symbols
174
+ - **Mismatch** (-2): Dissimilar tokens
175
+ - **Gap Penalty** (-1): Insertion or deletion
176
+
177
+ ## Performance
178
+
179
+ - **Time Complexity**: O(m×n) for alignment, where m and n are token sequence lengths
180
+ - **Space Complexity**: O(min(m,n)) for the Levenshtein distance (space-optimized dynamic programming); the token alignment keeps a full O(m×n) scoring matrix for backtracking
181
+ - **Memory Efficient**: Levenshtein distance keeps only two rows in memory instead of a full character matrix
182
+
183
+ ## Browser Support
184
+
185
+ Baburchi works in all modern environments:
186
+
187
+ - ✅ Node.js 18+
188
+ - ✅ Bun 1.0+
189
+ - ✅ Modern browsers (ES2020+)
190
+ - ✅ Deno (with npm compatibility)
191
+
192
+ ## TypeScript Support
193
+
194
+ Baburchi is written in TypeScript and provides full type definitions:
195
+
196
+ ```typescript
197
+ import type { FixTypoOptions } from 'baburchi';
198
+
199
+ const options: FixTypoOptions = {
200
+ typoSymbols: ['ﷺ'],
201
+ similarityThreshold: 0.7,
202
+ highSimilarityThreshold: 0.9,
203
+ };
204
+ ```
205
+
206
+ ## Utilities
207
+
208
+ The library also exports utility functions for advanced use cases:
209
+
210
+ ```typescript
211
+ import { calculateSimilarity, normalizeArabicText, tokenizeText, alignTokenSequences } from 'baburchi';
212
+
213
+ // Calculate similarity between two strings
214
+ const similarity = calculateSimilarity('hello', 'helo'); // 0.8
215
+
216
+ // Normalize Arabic text
217
+ const normalized = normalizeArabicText('اَلسَّلَامُ'); // 'السلام'
218
+
219
+ // Tokenize with symbol preservation
220
+ const tokens = tokenizeText('محمد ﷺ رسول', ['ﷺ']); // ['محمد', 'ﷺ', 'رسول']
221
+ ```
222
+
223
+ ## Contributing
224
+
225
+ Contributions are welcome. Please ensure your contributions adhere to the coding standards and include relevant tests.
226
+
227
+ ### Development Setup
228
+
229
+ 1. Fork the repository
230
+ 2. Install dependencies: `bun install` (requires [Bun](https://bun.sh/))
231
+ 3. Make your changes
232
+ 4. Run tests: `bun test`
233
+ 5. Run linting: `bun run lint`
234
+ 6. Submit a pull request
235
+
236
+ ### Running Tests
237
+
238
+ ```bash
239
+ # Run tests with coverage
240
+ bun test --coverage
241
+
242
+ # Run tests in watch mode
243
+ bun test --watch
244
+ ```
245
+
246
+ ## Changelog
247
+
248
+ See [CHANGELOG.md](./CHANGELOG.md) for release history.
249
+
250
+ ## License
251
+
252
+ `baburchi` is released under the MIT License. See the [LICENSE.md](./LICENSE.md) file for more details.
253
+
254
+ ## Author
255
+
256
+ Ragaeeb Haq
257
+
258
+ - GitHub: [@ragaeeb](https://github.com/ragaeeb)
259
+
260
+ ---
261
+
262
+ Built with ❤️ using TypeScript and Bun. Optimized for Arabic text processing and OCR post-processing.
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Configuration options for fixing typos in OCR text using alignment algorithms.
3
+ * These options control how text tokens are compared, aligned, and merged during typo correction.
4
+ */
5
+ type FixTypoOptions = {
6
+ /**
7
+ * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
8
+ * Used in post-processing to eliminate redundant tokens that are nearly identical.
9
+ * Should typically be higher than similarityThreshold to catch only very similar duplicates.
10
+ * @default 0.8
11
+ * @example 0.95 // Removes tokens that are 95% or more similar
12
+ */
13
+ readonly highSimilarityThreshold: number;
14
+ /**
15
+ * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
16
+ * Higher values require closer matches, lower values are more permissive.
17
+ * Used in the Needleman-Wunsch alignment algorithm for token matching.
18
+ * @default 0.6
19
+ * @example 0.8 // Requires 80% similarity for token alignment
20
+ */
21
+ readonly similarityThreshold: number;
22
+ /**
23
+ * Array of special symbols that should be preserved during typo correction.
24
+ * These symbols (like honorifics or religious markers) take precedence in token selection.
25
+ * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
26
+ */
27
+ readonly typoSymbols: string[];
28
+ };
29
+
30
+ /**
31
+ * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
32
+ * The Levenshtein distance is the minimum number of single-character edits (insertions,
33
+ * deletions, or substitutions) required to change one string into another.
34
+ *
35
+ * @param textA - First string to compare
36
+ * @param textB - Second string to compare
37
+ * @returns Minimum edit distance between the two strings
38
+ * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
39
+ * @example
40
+ * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
41
+ * calculateLevenshteinDistance('', 'hello') // Returns 5
42
+ */
43
+ declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
44
+ /**
45
+ * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
46
+ * Uses Levenshtein distance normalized by the length of the longer string.
47
+ * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.
48
+ *
49
+ * @param textA - First string to compare
50
+ * @param textB - Second string to compare
51
+ * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
52
+ * @example
53
+ * calculateSimilarity('hello', 'hello') // Returns 1.0
54
+ * calculateSimilarity('hello', 'help') // Returns 0.6
55
+ */
56
+ declare const calculateSimilarity: (textA: string, textB: string) => number;
57
+ /**
58
+ * Checks if two texts are similar after Arabic normalization.
59
+ * Normalizes both texts by removing diacritics and decorative elements,
60
+ * then compares their similarity against the provided threshold.
61
+ *
62
+ * @param textA - First text to compare
63
+ * @param textB - Second text to compare
64
+ * @param threshold - Similarity threshold (0.0 to 1.0); defaults to 0.6 when omitted
65
+ * @returns True if normalized texts meet the similarity threshold
66
+ * @example
67
+ * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
68
+ */
69
+ declare const areSimilarAfterNormalization: (textA: string, textB: string, threshold?: number) => boolean;
70
+ /**
71
+ * Calculates alignment score for two tokens in sequence alignment.
72
+ * Uses different scoring criteria: perfect match after normalization gets highest score,
73
+ * typo symbols or highly similar tokens get soft match score, mismatches get penalty.
74
+ *
75
+ * @param tokenA - First token to score
76
+ * @param tokenB - Second token to score
77
+ * @param typoSymbols - Array of special symbols that get preferential treatment
78
+ * @param similarityThreshold - Threshold for considering tokens highly similar
79
+ * @returns Alignment score (higher is better match)
80
+ * @example
81
+ * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
82
+ * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns -2 (similarity 0.6 is below the 0.8 threshold)
83
+ */
84
+ declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
85
+ type AlignedTokenPair = [null | string, null | string];
86
+ type AlignmentCell = {
87
+ direction: 'diagonal' | 'left' | 'up' | null;
88
+ score: number;
89
+ };
90
+ /**
91
+ * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
92
+ * Follows the directional indicators in the matrix to build the sequence of aligned
93
+ * token pairs from the Needleman-Wunsch algorithm.
94
+ *
95
+ * @param matrix - Scoring matrix with directional information from alignment
96
+ * @param tokensA - First sequence of tokens
97
+ * @param tokensB - Second sequence of tokens
98
+ * @returns Array of aligned token pairs, where null indicates a gap
99
+ * @throws Error if invalid alignment direction is encountered
100
+ */
101
+ declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[], tokensB: string[]) => AlignedTokenPair[];
102
+ /**
103
+ * Performs global sequence alignment using the Needleman-Wunsch algorithm.
104
+ * Aligns two token sequences to find the optimal pairing that maximizes
105
+ * the total alignment score, handling insertions, deletions, and substitutions.
106
+ *
107
+ * @param tokensA - First sequence of tokens to align
108
+ * @param tokensB - Second sequence of tokens to align
109
+ * @param typoSymbols - Special symbols that affect scoring
110
+ * @param similarityThreshold - Threshold for high similarity scoring
111
+ * @returns Array of aligned token pairs, with null indicating gaps
112
+ * @example
113
+ * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
114
+ * // Returns [['a', 'a'], ['b', 'c']]
115
+ */
116
+ declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
117
+
118
+ declare const PATTERNS: {
119
+ arabicDigits: RegExp;
120
+ arabicLettersAndDigits: RegExp;
121
+ arabicPunctuationAndWhitespace: RegExp;
122
+ diacritics: RegExp;
123
+ footnoteEmbedded: RegExp;
124
+ footnoteStandalone: RegExp;
125
+ tatweel: RegExp;
126
+ whitespace: RegExp;
127
+ };
128
+ /**
129
+ * Normalizes Arabic text by removing diacritics and tatweel marks, then trimming surrounding whitespace.
130
+ * This normalization enables better text comparison by focusing on core characters
131
+ * while ignoring decorative elements that don't affect meaning.
132
+ *
133
+ * @param text - Arabic text to normalize
134
+ * @returns Trimmed text with diacritics and tatweel removed
135
+ * @example
136
+ * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
137
+ */
138
+ declare const normalizeArabicText: (text: string) => string;
139
+ /**
140
+ * Extracts the first sequence of Arabic or Western digits from text.
141
+ * Used primarily for footnote number comparison to match related footnote elements.
142
+ *
143
+ * @param text - Text containing digits to extract
144
+ * @returns First digit sequence found, or empty string if none found
145
+ * @example
146
+ * extractDigits('(٥)أخرجه البخاري') // Returns '٥'
147
+ * extractDigits('See note (123)') // Returns '123'
148
+ */
149
+ declare const extractDigits: (text: string) => string;
150
+ /**
151
+ * Tokenizes text into individual words while preserving special symbols.
152
+ * Adds spacing around preserved symbols to ensure they
153
+ * are tokenized separately, then splits on whitespace.
154
+ *
155
+ * @param text - Text to tokenize
156
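+ * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens (each entry is compiled as a regular expression, so regex metacharacters must be escaped by the caller)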
157
+ * @returns Array of tokens, or empty array if input is empty/whitespace
158
+ * @example
159
+ * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
160
+ */
161
+ declare const tokenizeText: (text: string, preserveSymbols?: string[]) => string[];
162
+ /**
163
+ * Handles fusion of standalone and embedded footnotes during token processing.
164
+ * Detects patterns where standalone footnotes should be merged with embedded ones
165
+ * or where trailing standalone footnotes should be skipped.
166
+ *
167
+ * @param result - Current result array being built
168
+ * @param previousToken - The previous token in the sequence
169
+ * @param currentToken - The current token being processed
170
+ * @returns True if the current token was handled (fused or skipped), false otherwise
171
+ * @example
172
+ * // (٥) + (٥)أخرجه → result gets (٥)أخرجه
173
+ * // (٥)أخرجه + (٥) → (٥) is skipped
174
+ */
175
+ declare const handleFootnoteFusion: (result: string[], previousToken: string, currentToken: string) => boolean;
176
+ /**
177
+ * Handles selection logic for tokens with embedded footnotes during alignment.
178
+ * Prefers tokens that contain embedded footnotes over plain text, and among
179
+ * tokens with embedded footnotes, prefers the shorter one.
180
+ *
181
+ * @param tokenA - First token to compare
182
+ * @param tokenB - Second token to compare
183
+ * @returns Array containing selected token(s), or null if no special handling needed
184
+ * @example
185
+ * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
186
+ * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
187
+ */
188
+ declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null | string[];
189
+ /**
190
+ * Handles selection logic for standalone footnote tokens during alignment.
191
+ * Manages cases where one or both tokens are standalone footnotes, preserving
192
+ * both tokens when one is a footnote and the other is regular text.
193
+ *
194
+ * @param tokenA - First token to compare
195
+ * @param tokenB - Second token to compare
196
+ * @returns Array containing selected token(s), or null if no special handling needed
197
+ * @example
198
+ * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
199
+ * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one; the first token wins on a tie)
200
+ */
201
+ declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
202
+
203
+ /**
204
+ * Processes text alignment between original and alternate OCR results to fix typos.
205
+ * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
206
+ * then selects the best tokens and performs post-processing.
207
+ *
208
+ * @param originalText - Original OCR text that may contain typos
209
+ * @param altText - Reference text from alternate OCR for comparison
210
+ * @param options - Configuration options for alignment and selection
211
+ * @returns Corrected text with typos fixed
212
+ */
213
+ declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
214
+ declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
215
+
216
+ export { PATTERNS, alignTokenSequences, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, extractDigits, fixTypo, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, normalizeArabicText, processTextAlignment, tokenizeText };
package/dist/index.js ADDED
@@ -0,0 +1,2 @@
1
+ var u={arabicDigits:/[0-9\u0660-\u0669]+/,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,diacritics:/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,tatweel:/\u0640/g,whitespace:/\s+/},d=t=>t.replace(u.tatweel,"").replace(u.diacritics,"").trim(),x=t=>{let n=t.match(u.arabicDigits);return n?n[0]:""},A=(t,n=[])=>{let e=t;for(let r of n){let i=new RegExp(r,"g");e=e.replace(i,` ${r} `)}return e.trim().split(u.whitespace).filter(Boolean)},T=(t,n,e)=>{let r=u.footnoteStandalone.test(n),i=u.footnoteEmbedded.test(e),s=u.footnoteStandalone.test(e),l=u.footnoteEmbedded.test(n),o=x(n),c=x(e);return r&&i&&o===c?(t[t.length-1]=e,!0):!!(l&&s&&o===c)},y=(t,n)=>{let e=u.footnoteEmbedded.test(t),r=u.footnoteEmbedded.test(n);return e&&!r?[t]:r&&!e?[n]:e&&r?[t.length<=n.length?t:n]:null},E=(t,n)=>{let e=u.footnoteStandalone.test(t),r=u.footnoteStandalone.test(n);return e&&!r?[t,n]:r&&!e?[n,t]:e&&r?[t.length<=n.length?t:n]:null};var f={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},C=(t,n)=>{let e=t.length,r=n.length;if(e===0)return r;if(r===0)return e;let[i,s]=e<=r?[t,n]:[n,t],l=i.length,o=s.length,c=Array.from({length:l+1},(a,g)=>g);for(let a=1;a<=o;a++){let g=[a];for(let m=1;m<=l;m++){let b=s[a-1]===i[m-1]?0:1,h=Math.min(c[m]+1,g[m-1]+1,c[m-1]+b);g.push(h)}c=g}return c[l]},p=(t,n)=>{let e=Math.max(t.length,n.length)||1,r=C(t,n);return(e-r)/e},P=(t,n,e=.6)=>{let r=d(t),i=d(n);return p(r,i)>=e},M=(t,n,e,r)=>{let i=d(t),s=d(n);if(i===s)return f.PERFECT_MATCH;let l=e.includes(t)||e.includes(n),o=p(i,s)>=r;return l||o?f.SOFT_MATCH:f.MISMATCH_PENALTY},z=(t,n,e)=>{let r=[],i=n.length,s=e.length;for(;i>0||s>0;)switch(t[i][s].direction){case"diagonal":r.push([n[--i],e[--s]]);break;case"left":r.push([null,e[--s]]);break;case"up":r.push([n[--i],null]);break;default:throw new Error("Invalid 
alignment direction")}return r.reverse()},F=(t,n,e,r)=>{let i=t.length,s=n.length,l=Array.from({length:i+1},()=>Array.from({length:s+1},()=>({direction:null,score:0})));for(let o=1;o<=i;o++)l[o][0]={direction:"up",score:o*f.GAP_PENALTY};for(let o=1;o<=s;o++)l[0][o]={direction:"left",score:o*f.GAP_PENALTY};for(let o=1;o<=i;o++)for(let c=1;c<=s;c++){let a=M(t[o-1],n[c-1],e,r),g=l[o-1][c-1].score+a,m=l[o-1][c].score+f.GAP_PENALTY,b=l[o][c-1].score+f.GAP_PENALTY,h=Math.max(g,m,b),S="left";h===g?S="diagonal":h===m&&(S="up"),l[o][c]={direction:S,score:h}}return z(l,t,n)};var L=(t,n,{similarityThreshold:e,typoSymbols:r})=>{if(t===null)return[n];if(n===null)return[t];if(d(t)===d(n))return[t];let i=y(t,n);if(i)return i;let s=E(t,n);if(s)return s;if(r.includes(t)||r.includes(n)){let a=r.find(g=>g===t||g===n);return a?[a]:[t]}let l=d(t),o=d(n);return[p(l,o)>e?t:n]},_=(t,n)=>{if(t.length===0)return t;let e=[];for(let r of t){if(e.length===0){e.push(r);continue}let i=e.at(-1);if(P(i,r,n)){r.length<i.length&&(e[e.length-1]=r);continue}T(e,i,r)||e.push(r)}return e},D=(t,n,e)=>{let r=A(t,e.typoSymbols),i=A(n,e.typoSymbols),l=F(r,i,e.typoSymbols,e.similarityThreshold).flatMap(([c,a])=>L(c,a,e));return _(l,e.highSimilarityThreshold).join(" ")},I=(t,n,{highSimilarityThreshold:e=.8,similarityThreshold:r=.6,typoSymbols:i})=>D(t,n,{highSimilarityThreshold:e,similarityThreshold:r,typoSymbols:i});export{u as PATTERNS,F as alignTokenSequences,P as areSimilarAfterNormalization,z as backtrackAlignment,M as calculateAlignmentScore,C as calculateLevenshteinDistance,p as calculateSimilarity,x as extractDigits,I as fixTypo,T as handleFootnoteFusion,y as handleFootnoteSelection,E as handleStandaloneFootnotes,d as normalizeArabicText,D as processTextAlignment,A as tokenizeText};
2
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/textUtils.ts","../src/similarity.ts","../src/index.ts"],"sourcesContent":["export const PATTERNS = {\n arabicDigits: /[0-9\\u0660-\\u0669]+/,\n arabicLettersAndDigits: /[0-9\\u0621-\\u063A\\u0641-\\u064A\\u0660-\\u0669]+/g,\n arabicPunctuationAndWhitespace: /[\\s\\u060C\\u061B\\u061F\\u06D4]+/,\n diacritics: /[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]/g,\n footnoteEmbedded: /\\([0-9\\u0660-\\u0669]+\\)/,\n footnoteStandalone: /^\\(?[0-9\\u0660-\\u0669]+\\)?[،.]?$/,\n tatweel: /\\u0640/g,\n whitespace: /\\s+/,\n};\n\n/**\n * Normalizes Arabic text by removing diacritics, and tatweel marks.\n * This normalization enables better text comparison by focusing on core characters\n * while ignoring decorative elements that don't affect meaning.\n *\n * @param text - Arabic text to normalize\n * @returns Normalized text with diacritics, tatweel, and basic tags removed\n * @example\n * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'\n */\nexport const normalizeArabicText = (text: string): string => {\n return text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();\n};\n\n/**\n * Extracts the first sequence of Arabic or Western digits from text.\n * Used primarily for footnote number comparison to match related footnote elements.\n *\n * @param text - Text containing digits to extract\n * @returns First digit sequence found, or empty string if none found\n * @example\n * extractDigits('(٥)أخرجه البخاري') // Returns '٥'\n * extractDigits('See note (123)') // Returns '123'\n */\nexport const extractDigits = (text: string): string => {\n const match = text.match(PATTERNS.arabicDigits);\n return match ? 
match[0] : '';\n};\n\n/**\n * Tokenizes text into individual words while preserving special symbols.\n * Removes HTML tags, adds spacing around preserved symbols to ensure they\n * are tokenized separately, then splits on whitespace.\n *\n * @param text - Text to tokenize\n * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens\n * @returns Array of tokens, or empty array if input is empty/whitespace\n * @example\n * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']\n */\nexport const tokenizeText = (text: string, preserveSymbols: string[] = []): string[] => {\n let processedText = text;\n\n // Add spaces around each preserve symbol to ensure they're tokenized separately\n for (const symbol of preserveSymbols) {\n const symbolRegex = new RegExp(symbol, 'g');\n processedText = processedText.replace(symbolRegex, ` ${symbol} `);\n }\n\n return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);\n};\n\n/**\n * Handles fusion of standalone and embedded footnotes during token processing.\n * Detects patterns where standalone footnotes should be merged with embedded ones\n * or where trailing standalone footnotes should be skipped.\n *\n * @param result - Current result array being built\n * @param previousToken - The previous token in the sequence\n * @param currentToken - The current token being processed\n * @returns True if the current token was handled (fused or skipped), false otherwise\n * @example\n * // (٥) + (٥)أخرجه → result gets (٥)أخرجه\n * // (٥)أخرجه + (٥) → (٥) is skipped\n */\nexport const handleFootnoteFusion = (result: string[], previousToken: string, currentToken: string): boolean => {\n const prevIsStandalone = PATTERNS.footnoteStandalone.test(previousToken);\n const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);\n const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);\n const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);\n\n const 
prevDigits = extractDigits(previousToken);\n const currDigits = extractDigits(currentToken);\n\n // Replace standalone with fused version: (٥) + (٥)أخرجه → (٥)أخرجه\n if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {\n result[result.length - 1] = currentToken;\n return true;\n }\n\n // Skip trailing standalone: (٥)أخرجه + (٥) → (٥)أخرجه\n if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) {\n return true;\n }\n\n return false;\n};\n\n/**\n * Handles selection logic for tokens with embedded footnotes during alignment.\n * Prefers tokens that contain embedded footnotes over plain text, and among\n * tokens with embedded footnotes, prefers the shorter one.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']\n * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']\n */\nexport const handleFootnoteSelection = (tokenA: string, tokenB: string): null | string[] => {\n const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);\n const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);\n\n if (aHasEmbedded && !bHasEmbedded) return [tokenA];\n if (bHasEmbedded && !aHasEmbedded) return [tokenB];\n if (aHasEmbedded && bHasEmbedded) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n\n/**\n * Handles selection logic for standalone footnote tokens during alignment.\n * Manages cases where one or both tokens are standalone footnotes, preserving\n * both tokens when one is a footnote and the other is regular text.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']\n * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)\n */\nexport const handleStandaloneFootnotes = (tokenA: string, tokenB: string): null | string[] => {\n const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);\n const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);\n\n if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];\n if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];\n if (aIsFootnote && bIsFootnote) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n","import { normalizeArabicText } from './textUtils';\n\n// Alignment scoring constants\nconst ALIGNMENT_SCORES = {\n GAP_PENALTY: -1,\n MISMATCH_PENALTY: -2,\n PERFECT_MATCH: 2,\n SOFT_MATCH: 1,\n};\n\n/**\n * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.\n * The Levenshtein distance is the minimum number of single-character edits (insertions,\n * deletions, or substitutions) required to change one string into another.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Minimum edit distance between the two strings\n * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths\n * @example\n * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3\n * calculateLevenshteinDistance('', 'hello') // Returns 5\n */\nexport const calculateLevenshteinDistance = (textA: string, textB: string): number => {\n const lengthA = textA.length;\n const lengthB = textB.length;\n\n if (lengthA === 0) {\n return lengthB;\n }\n\n if (lengthB === 0) {\n return lengthA;\n }\n\n // Use shorter string for the array to optimize space\n const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];\n const shortLen = shorter.length;\n const longLen = longer.length;\n\n let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);\n\n for (let i = 1; i <= longLen; i++) {\n const currentRow = [i];\n\n for (let j = 1; j <= shortLen; j++) {\n const substitutionCost = longer[i - 1] === shorter[j - 1] ? 
0 : 1;\n const minCost = Math.min(\n previousRow[j] + 1, // deletion\n currentRow[j - 1] + 1, // insertion\n previousRow[j - 1] + substitutionCost, // substitution\n );\n currentRow.push(minCost);\n }\n\n previousRow = currentRow;\n }\n\n return previousRow[shortLen];\n};\n\n/**\n * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.\n * Uses Levenshtein distance normalized by the length of the longer string.\n * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)\n * @example\n * calculateSimilarity('hello', 'hello') // Returns 1.0\n * calculateSimilarity('hello', 'help') // Returns 0.6\n */\nexport const calculateSimilarity = (textA: string, textB: string): number => {\n const maxLength = Math.max(textA.length, textB.length) || 1;\n const distance = calculateLevenshteinDistance(textA, textB);\n return (maxLength - distance) / maxLength;\n};\n\n/**\n * Checks if two texts are similar after Arabic normalization.\n * Normalizes both texts by removing diacritics and decorative elements,\n * then compares their similarity against the provided threshold.\n *\n * @param textA - First text to compare\n * @param textB - Second text to compare\n * @param threshold - Similarity threshold (0.0 to 1.0)\n * @returns True if normalized texts meet the similarity threshold\n * @example\n * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true\n */\nexport const areSimilarAfterNormalization = (textA: string, textB: string, threshold: number = 0.6): boolean => {\n const normalizedA = normalizeArabicText(textA);\n const normalizedB = normalizeArabicText(textB);\n return calculateSimilarity(normalizedA, normalizedB) >= threshold;\n};\n\n/**\n * Calculates alignment score for two tokens in sequence alignment.\n * Uses different scoring 
criteria: perfect match after normalization gets highest score,\n * typo symbols or highly similar tokens get soft match score, mismatches get penalty.\n *\n * @param tokenA - First token to score\n * @param tokenB - Second token to score\n * @param typoSymbols - Array of special symbols that get preferential treatment\n * @param similarityThreshold - Threshold for considering tokens highly similar\n * @returns Alignment score (higher is better match)\n * @example\n * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)\n * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity\n */\nexport const calculateAlignmentScore = (\n tokenA: string,\n tokenB: string,\n typoSymbols: string[],\n similarityThreshold: number,\n): number => {\n const normalizedA = normalizeArabicText(tokenA);\n const normalizedB = normalizeArabicText(tokenB);\n\n // Perfect match after normalization\n if (normalizedA === normalizedB) {\n return ALIGNMENT_SCORES.PERFECT_MATCH;\n }\n\n // Check if either token is a typo symbol or high similarity\n const isTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);\n const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;\n\n if (isTypoSymbol || isHighlySimilar) {\n return ALIGNMENT_SCORES.SOFT_MATCH;\n }\n\n return ALIGNMENT_SCORES.MISMATCH_PENALTY;\n};\n\ntype AlignedTokenPair = [null | string, null | string];\n\ntype AlignmentCell = {\n direction: 'diagonal' | 'left' | 'up' | null;\n score: number;\n};\n\n/**\n * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.\n * Follows the directional indicators in the matrix to build the sequence of aligned\n * token pairs from the Needleman-Wunsch algorithm.\n *\n * @param matrix - Scoring matrix with directional information from alignment\n * @param tokensA - First sequence of tokens\n * @param tokensB - Second sequence of tokens\n * @returns Array of aligned 
token pairs, where null indicates a gap\n * @throws Error if invalid alignment direction is encountered\n */\nexport const backtrackAlignment = (\n matrix: AlignmentCell[][],\n tokensA: string[],\n tokensB: string[],\n): AlignedTokenPair[] => {\n const alignment: AlignedTokenPair[] = [];\n let i = tokensA.length;\n let j = tokensB.length;\n\n while (i > 0 || j > 0) {\n const currentCell = matrix[i][j];\n\n switch (currentCell.direction) {\n case 'diagonal':\n alignment.push([tokensA[--i], tokensB[--j]]);\n break;\n case 'left':\n alignment.push([null, tokensB[--j]]);\n break;\n case 'up':\n alignment.push([tokensA[--i], null]);\n break;\n default:\n throw new Error('Invalid alignment direction');\n }\n }\n\n return alignment.reverse();\n};\n\n/**\n * Performs global sequence alignment using the Needleman-Wunsch algorithm.\n * Aligns two token sequences to find the optimal pairing that maximizes\n * the total alignment score, handling insertions, deletions, and substitutions.\n *\n * @param tokensA - First sequence of tokens to align\n * @param tokensB - Second sequence of tokens to align\n * @param typoSymbols - Special symbols that affect scoring\n * @param similarityThreshold - Threshold for high similarity scoring\n * @returns Array of aligned token pairs, with null indicating gaps\n * @example\n * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)\n * // Returns [['a', 'a'], ['b', 'c']]\n */\nexport const alignTokenSequences = (\n tokensA: string[],\n tokensB: string[],\n typoSymbols: string[],\n similarityThreshold: number,\n): AlignedTokenPair[] => {\n const lengthA = tokensA.length;\n const lengthB = tokensB.length;\n\n // Initialize scoring matrix\n const scoringMatrix: AlignmentCell[][] = Array.from({ length: lengthA + 1 }, () =>\n Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),\n );\n\n // Initialize first row and column\n for (let i = 1; i <= lengthA; i++) {\n scoringMatrix[i][0] = { direction: 'up', score: i * 
ALIGNMENT_SCORES.GAP_PENALTY };\n }\n for (let j = 1; j <= lengthB; j++) {\n scoringMatrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n\n // Fill scoring matrix\n for (let i = 1; i <= lengthA; i++) {\n for (let j = 1; j <= lengthB; j++) {\n const alignmentScore = calculateAlignmentScore(\n tokensA[i - 1],\n tokensB[j - 1],\n typoSymbols,\n similarityThreshold,\n );\n\n const diagonalScore = scoringMatrix[i - 1][j - 1].score + alignmentScore;\n const upScore = scoringMatrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;\n const leftScore = scoringMatrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;\n\n const bestScore = Math.max(diagonalScore, upScore, leftScore);\n let bestDirection: 'diagonal' | 'left' | 'up' = 'left';\n\n if (bestScore === diagonalScore) {\n bestDirection = 'diagonal';\n } else if (bestScore === upScore) {\n bestDirection = 'up';\n }\n\n scoringMatrix[i][j] = { direction: bestDirection, score: bestScore };\n }\n }\n\n // Backtrack to build alignment\n return backtrackAlignment(scoringMatrix, tokensA, tokensB);\n};\n","import type { FixTypoOptions } from './types';\n\nimport { alignTokenSequences, areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport {\n handleFootnoteFusion,\n handleFootnoteSelection,\n handleStandaloneFootnotes,\n normalizeArabicText,\n tokenizeText,\n} from './textUtils';\n\n/**\n * Selects the best token(s) from an aligned pair during typo correction.\n * Uses various heuristics including normalization, footnote handling, typo symbols,\n * and similarity scores to determine which token(s) to keep.\n *\n * @param originalToken - Token from the original OCR text (may be null)\n * @param altToken - Token from the alternative OCR text (may be null)\n * @param options - Configuration options including typo symbols and similarity threshold\n * @returns Array of selected tokens (usually contains one token, but may contain multiple)\n */\nconst selectBestTokens = (\n 
originalToken: null | string,\n altToken: null | string,\n { similarityThreshold, typoSymbols }: FixTypoOptions,\n): string[] => {\n // Handle missing tokens\n if (originalToken === null) {\n return [altToken!];\n }\n if (altToken === null) {\n return [originalToken];\n }\n\n // Preserve original if same after normalization (keeps diacritics)\n if (normalizeArabicText(originalToken) === normalizeArabicText(altToken)) {\n return [originalToken];\n }\n\n // Handle embedded footnotes\n const result = handleFootnoteSelection(originalToken, altToken);\n if (result) return result;\n\n // Handle standalone footnotes\n const footnoteResult = handleStandaloneFootnotes(originalToken, altToken);\n if (footnoteResult) return footnoteResult;\n\n // Handle typo symbols - prefer the symbol itself\n if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {\n const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);\n return typoSymbol ? [typoSymbol] : [originalToken];\n }\n\n // Choose based on similarity\n const normalizedOriginal = normalizeArabicText(originalToken);\n const normalizedAlt = normalizeArabicText(altToken);\n const similarity = calculateSimilarity(normalizedOriginal, normalizedAlt);\n\n return [similarity > similarityThreshold ? originalToken : altToken];\n};\n\n/**\n * Removes duplicate tokens and handles footnote fusion in post-processing.\n * Identifies and removes tokens that are highly similar while preserving\n * important variations. 
Also handles special cases like footnote merging.\n *\n * @param tokens - Array of tokens to process\n * @param highSimilarityThreshold - Threshold for detecting duplicates (0.0 to 1.0)\n * @returns Array of tokens with duplicates removed and footnotes fused\n */\nconst removeDuplicateTokens = (tokens: string[], highSimilarityThreshold: number): string[] => {\n if (tokens.length === 0) {\n return tokens;\n }\n\n const result: string[] = [];\n\n for (const currentToken of tokens) {\n if (result.length === 0) {\n result.push(currentToken);\n continue;\n }\n\n const previousToken = result.at(-1)!;\n\n // Handle ordinary echoes (similar tokens)\n if (areSimilarAfterNormalization(previousToken, currentToken, highSimilarityThreshold)) {\n // Keep the shorter version\n if (currentToken.length < previousToken.length) {\n result[result.length - 1] = currentToken;\n }\n continue;\n }\n\n // Handle footnote fusion cases\n if (handleFootnoteFusion(result, previousToken, currentToken)) {\n continue;\n }\n\n result.push(currentToken);\n }\n\n return result;\n};\n\n/**\n * Processes text alignment between original and alternate OCR results to fix typos.\n * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,\n * then selects the best tokens and performs post-processing.\n *\n * @param originalText - Original OCR text that may contain typos\n * @param altText - Reference text from alternate OCR for comparison\n * @param options - Configuration options for alignment and selection\n * @returns Corrected text with typos fixed\n */\nexport const processTextAlignment = (originalText: string, altText: string, options: FixTypoOptions): string => {\n const originalTokens = tokenizeText(originalText, options.typoSymbols);\n const altTokens = tokenizeText(altText, options.typoSymbols);\n\n // Align token sequences\n const alignedPairs = alignTokenSequences(\n originalTokens,\n altTokens,\n options.typoSymbols,\n options.similarityThreshold,\n );\n\n // Select best tokens 
from each aligned pair\n const mergedTokens = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));\n\n // Remove duplicates and handle post-processing\n const finalTokens = removeDuplicateTokens(mergedTokens, options.highSimilarityThreshold);\n\n return finalTokens.join(' ');\n};\n\nexport const fixTypo = (\n original: string,\n correction: string,\n {\n highSimilarityThreshold = 0.8,\n similarityThreshold = 0.6,\n typoSymbols,\n }: Partial<FixTypoOptions> & Pick<FixTypoOptions, 'typoSymbols'>,\n) => {\n return processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });\n};\n\nexport * from './similarity';\nexport * from './textUtils';\n"],"mappings":"AAAO,IAAMA,EAAW,CACpB,aAAc,sBACd,uBAAwB,iDACxB,+BAAgC,gCAChC,WAAY,mDACZ,iBAAkB,0BAClB,mBAAoB,mCACpB,QAAS,UACT,WAAY,KAChB,EAYaC,EAAuBC,GACzBA,EAAK,QAAQF,EAAS,QAAS,EAAE,EAAE,QAAQA,EAAS,WAAY,EAAE,EAAE,KAAK,EAavEG,EAAiBD,GAAyB,CACnD,IAAME,EAAQF,EAAK,MAAMF,EAAS,YAAY,EAC9C,OAAOI,EAAQA,EAAM,CAAC,EAAI,EAC9B,EAaaC,EAAe,CAACH,EAAcI,EAA4B,CAAC,IAAgB,CACpF,IAAIC,EAAgBL,EAGpB,QAAWM,KAAUF,EAAiB,CAClC,IAAMG,EAAc,IAAI,OAAOD,EAAQ,GAAG,EAC1CD,EAAgBA,EAAc,QAAQE,EAAa,IAAID,CAAM,GAAG,CACpE,CAEA,OAAOD,EAAc,KAAK,EAAE,MAAMP,EAAS,UAAU,EAAE,OAAO,OAAO,CACzE,EAeaU,EAAuB,CAACC,EAAkBC,EAAuBC,IAAkC,CAC5G,IAAMC,EAAmBd,EAAS,mBAAmB,KAAKY,CAAa,EACjEG,EAAkBf,EAAS,iBAAiB,KAAKa,CAAY,EAC7DG,EAAmBhB,EAAS,mBAAmB,KAAKa,CAAY,EAChEI,EAAkBjB,EAAS,iBAAiB,KAAKY,CAAa,EAE9DM,EAAaf,EAAcS,CAAa,EACxCO,EAAahB,EAAcU,CAAY,EAG7C,OAAIC,GAAoBC,GAAmBG,IAAeC,GACtDR,EAAOA,EAAO,OAAS,CAAC,EAAIE,EACrB,IAIP,GAAAI,GAAmBD,GAAoBE,IAAeC,EAK9D,EAcaC,EAA0B,CAACC,EAAgBC,IAAoC,CACxF,IAAMC,EAAevB,EAAS,iBAAiB,KAAKqB,CAAM,EACpDG,EAAexB,EAAS,iBAAiB,KAAKsB,CAAM,EAE1D,OAAIC,GAAgB,CAACC,EAAqB,CAACH,CAAM,EAC7CG,GAAgB,CAACD,EAAqB,CAACD,CAAM,EAC7CC,GAAgBC,EACT,CAACH,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,EAcaG,EAA4B,CAACJ,EAAgBC,IAAoC,CAC1F,IAAMI,EAAc1B,EAAS,mBAAmB,KAAKqB,CAAM,EACrDM,EAAc3B,EAAS,mBAAmB,KAAKsB,CAAM,EAE
3D,OAAII,GAAe,CAACC,EAAoB,CAACN,EAAQC,CAAM,EACnDK,GAAe,CAACD,EAAoB,CAACJ,EAAQD,CAAM,EACnDK,GAAeC,EACR,CAACN,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,EChJA,IAAMM,EAAmB,CACrB,YAAa,GACb,iBAAkB,GAClB,cAAe,EACf,WAAY,CAChB,EAeaC,EAA+B,CAACC,EAAeC,IAA0B,CAClF,IAAMC,EAAUF,EAAM,OAChBG,EAAUF,EAAM,OAEtB,GAAIC,IAAY,EACZ,OAAOC,EAGX,GAAIA,IAAY,EACZ,OAAOD,EAIX,GAAM,CAACE,EAASC,CAAM,EAAIH,GAAWC,EAAU,CAACH,EAAOC,CAAK,EAAI,CAACA,EAAOD,CAAK,EACvEM,EAAWF,EAAQ,OACnBG,EAAUF,EAAO,OAEnBG,EAAc,MAAM,KAAK,CAAE,OAAQF,EAAW,CAAE,EAAG,CAACG,EAAGC,IAAUA,CAAK,EAE1E,QAASC,EAAI,EAAGA,GAAKJ,EAASI,IAAK,CAC/B,IAAMC,EAAa,CAACD,CAAC,EAErB,QAASE,EAAI,EAAGA,GAAKP,EAAUO,IAAK,CAChC,IAAMC,EAAmBT,EAAOM,EAAI,CAAC,IAAMP,EAAQS,EAAI,CAAC,EAAI,EAAI,EAC1DE,EAAU,KAAK,IACjBP,EAAYK,CAAC,EAAI,EACjBD,EAAWC,EAAI,CAAC,EAAI,EACpBL,EAAYK,EAAI,CAAC,EAAIC,CACzB,EACAF,EAAW,KAAKG,CAAO,CAC3B,CAEAP,EAAcI,CAClB,CAEA,OAAOJ,EAAYF,CAAQ,CAC/B,EAcaU,EAAsB,CAAChB,EAAeC,IAA0B,CACzE,IAAMgB,EAAY,KAAK,IAAIjB,EAAM,OAAQC,EAAM,MAAM,GAAK,EACpDiB,EAAWnB,EAA6BC,EAAOC,CAAK,EAC1D,OAAQgB,EAAYC,GAAYD,CACpC,EAcaE,EAA+B,CAACnB,EAAeC,EAAemB,EAAoB,KAAiB,CAC5G,IAAMC,EAAcC,EAAoBtB,CAAK,EACvCuB,EAAcD,EAAoBrB,CAAK,EAC7C,OAAOe,EAAoBK,EAAaE,CAAW,GAAKH,CAC5D,EAgBaI,EAA0B,CACnCC,EACAC,EACAC,EACAC,IACS,CACT,IAAMP,EAAcC,EAAoBG,CAAM,EACxCF,EAAcD,EAAoBI,CAAM,EAG9C,GAAIL,IAAgBE,EAChB,OAAOzB,EAAiB,cAI5B,IAAM+B,EAAeF,EAAY,SAASF,CAAM,GAAKE,EAAY,SAASD,CAAM,EAC1EI,EAAkBd,EAAoBK,EAAaE,CAAW,GAAKK,EAEzE,OAAIC,GAAgBC,EACThC,EAAiB,WAGrBA,EAAiB,gBAC5B,EAoBaiC,EAAqB,CAC9BC,EACAC,EACAC,IACqB,CACrB,IAAMC,EAAgC,CAAC,EACnC,EAAIF,EAAQ,OACZpB,EAAIqB,EAAQ,OAEhB,KAAO,EAAI,GAAKrB,EAAI,GAGhB,OAFoBmB,EAAO,CAAC,EAAEnB,CAAC,EAEX,UAAW,CAC3B,IAAK,WACDsB,EAAU,KAAK,CAACF,EAAQ,EAAE,CAAC,EAAGC,EAAQ,EAAErB,CAAC,CAAC,CAAC,EAC3C,MACJ,IAAK,OACDsB,EAAU,KAAK,CAAC,KAAMD,EAAQ,EAAErB,CAAC,CAAC,CAAC,EACnC,MACJ,IAAK,KACDsB,EAAU,KAAK,CAACF,EAAQ,EAAE,CAAC,EAAG,IAAI,CAAC,EACnC,MACJ,QACI,MAAM,IAAI,MAAM,6BAA6B,CACrD,CAGJ,OAAOE,EAAU,QAAQ,CAC7B,EAgBaC,EAAsB,CAC/BH,EACAC,EACAP,EACAC,IACqB,CACrB,IAAM1B,EAAU+B,EAAQ,O
AClB9B,EAAU+B,EAAQ,OAGlBG,EAAmC,MAAM,KAAK,CAAE,OAAQnC,EAAU,CAAE,EAAG,IACzE,MAAM,KAAK,CAAE,OAAQC,EAAU,CAAE,EAAG,KAAO,CAAE,UAAW,KAAM,MAAO,CAAE,EAAE,CAC7E,EAGA,QAASQ,EAAI,EAAGA,GAAKT,EAASS,IAC1B0B,EAAc1B,CAAC,EAAE,CAAC,EAAI,CAAE,UAAW,KAAM,MAAOA,EAAIb,EAAiB,WAAY,EAErF,QAASe,EAAI,EAAGA,GAAKV,EAASU,IAC1BwB,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW,OAAQ,MAAOA,EAAIf,EAAiB,WAAY,EAIvF,QAASa,EAAI,EAAGA,GAAKT,EAASS,IAC1B,QAASE,EAAI,EAAGA,GAAKV,EAASU,IAAK,CAC/B,IAAMyB,EAAiBd,EACnBS,EAAQtB,EAAI,CAAC,EACbuB,EAAQrB,EAAI,CAAC,EACbc,EACAC,CACJ,EAEMW,EAAgBF,EAAc1B,EAAI,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQyB,EACpDE,EAAUH,EAAc1B,EAAI,CAAC,EAAEE,CAAC,EAAE,MAAQf,EAAiB,YAC3D2C,EAAYJ,EAAc1B,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQf,EAAiB,YAE7D4C,EAAY,KAAK,IAAIH,EAAeC,EAASC,CAAS,EACxDE,EAA4C,OAE5CD,IAAcH,EACdI,EAAgB,WACTD,IAAcF,IACrBG,EAAgB,MAGpBN,EAAc1B,CAAC,EAAEE,CAAC,EAAI,CAAE,UAAW8B,EAAe,MAAOD,CAAU,CACvE,CAIJ,OAAOX,EAAmBM,EAAeJ,EAASC,CAAO,CAC7D,ECpOA,IAAMU,EAAmB,CACrBC,EACAC,EACA,CAAE,oBAAAC,EAAqB,YAAAC,CAAY,IACxB,CAEX,GAAIH,IAAkB,KAClB,MAAO,CAACC,CAAS,EAErB,GAAIA,IAAa,KACb,MAAO,CAACD,CAAa,EAIzB,GAAII,EAAoBJ,CAAa,IAAMI,EAAoBH,CAAQ,EACnE,MAAO,CAACD,CAAa,EAIzB,IAAMK,EAASC,EAAwBN,EAAeC,CAAQ,EAC9D,GAAII,EAAQ,OAAOA,EAGnB,IAAME,EAAiBC,EAA0BR,EAAeC,CAAQ,EACxE,GAAIM,EAAgB,OAAOA,EAG3B,GAAIJ,EAAY,SAASH,CAAa,GAAKG,EAAY,SAASF,CAAQ,EAAG,CACvE,IAAMQ,EAAaN,EAAY,KAAMO,GAAWA,IAAWV,GAAiBU,IAAWT,CAAQ,EAC/F,OAAOQ,EAAa,CAACA,CAAU,EAAI,CAACT,CAAa,CACrD,CAGA,IAAMW,EAAqBP,EAAoBJ,CAAa,EACtDY,EAAgBR,EAAoBH,CAAQ,EAGlD,MAAO,CAFYY,EAAoBF,EAAoBC,CAAa,EAEnDV,EAAsBF,EAAgBC,CAAQ,CACvE,EAWMa,EAAwB,CAACC,EAAkBC,IAA8C,CAC3F,GAAID,EAAO,SAAW,EAClB,OAAOA,EAGX,IAAMV,EAAmB,CAAC,EAE1B,QAAWY,KAAgBF,EAAQ,CAC/B,GAAIV,EAAO,SAAW,EAAG,CACrBA,EAAO,KAAKY,CAAY,EACxB,QACJ,CAEA,IAAMC,EAAgBb,EAAO,GAAG,EAAE,EAGlC,GAAIc,EAA6BD,EAAeD,EAAcD,CAAuB,EAAG,CAEhFC,EAAa,OAASC,EAAc,SACpCb,EAAOA,EAAO,OAAS,CAAC,EAAIY,GAEhC,QACJ,CAGIG,EAAqBf,EAAQa,EAAeD,CAAY,GAI5DZ,EAAO,KAAKY,CAAY,CAC5B,CAEA,OAAOZ,CACX,EAYagB,EAAuB,CAACC,EAAsBC,EAAiBC,IAAoC,CAC5G,IAAMC,EAAiBC,EAAaJ,EAAcE,EAAQ
,WAAW,EAC/DG,EAAYD,EAAaH,EAASC,EAAQ,WAAW,EAWrDI,EAReC,EACjBJ,EACAE,EACAH,EAAQ,YACRA,EAAQ,mBACZ,EAGkC,QAAQ,CAAC,CAACM,EAAUC,CAAG,IAAMhC,EAAiB+B,EAAUC,EAAKP,CAAO,CAAC,EAKvG,OAFoBV,EAAsBc,EAAcJ,EAAQ,uBAAuB,EAEpE,KAAK,GAAG,CAC/B,EAEaQ,EAAU,CACnBF,EACAG,EACA,CACI,wBAAAjB,EAA0B,GAC1B,oBAAAd,EAAsB,GACtB,YAAAC,CACJ,IAEOkB,EAAqBS,EAAUG,EAAY,CAAE,wBAAAjB,EAAyB,oBAAAd,EAAqB,YAAAC,CAAY,CAAC","names":["PATTERNS","normalizeArabicText","text","extractDigits","match","tokenizeText","preserveSymbols","processedText","symbol","symbolRegex","handleFootnoteFusion","result","previousToken","currentToken","prevIsStandalone","currHasEmbedded","currIsStandalone","prevHasEmbedded","prevDigits","currDigits","handleFootnoteSelection","tokenA","tokenB","aHasEmbedded","bHasEmbedded","handleStandaloneFootnotes","aIsFootnote","bIsFootnote","ALIGNMENT_SCORES","calculateLevenshteinDistance","textA","textB","lengthA","lengthB","shorter","longer","shortLen","longLen","previousRow","_","index","i","currentRow","j","substitutionCost","minCost","calculateSimilarity","maxLength","distance","areSimilarAfterNormalization","threshold","normalizedA","normalizeArabicText","normalizedB","calculateAlignmentScore","tokenA","tokenB","typoSymbols","similarityThreshold","isTypoSymbol","isHighlySimilar","backtrackAlignment","matrix","tokensA","tokensB","alignment","alignTokenSequences","scoringMatrix","alignmentScore","diagonalScore","upScore","leftScore","bestScore","bestDirection","selectBestTokens","originalToken","altToken","similarityThreshold","typoSymbols","normalizeArabicText","result","handleFootnoteSelection","footnoteResult","handleStandaloneFootnotes","typoSymbol","symbol","normalizedOriginal","normalizedAlt","calculateSimilarity","removeDuplicateTokens","tokens","highSimilarityThreshold","currentToken","previousToken","areSimilarAfterNormalization","handleFootnoteFusion","processTextAlignment","originalText","altText","options","originalTokens","tokenizeText","altTokens","mergedTokens","alignTokenSequen
ces","original","alt","fixTypo","correction"]}
package/package.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "name": "baburchi",
3
+ "version": "1.0.0",
4
+ "author": "Ragaeeb Haq",
5
+ "repository": {
6
+ "type": "git",
7
+ "url": "https://github.com/ragaeeb/baburchi.git"
8
+ },
9
+ "main": "dist/index.js",
10
+ "module": "dist/index.js",
11
+ "devDependencies": {
12
+ "@eslint/js": "^9.29.0",
13
+ "@types/bun": "^1.2.16",
14
+ "eslint": "^9.29.0",
15
+ "eslint-config-prettier": "^10.1.5",
16
+ "eslint-plugin-perfectionist": "^4.15.0",
17
+ "eslint-plugin-prettier": "^5.5.0",
18
+ "globals": "^16.2.0",
19
+ "prettier": "^3.5.3",
20
+ "semantic-release": "^24.2.5",
21
+ "tsup": "^8.5.0",
22
+ "typescript-eslint": "^8.34.1"
23
+ },
24
+ "bugs": {
25
+ "url": "https://github.com/ragaeeb/baburchi/issues"
26
+ },
27
+ "description": "A lightweight TypeScript library designed to fix typos in OCR post-processing.",
28
+ "engines": {
29
+ "bun": ">=1.2.16",
30
+ "node": ">=22.0.0"
31
+ },
32
+ "exports": {
33
+ ".": {
34
+ "import": "./dist/index.js",
35
+ "types": "./dist/index.d.ts"
36
+ }
37
+ },
38
+ "sideEffects": false,
39
+ "files": [
40
+ "dist/index.js",
41
+ "dist/index.js.map",
42
+ "dist/*.d.ts",
43
+ "LICENSE.md",
44
+ "README.md"
45
+ ],
46
+ "homepage": "https://github.com/ragaeeb/baburchi",
47
+ "keywords": [
48
+ "nodejs",
49
+ "ocr",
50
+ "formatting",
51
+ "typos",
52
+ "correction",
53
+ "paragraphs",
54
+ "text-processing",
55
+ "typescript"
56
+ ],
57
+ "license": "MIT",
58
+ "scripts": {
59
+ "build": "tsup",
60
+ "test": "bun test --coverage --coverage-reporter=lcov"
61
+ },
62
+ "source": "src/index.ts",
63
+ "type": "module",
64
+ "types": "dist/index.d.ts"
65
+ }