baburchi 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,12 +12,13 @@
12
12
  ![GitHub issues](https://img.shields.io/github/issues/ragaeeb/baburchi)
13
13
  ![GitHub stars](https://img.shields.io/github/stars/ragaeeb/baburchi?style=social)
14
14
 
15
- A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms.
15
+ A lightweight TypeScript library for intelligent OCR text post-processing, specializing in Arabic text with advanced typo correction using sequence alignment algorithms and comprehensive noise detection.
16
16
 
17
17
  ## Features
18
18
 
19
19
  - 🧠 **Intelligent Text Alignment**: Uses the Needleman-Wunsch algorithm for optimal text sequence alignment
20
20
  - 🔤 **Arabic Text Specialization**: Advanced normalization and diacritics handling for Arabic text
21
+ - 🧹 **Noise Detection**: Comprehensive Arabic text noise detection and OCR artifact identification
21
22
  - 📝 **Footnote Management**: Smart handling of embedded and standalone footnotes
22
23
  - ⚡ **High Performance**: Space-optimized algorithms with O(min(m,n)) space complexity
23
24
  - 🎯 **Special Symbol Preservation**: Configurable preservation of religious symbols and honorifics
@@ -44,15 +45,19 @@ bun add baburchi
44
45
  ## Quick Start
45
46
 
46
47
  ```typescript
47
- import { fixTypo } from 'baburchi';
48
+ import { fixTypo, isArabicTextNoise } from 'baburchi';
48
49
 
49
- // Basic usage with Arabic text
50
+ // Basic typo correction with Arabic text
50
51
  const originalText = 'محمد صلى الله عليه وسلم رسول الله';
51
52
  const correctedText = 'محمد ﷺ رسول الله';
52
53
  const typoSymbols = ['ﷺ', '﷽', 'ﷻ'];
53
54
 
54
55
  const result = fixTypo(originalText, correctedText, { typoSymbols });
55
56
  console.log(result); // 'محمد صلى الله عليه ﷺ رسول الله'
57
+
58
+ // Noise detection for OCR cleanup
59
+ const cleanText = isArabicTextNoise('السلام عليكم'); // false
60
+ const noiseText = isArabicTextNoise('---'); // true
56
61
  ```
57
62
 
58
63
  ## API Reference
@@ -203,6 +208,106 @@ const options: FixTypoOptions = {
203
208
  };
204
209
  ```
205
210
 
211
+ ## Text Segment Alignment
212
+
213
+ Baburchi provides specialized functionality for aligning split text segments back to their target lines. This is particularly useful when OCR has fragmented continuous text or poetry into separate segments that need to be reconstructed.
214
+
215
+ ### `alignTextSegments(targetLines, segmentLines)`
216
+
217
+ Aligns split text segments to match target lines by finding the best order and combining segments when necessary.
218
+
219
+ **Parameters:**
220
+
221
+ - `targetLines` (string[]): Array where each element is either a string to align against, or falsy to skip alignment
222
+ - `segmentLines` (string[]): Array of text segments that may represent split versions of target lines
223
+
224
+ **Returns:** Array of aligned text lines
225
+
226
+ #### Poetry Reconstruction Example
227
+
228
+ ```typescript
229
+ import { alignTextSegments } from 'baburchi';
230
+
231
+ // Target lines from a poetry collection
232
+ const targetLines = [
233
+ '', // Don't align - pass through as-is
234
+ 'ู‚ุฏ ู‚ูุฏูู‘ู… ุงู„ุนูŽุฌู’ุจู ุนู„ู‰ ุงู„ุฑูู‘ูˆูŽูŠุณ ูˆุดุงุฑู ุงู„ูˆู‡ุฏู ุฃุจุง ู‚ูุจูŠุณู',
235
+ 'ูˆุทุงูˆู„ ุงู„ุจู‚ู„ู ูุฑูˆุนูŽ ุงู„ู…ูŠู’ุณ ูˆู‡ุจุช ุงู„ุนู†ุฒ ู„ู‚ุฑุน ุงู„ุชูŠุณู',
236
+ 'ูˆุงุฏูŽู‘ุนุช ุงู„ุฑูˆู… ุฃุจู‹ุง ููŠ ู‚ูŠุณ ูˆุงุฎุชู„ุท ุงู„ู†ุงุณ ุงุฎุชู„ุงุท ุงู„ุญูŠุณู',
237
+ 'ุฅุฐ ู‚ุฑุง ุงู„ู‚ุงุถูŠ ุญู„ูŠู ุงู„ูƒูŠุณ ู…ุนุงู†ูŠ ุงู„ุดุนุฑ ุนู„ู‰ ุงู„ุนุจูŠุณูŠ',
238
+ '', // Don't align - pass through as-is
239
+ ];
240
+
241
+ // OCR segments (fragmented and possibly out of order)
242
+ const segmentLines = [
243
+ 'A', // Header/marker
244
+ 'ู‚ุฏ ู‚ูุฏูู‘ู… ุงู„ุนูŽุฌู’ุจู ุนู„ู‰ ุงู„ุฑูู‘ูˆูŽูŠุณ ูˆุดู€ุงุฑู ุงู„ูˆู‡ู€ุฏู ุฃุจู€ู€ุง ู‚ูุจูŠุณ',
245
+ 'ูˆุทุงูˆู„ ุงู„ุจู‚ู„ู ูุฑูˆุนูŽ ุงู„ู…ูŠู’ุณ',
246
+ 'ูˆู‡ุจุช ุงู„ุนู†ู€ุฒ ู„ู€ู‚ุฑุน ุงู„ุชู€ูŠุณ',
247
+ 'ูˆุงุฎุชู„ุท ุงู„ู†ุงุณ ุงุฎุชู„ุงุท ุงู„ุญูŠุณ',
248
+ 'ูˆุงุฏูŽู‘ุนุช ุงู„ุฑูˆู… ุฃุจู‹ุง ููŠ ู‚ูŠุณ',
249
+ 'ู…ุนู€ุงู†ูŠ ุงู„ุดุนุฑ ุนู„ู‰ ุงู„ุนู€ุจู€ูŠู€ู€ุณู€ูŠ',
250
+ 'ุฅุฐ ู‚ุฑุง ุงู„ู‚ุงุถูŠ ุญู„ูŠู ุงู„ูƒูŠุณ',
251
+ 'B', // Footer/marker
252
+ ];
253
+
254
+ const result = alignTextSegments(targetLines, segmentLines);
255
+ console.log(result);
256
+ // Output:
257
+ // [
258
+ // 'A',
259
+ // 'ู‚ุฏ ู‚ูุฏูู‘ู… ุงู„ุนูŽุฌู’ุจู ุนู„ู‰ ุงู„ุฑูู‘ูˆูŽูŠุณ ูˆุดู€ุงุฑู ุงู„ูˆู‡ู€ุฏู ุฃุจู€ู€ุง ู‚ูุจูŠุณ',
260
+ // 'ูˆุทุงูˆู„ ุงู„ุจู‚ู„ู ูุฑูˆุนูŽ ุงู„ู…ูŠู’ุณ ูˆู‡ุจุช ุงู„ุนู†ู€ุฒ ู„ู€ู‚ุฑุน ุงู„ุชู€ูŠุณ',
261
+ // 'ูˆุงุฏูŽู‘ุนุช ุงู„ุฑูˆู… ุฃุจู‹ุง ููŠ ู‚ูŠุณ ูˆุงุฎุชู„ุท ุงู„ู†ุงุณ ุงุฎุชู„ุงุท ุงู„ุญูŠุณ',
262
+ // 'ุฅุฐ ู‚ุฑุง ุงู„ู‚ุงุถูŠ ุญู„ูŠู ุงู„ูƒูŠุณ ู…ุนู€ุงู†ูŠ ุงู„ุดุนุฑ ุนู„ู‰ ุงู„ุนู€ุจู€ูŠู€ู€ุณู€ูŠ',
263
+ // 'B'
264
+ // ]
265
+ ```
266
+
267
+ #### Handling Reversed Segments
268
+
269
+ ```typescript
270
+ import { alignTextSegments } from 'baburchi';
271
+
272
+ // When OCR produces segments in wrong order
273
+ const targetLines = ['hello world goodbye'];
274
+ const segmentLines = ['goodbye', 'hello world'];
275
+
276
+ const result = alignTextSegments(targetLines, segmentLines);
277
+ console.log(result); // ['hello world goodbye']
278
+ ```
279
+
280
+ #### Mixed Alignment Scenarios
281
+
282
+ ```typescript
283
+ import { alignTextSegments } from 'baburchi';
284
+
285
+ // Some lines need alignment, others are one-to-one
286
+ const targetLines = ['', 'split line content', '']; // Empty strings = no alignment needed
287
+ const segmentLines = ['header', 'split line', 'content', 'footer'];
288
+
289
+ const result = alignTextSegments(targetLines, segmentLines);
290
+ console.log(result); // ['header', 'split line content', 'footer']
291
+ ```
292
+
293
+ ### How It Works
294
+
295
+ 1. **Target Processing**: For each target line that requires alignment (non-falsy), the algorithm:
296
+ - Finds the best combination of available segments that matches the target
297
+ - Uses similarity scoring to determine optimal segment ordering
298
+ - Combines segments when they form a better match together
299
+
300
+ 2. **One-to-One Mapping**: For falsy target lines (empty strings, null, undefined), segments are passed through directly
301
+
302
+ 3. **Remaining Segments**: Any segments not consumed during alignment are appended to the result
303
+
304
+ This function is particularly useful for:
305
+
306
+ - Reconstructing fragmented poetry or prose
307
+ - Aligning OCR segments with reference text
308
+ - Handling cases where text layout affects line ordering
309
+ - Processing documents where content has been split across multiple detection regions
310
+
206
311
  ## Utilities
207
312
 
208
313
  The library also exports utility functions for advanced use cases:
@@ -215,6 +320,7 @@ import {
215
320
  alignTokenSequences,
216
321
  hasInvalidFootnotes,
217
322
  correctReferences,
323
+ alignTextSegments,
218
324
  } from 'baburchi';
219
325
 
220
326
  // Calculate similarity between two strings
@@ -235,6 +341,219 @@ const lines = [
235
341
  { text: '() This is a footnote', isFootnote: true },
236
342
  ];
237
343
  const corrected = correctReferences(lines);
344
+
345
+ // Align fragmented text segments
346
+ const aligned = alignTextSegments(
347
+ ['target line one', '', 'target line three'],
348
+ ['segment1', 'segment2', 'segment3', 'segment4'],
349
+ );
350
+ ```
351
+
352
+ ## Noise Detection
353
+
354
+ Baburchi provides comprehensive noise detection capabilities specifically designed for Arabic OCR post-processing. These functions help identify and filter out OCR artifacts, formatting elements, and meaningless content commonly found in digitized Arabic documents.
355
+
356
+ ### `isArabicTextNoise(text)`
357
+
358
+ The main noise detection function that performs comprehensive analysis to identify unwanted OCR artifacts.
359
+
360
+ ```typescript
361
+ import { isArabicTextNoise } from 'baburchi';
362
+
363
+ // Detect formatting artifacts
364
+ console.log(isArabicTextNoise('---')); // true
365
+ console.log(isArabicTextNoise('...')); // true
366
+ console.log(isArabicTextNoise('!!!')); // true
367
+
368
+ // Detect OCR errors
369
+ console.log(isArabicTextNoise('ABC')); // true (uppercase-only pattern)
370
+ console.log(isArabicTextNoise('- 77')); // true (digit-dash combination)
371
+
372
+ // Valid Arabic content
373
+ console.log(isArabicTextNoise('السلام عليكم')); // false
374
+ console.log(isArabicTextNoise('محمد ﷺ')); // false
375
+ console.log(isArabicTextNoise('2023')); // false (substantial number)
376
+ ```
377
+
378
+ ### Character Analysis Functions
379
+
380
+ #### `analyzeCharacterStats(text)`
381
+
382
+ Analyzes character composition and frequency statistics for detailed text analysis.
383
+
384
+ ```typescript
385
+ import { analyzeCharacterStats } from 'baburchi';
386
+
387
+ const stats = analyzeCharacterStats('ู…ุฑุญุจุง 123!');
388
+ console.log(stats);
389
+ // {
390
+ // arabicCount: 5,
391
+ // digitCount: 3,
392
+ // latinCount: 0,
393
+ // spaceCount: 1,
394
+ // punctuationCount: 1,
395
+ // symbolCount: 0,
396
+ // charFreq: Map { 'ู…' => 1, 'ุฑ' => 1, 'ุญ' => 1, ... }
397
+ // }
398
+ ```
399
+
400
+ #### `hasExcessiveRepetition(charStats, textLength)`
401
+
402
+ Detects excessive character repetition that commonly indicates noise.
403
+
404
+ ```typescript
405
+ import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
406
+
407
+ const stats = analyzeCharacterStats('!!!!!');
408
+ console.log(hasExcessiveRepetition(stats, 5)); // true
409
+
410
+ const normalStats = analyzeCharacterStats('hello world');
411
+ console.log(hasExcessiveRepetition(normalStats, 11)); // false
412
+ ```
413
+
414
+ ### Pattern Detection Functions
415
+
416
+ #### `isBasicNoisePattern(text)`
417
+
418
+ Identifies text matching common noise patterns using regular expressions.
419
+
420
+ ```typescript
421
+ import { isBasicNoisePattern } from 'baburchi';
422
+
423
+ console.log(isBasicNoisePattern('---')); // true
424
+ console.log(isBasicNoisePattern('...')); // true
425
+ console.log(isBasicNoisePattern('ABC')); // true
426
+ console.log(isBasicNoisePattern('- 77')); // true
427
+ console.log(isBasicNoisePattern('hello world')); // false
428
+ ```
429
+
430
+ #### `isSpacingNoise(charStats, contentChars, textLength)`
431
+
432
+ Detects problematic spacing patterns that indicate OCR artifacts.
433
+
434
+ ```typescript
435
+ import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
436
+
437
+ const stats = analyzeCharacterStats(' a ');
438
+ const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
439
+ console.log(isSpacingNoise(stats, contentChars, 3)); // true
440
+
441
+ const normalStats = analyzeCharacterStats('hello world');
442
+ const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
443
+ console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
444
+ ```
445
+
446
+ ### Content Validation Functions
447
+
448
+ #### `isValidArabicContent(charStats, textLength)`
449
+
450
+ Validates whether Arabic content is substantial enough to be meaningful.
451
+
452
+ ```typescript
453
+ import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
454
+
455
+ const validStats = analyzeCharacterStats('ุงู„ุณู„ุงู… ุนู„ูŠูƒู…');
456
+ console.log(isValidArabicContent(validStats, 12)); // true
457
+
458
+ const shortStats = analyzeCharacterStats('ุต');
459
+ console.log(isValidArabicContent(shortStats, 1)); // false
460
+
461
+ const withDigitsStats = analyzeCharacterStats('ุต 5');
462
+ console.log(isValidArabicContent(withDigitsStats, 3)); // true
463
+ ```
464
+
465
+ #### `isNonArabicNoise(charStats, textLength, text)`
466
+
467
+ Determines if non-Arabic content should be classified as noise.
468
+
469
+ ```typescript
470
+ import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
471
+
472
+ const stats = analyzeCharacterStats('!!!');
473
+ console.log(isNonArabicNoise(stats, 3, '!!!')); // true
474
+
475
+ const validStats = analyzeCharacterStats('2023');
476
+ console.log(isNonArabicNoise(validStats, 4, '2023')); // false
477
+ ```
478
+
479
+ ### Noise Detection Use Cases
480
+
481
+ #### OCR Post-Processing Pipeline
482
+
483
+ ```typescript
484
+ import { isArabicTextNoise } from 'baburchi';
485
+
486
+ const ocrLines = ['ุงู„ุณู„ุงู… ุนู„ูŠูƒู… ูˆุฑุญู…ุฉ ุงู„ู„ู‡', '---', 'ู‡ุฐุง ุงู„ู†ุต ุตุญูŠุญ', 'ABC', '...', 'ู…ุญู…ุฏ ๏ทบ ุฑุณูˆู„ ุงู„ู„ู‡'];
487
+
488
+ const cleanLines = ocrLines.filter((line) => !isArabicTextNoise(line));
489
+ console.log(cleanLines);
490
+ // ['ุงู„ุณู„ุงู… ุนู„ูŠูƒู… ูˆุฑุญู…ุฉ ุงู„ู„ู‡', 'ู‡ุฐุง ุงู„ู†ุต ุตุญูŠุญ', 'ู…ุญู…ุฏ ๏ทบ ุฑุณูˆู„ ุงู„ู„ู‡']
491
+ ```
492
+
493
+ #### Document Quality Assessment
494
+
495
+ ```typescript
496
+ import { analyzeCharacterStats, isArabicTextNoise } from 'baburchi';
497
+
498
+ function assessDocumentQuality(text: string) {
499
+ const lines = text.split('\n');
500
+ const stats = {
501
+ totalLines: lines.length,
502
+ validLines: 0,
503
+ noiseLines: 0,
504
+ noisyContent: [] as string[],
505
+ };
506
+
507
+ for (const line of lines) {
508
+ if (isArabicTextNoise(line.trim())) {
509
+ stats.noiseLines++;
510
+ stats.noisyContent.push(line);
511
+ } else {
512
+ stats.validLines++;
513
+ }
514
+ }
515
+
516
+ return {
517
+ ...stats,
518
+ qualityRatio: stats.validLines / stats.totalLines,
519
+ needsCleaning: stats.validLines / stats.totalLines < 0.8,
520
+ };
521
+ }
522
+
523
+ const document = `ุงู„ุณู„ุงู… ุนู„ูŠูƒู…
524
+ ---
525
+ ู‡ุฐุง ู†ุต ุนุฑุจูŠ ุตุญูŠุญ
526
+ ABC
527
+ ุงู„ู†ู‡ุงูŠุฉ`;
528
+
529
+ const quality = assessDocumentQuality(document);
530
+ console.log(quality);
531
+ // { totalLines: 5, validLines: 3, noiseLines: 2, noisyContent: ['---', 'ABC'], qualityRatio: 0.6, needsCleaning: true }
532
+ ```
533
+
534
+ #### Batch Text Cleaning
535
+
536
+ ```typescript
537
+ import { isArabicTextNoise } from 'baburchi';
538
+
539
+ function cleanTextBatch(texts: string[]): { clean: string[]; noise: string[] } {
540
+ const result = { clean: [] as string[], noise: [] as string[] };
541
+
542
+ for (const text of texts) {
543
+ if (isArabicTextNoise(text)) {
544
+ result.noise.push(text);
545
+ } else {
546
+ result.clean.push(text);
547
+ }
548
+ }
549
+
550
+ return result;
551
+ }
552
+
553
+ const mixedTexts = ['ุงู„ุณู„ุงู… ุนู„ูŠูƒู…', '---', 'ู…ุฑุญุจุง', '!!!', '2023'];
554
+ const { clean, noise } = cleanTextBatch(mixedTexts);
555
+ console.log('Clean:', clean); // ['ุงู„ุณู„ุงู… ุนู„ูŠูƒู…', 'ู…ุฑุญุจุง', '2023']
556
+ console.log('Noise:', noise); // ['---', '!!!']
238
557
  ```
239
558
 
240
559
  ### Footnote Processing
@@ -446,4 +765,4 @@ Ragaeeb Haq
446
765
 
447
766
  ---
448
767
 
449
- Built with ❤️ using TypeScript and Bun. Optimized for Arabic text processing and OCR post-processing.
768
+ Built with ❤️ using TypeScript and Bun. Optimized for Arabic text processing, OCR post-processing, and noise detection.
package/dist/index.d.ts CHANGED
@@ -27,6 +27,20 @@ type FixTypoOptions = {
27
27
  readonly typoSymbols: string[];
28
28
  };
29
29
 
30
+ /**
31
+ * Aligns split text segments to match target lines by finding the best order.
32
+ *
33
+ * This function handles cases where text lines have been split into segments
34
+ * and need to be merged back together in the correct order. It compares
35
+ * different arrangements of the segments against target lines to find the
36
+ * best match based on similarity scores.
37
+ *
38
+ * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment
39
+ * @param segmentLines - Array of text segments that may represent split versions of target lines.
40
+ * @returns Array of aligned text lines
41
+ */
42
+ declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
43
+
30
44
  /**
31
45
  * Represents an error found when checking balance of quotes or brackets in text.
32
46
  */
@@ -200,6 +214,174 @@ type TextLine = {
200
214
  */
201
215
  declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
202
216
 
217
+ /**
218
+ * Character statistics for analyzing text content and patterns
219
+ */
220
+ type CharacterStats = {
221
+ /** Number of Arabic script characters in the text */
222
+ arabicCount: number;
223
+ /** Map of character frequencies for repetition analysis */
224
+ charFreq: Map<string, number>;
225
+ /** Number of digit characters (0-9) in the text */
226
+ digitCount: number;
227
+ /** Number of Latin alphabet characters (a-z, A-Z) in the text */
228
+ latinCount: number;
229
+ /** Number of punctuation characters in the text */
230
+ punctuationCount: number;
231
+ /** Number of whitespace characters in the text */
232
+ spaceCount: number;
233
+ /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
234
+ symbolCount: number;
235
+ };
236
+ /**
237
+ * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
238
+ * This function performs comprehensive analysis to identify patterns commonly associated
239
+ * with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.
240
+ *
241
+ * @param text - The input string to analyze for noise patterns
242
+ * @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content
243
+ *
244
+ * @example
245
+ * ```typescript
246
+ * import { isArabicTextNoise } from 'baburchi';
247
+ *
248
+ * console.log(isArabicTextNoise('---')); // true (formatting artifact)
249
+ * console.log(isArabicTextNoise('ุงู„ุณู„ุงู… ุนู„ูŠูƒู…')); // false (valid Arabic)
250
+ * console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
251
+ * ```
252
+ */
253
+ declare const isArabicTextNoise: (text: string) => boolean;
254
+ /**
255
+ * Analyzes character composition and frequency statistics for the input text.
256
+ * Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)
257
+ * and tracks character frequency for pattern analysis.
258
+ *
259
+ * @param text - The text string to analyze
260
+ * @returns CharacterStats object containing detailed character analysis
261
+ *
262
+ * @example
263
+ * ```typescript
264
+ * import { analyzeCharacterStats } from 'baburchi';
265
+ *
266
+ * const stats = analyzeCharacterStats('ู…ุฑุญุจุง 123!');
267
+ * console.log(stats.arabicCount); // 5
268
+ * console.log(stats.digitCount); // 3
269
+ * console.log(stats.symbolCount); // 1
270
+ * ```
271
+ */
272
+ declare function analyzeCharacterStats(text: string): CharacterStats;
273
+ /**
274
+ * Detects excessive repetition of specific characters that commonly indicate noise.
275
+ * Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,
276
+ * and underscores that often appear in OCR artifacts or formatting elements.
277
+ *
278
+ * @param charStats - Character statistics from analyzeCharacterStats
279
+ * @param textLength - Total length of the original text
280
+ * @returns true if excessive repetition is detected, false otherwise
281
+ *
282
+ * @example
283
+ * ```typescript
284
+ * import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
285
+ *
286
+ * const stats = analyzeCharacterStats('!!!!!');
287
+ * console.log(hasExcessiveRepetition(stats, 5)); // true
288
+ *
289
+ * const normalStats = analyzeCharacterStats('hello world');
290
+ * console.log(hasExcessiveRepetition(normalStats, 11)); // false
291
+ * ```
292
+ */
293
+ declare function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean;
294
+ /**
295
+ * Identifies text that matches common noise patterns using regular expressions.
296
+ * Detects patterns like repeated dashes, dot sequences, uppercase-only text,
297
+ * digit-dash combinations, and other formatting artifacts commonly found in OCR output.
298
+ *
299
+ * @param text - The text string to check against noise patterns
300
+ * @returns true if the text matches a basic noise pattern, false otherwise
301
+ *
302
+ * @example
303
+ * ```typescript
304
+ * import { isBasicNoisePattern } from 'baburchi';
305
+ *
306
+ * console.log(isBasicNoisePattern('---')); // true
307
+ * console.log(isBasicNoisePattern('...')); // true
308
+ * console.log(isBasicNoisePattern('ABC')); // true
309
+ * console.log(isBasicNoisePattern('- 77')); // true
310
+ * console.log(isBasicNoisePattern('hello world')); // false
311
+ * ```
312
+ */
313
+ declare function isBasicNoisePattern(text: string): boolean;
314
+ /**
315
+ * Determines if non-Arabic content should be classified as noise based on various heuristics.
316
+ * Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition
317
+ * to identify unwanted OCR artifacts or meaningless content.
318
+ *
319
+ * @param charStats - Character statistics from analyzeCharacterStats
320
+ * @param textLength - Total length of the original text
321
+ * @param text - The original text string for additional pattern matching
322
+ * @returns true if the content is likely noise, false if it appears to be valid content
323
+ *
324
+ * @example
325
+ * ```typescript
326
+ * import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
327
+ *
328
+ * const stats = analyzeCharacterStats('!!!');
329
+ * console.log(isNonArabicNoise(stats, 3, '!!!')); // true
330
+ *
331
+ * const validStats = analyzeCharacterStats('2023');
332
+ * console.log(isNonArabicNoise(validStats, 4, '2023')); // false
333
+ * ```
334
+ */
335
+ declare function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean;
336
+ /**
337
+ * Detects problematic spacing patterns that indicate noise or OCR artifacts.
338
+ * Identifies cases where spacing is excessive relative to content, or where
339
+ * single characters are surrounded by spaces in a way that suggests OCR errors.
340
+ *
341
+ * @param charStats - Character statistics from analyzeCharacterStats
342
+ * @param contentChars - Number of meaningful content characters (Arabic + Latin + digits)
343
+ * @param textLength - Total length of the original text
344
+ * @returns true if spacing patterns indicate noise, false otherwise
345
+ *
346
+ * @example
347
+ * ```typescript
348
+ * import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
349
+ *
350
+ * const stats = analyzeCharacterStats(' a ');
351
+ * const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
352
+ * console.log(isSpacingNoise(stats, contentChars, 3)); // true
353
+ *
354
+ * const normalStats = analyzeCharacterStats('hello world');
355
+ * const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
356
+ * console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
357
+ * ```
358
+ */
359
+ declare function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean;
360
+ /**
361
+ * Validates whether Arabic content is substantial enough to be considered meaningful.
362
+ * Uses character counts and text length to determine if Arabic text contains
363
+ * sufficient content or if it's likely to be a fragment or OCR artifact.
364
+ *
365
+ * @param charStats - Character statistics from analyzeCharacterStats
366
+ * @param textLength - Total length of the original text
367
+ * @returns true if the Arabic content appears valid, false if it's likely noise
368
+ *
369
+ * @example
370
+ * ```typescript
371
+ * import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
372
+ *
373
+ * const validStats = analyzeCharacterStats('ุงู„ุณู„ุงู… ุนู„ูŠูƒู…');
374
+ * console.log(isValidArabicContent(validStats, 12)); // true
375
+ *
376
+ * const shortStats = analyzeCharacterStats('ุต');
377
+ * console.log(isValidArabicContent(shortStats, 1)); // false
378
+ *
379
+ * const withDigitsStats = analyzeCharacterStats('ุต 5');
380
+ * console.log(isValidArabicContent(withDigitsStats, 3)); // true
381
+ * ```
382
+ */
383
+ declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
384
+
203
385
  /**
204
386
  * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
205
387
  * The Levenshtein distance is the minimum number of single-character edits (insertions,
@@ -292,6 +474,8 @@ declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSy
292
474
  * Collection of regex patterns used throughout the library for text processing
293
475
  */
294
476
  declare const PATTERNS: {
477
+ /** Matches Arabic characters across all Unicode blocks */
478
+ arabicCharacters: RegExp;
295
479
  /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
296
480
  arabicDigits: RegExp;
297
481
  /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
@@ -407,4 +591,4 @@ declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => nul
407
591
  declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
408
592
  declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
409
593
 
410
- export { BRACKETS, CLOSE_BRACKETS, type CharacterError, OPEN_BRACKETS, PATTERNS, alignTokenSequences, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasInvalidFootnotes, isBalanced, normalizeArabicText, processTextAlignment, tokenizeText };
594
+ export { BRACKETS, CLOSE_BRACKETS, type CharacterError, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, normalizeArabicText, processTextAlignment, tokenizeText };
package/dist/index.js CHANGED
@@ -1,3 +1,3 @@
1
- var u={arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,diacritics:/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[ุŒ.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,tatweel:/\u0640/g,whitespace:/\s+/},m=e=>e.replace(u.tatweel,"").replace(u.diacritics,"").trim(),E=e=>{let t=e.match(u.arabicDigits);return t?t[0]:""},S=(e,t=[])=>{let n=e;for(let r of t){let o=new RegExp(r,"g");n=n.replace(o,` ${r} `)}return n.trim().split(u.whitespace).filter(Boolean)},A=(e,t,n)=>{let r=u.footnoteStandalone.test(t),o=u.footnoteEmbedded.test(n),c=u.footnoteStandalone.test(n),a=u.footnoteEmbedded.test(t),s=E(t),i=E(n);return r&&o&&s===i?(e[e.length-1]=n,!0):!!(a&&c&&s===i)},F=(e,t)=>{let n=u.footnoteEmbedded.test(e),r=u.footnoteEmbedded.test(t);return n&&!r?[e]:r&&!n?[t]:n&&r?[e.length<=t.length?e:t]:null},C=(e,t)=>{let n=u.footnoteStandalone.test(e),r=u.footnoteStandalone.test(t);return n&&!r?[e,t]:r&&!n?[t,e]:n&&r?[e.length<=t.length?e:t]:null};var x={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},L=(e,t)=>{let n=e.length,r=t.length;if(n===0)return r;if(r===0)return n;let[o,c]=n<=r?[e,t]:[t,e],a=o.length,s=c.length,i=Array.from({length:a+1},(f,d)=>d);for(let f=1;f<=s;f++){let d=[f];for(let p=1;p<=a;p++){let b=c[f-1]===o[p-1]?0:1,l=Math.min(i[p]+1,d[p-1]+1,i[p-1]+b);d.push(l)}i=d}return i[a]},R=(e,t)=>{let n=Math.max(e.length,t.length)||1,r=L(e,t);return(n-r)/n},B=(e,t,n=.6)=>{let r=m(e),o=m(t);return R(r,o)>=n},z=(e,t,n,r)=>{let o=m(e),c=m(t);if(o===c)return x.PERFECT_MATCH;let a=n.includes(e)||n.includes(t),s=R(o,c)>=r;return 
a||s?x.SOFT_MATCH:x.MISMATCH_PENALTY},_=(e,t,n)=>{let r=[],o=t.length,c=n.length;for(;o>0||c>0;)switch(e[o][c].direction){case"diagonal":r.push([t[--o],n[--c]]);break;case"left":r.push([null,n[--c]]);break;case"up":r.push([t[--o],null]);break;default:throw new Error("Invalid alignment direction")}return r.reverse()},P=(e,t,n,r)=>{let o=e.length,c=t.length,a=Array.from({length:o+1},()=>Array.from({length:c+1},()=>({direction:null,score:0})));for(let s=1;s<=o;s++)a[s][0]={direction:"up",score:s*x.GAP_PENALTY};for(let s=1;s<=c;s++)a[0][s]={direction:"left",score:s*x.GAP_PENALTY};for(let s=1;s<=o;s++)for(let i=1;i<=c;i++){let f=z(e[s-1],t[i-1],n,r),d=a[s-1][i-1].score+f,p=a[s-1][i].score+x.GAP_PENALTY,b=a[s][i-1].score+x.GAP_PENALTY,l=Math.max(d,p,b),g="left";l===d?g="diagonal":l===p&&(g="up"),a[s][i]={direction:g,score:l}}return _(a,e,t)};var O=e=>{let t=[],n=0,r=-1;for(let c=0;c<e.length;c++)e[c]==='"'&&(n++,r=c);let o=n%2===0;return!o&&r!==-1&&t.push({char:'"',index:r,reason:"unmatched",type:"quote"}),{errors:t,isBalanced:o}},v={"\xAB":"\xBB","(":")","[":"]","{":"}"},D=new Set(["\xAB","(","[","{"]),H=new Set(["\xBB",")","]","}"]),I=e=>{let t=[],n=[];for(let r=0;r<e.length;r++){let o=e[r];if(D.has(o))n.push({char:o,index:r});else if(H.has(o)){let c=n.pop();c?v[c.char]!==o&&(t.push({char:c.char,index:c.index,reason:"mismatched",type:"bracket"}),t.push({char:o,index:r,reason:"mismatched",type:"bracket"})):t.push({char:o,index:r,reason:"unmatched",type:"bracket"})}}return n.forEach(({char:r,index:o})=>{t.push({char:r,index:o,reason:"unclosed",type:"bracket"})}),{errors:t,isBalanced:t.length===0}},N=e=>{let t=O(e),n=I(e);return{errors:[...t.errors,...n.errors].sort((r,o)=>r.index-o.index),isBalanced:t.isBalanced&&n.isBalanced}},Z=e=>{let t=[],n=e.split(`
2
- `),r=0;return n.forEach((o,c)=>{if(o.length>10){let a=N(o);a.isBalanced||a.errors.forEach(s=>{t.push({absoluteIndex:r+s.index,char:s.char,reason:s.reason,type:s.type})})}r+=o.length+(c<n.length-1?1:0)}),t},k=e=>O(e).isBalanced,ee=e=>I(e).isBalanced,te=e=>N(e).isBalanced;var q="()",V=e=>u.invalidReferenceRegex.test(e),Y=new Intl.NumberFormat("ar-SA"),G=e=>Y.format(e),T=e=>({1:"\u0661",9:"\u0669",".":"\u0660",O:"\u0665",o:"\u0665",V:"\u0667",v:"\u0667"})[e]||e,j=e=>{let t={"\u0660":"0","\u0661":"1","\u0662":"2","\u0663":"3","\u0664":"4","\u0665":"5","\u0666":"6","\u0667":"7","\u0668":"8","\u0669":"9"},n=e.replace(/[()]/g,""),r="";for(let c of n)r+=t[c];let o=parseInt(r,10);return isNaN(o)?0:o},w=e=>{let t=e.filter(s=>!s.isFootnote).flatMap(s=>s.text.match(u.arabicReferenceRegex)||[]),n=e.filter(s=>!s.isFootnote).flatMap(s=>s.text.match(u.ocrConfusedReferenceRegex)||[]),r=e.filter(s=>s.isFootnote).flatMap(s=>s.text.match(u.arabicFootnoteReferenceRegex)||[]),o=e.filter(s=>s.isFootnote).flatMap(s=>s.text.match(u.ocrConfusedFootnoteReferenceRegex)||[]),c=n.map(s=>s.replace(/[.1OV9]/g,i=>T(i))),a=o.map(s=>s.replace(/[.1OV9]/g,i=>T(i)));return{bodyReferences:[...t,...c],footnoteReferences:[...r,...a],ocrConfusedInBody:n,ocrConfusedInFootnotes:o}},K=(e,t)=>{if(e.some(c=>V(c.text)))return!0;let r=new Set(t.bodyReferences),o=new Set(t.footnoteReferences);if(r.size!==o.size)return!0;for(let c of r)if(!o.has(c))return!0;return!1},oe=e=>{let t=w(e);if(!K(e,t))return e;let n=e.map(l=>{let g=l.text,y=/\([.1OV9]+\)/g;return g=g.replace(y,h=>h.replace(/[.1OV9]/g,M=>T(M))),{...l,text:g}}),r=w(n),o=new Set(r.bodyReferences),c=new Set(r.footnoteReferences),a=[...new Set(r.bodyReferences)],s=[...new Set(r.footnoteReferences)],i=a.filter(l=>!c.has(l)),f=s.filter(l=>!o.has(l)),d=[...o,...c],b={count:(d.length>0?Math.max(0,...d.map(l=>j(l))):0)+1};return n.map(l=>{if(!l.text.includes(q))return l;let g=l.text;return g=g.replace(/\(\)/g,()=>{if(l.isFootnote){let h=i.shift();if(h)return 
h}else{let h=f.shift();if(h)return h}let y=`(${G(b.count)})`;return b.count++,y}),{...l,text:g}})};var Q=(e,t,{similarityThreshold:n,typoSymbols:r})=>{if(e===null)return[t];if(t===null)return[e];if(m(e)===m(t))return[e];let o=F(e,t);if(o)return o;let c=C(e,t);if(c)return c;if(r.includes(e)||r.includes(t)){let f=r.find(d=>d===e||d===t);return f?[f]:[e]}let a=m(e),s=m(t);return[R(a,s)>n?e:t]},$=(e,t)=>{if(e.length===0)return e;let n=[];for(let r of e){if(n.length===0){n.push(r);continue}let o=n.at(-1);if(B(o,r,t)){r.length<o.length&&(n[n.length-1]=r);continue}A(n,o,r)||n.push(r)}return n},U=(e,t,n)=>{let r=S(e,n.typoSymbols),o=S(t,n.typoSymbols),a=P(r,o,n.typoSymbols,n.similarityThreshold).flatMap(([i,f])=>Q(i,f,n));return $(a,n.highSimilarityThreshold).join(" ")},ie=(e,t,{highSimilarityThreshold:n=.8,similarityThreshold:r=.6,typoSymbols:o})=>U(e,t,{highSimilarityThreshold:n,similarityThreshold:r,typoSymbols:o});export{v as BRACKETS,H as CLOSE_BRACKETS,D as OPEN_BRACKETS,u as PATTERNS,P as alignTokenSequences,ee as areBracketsBalanced,k as areQuotesBalanced,B as areSimilarAfterNormalization,_ as backtrackAlignment,z as calculateAlignmentScore,L as calculateLevenshteinDistance,R as calculateSimilarity,N as checkBalance,oe as correctReferences,E as extractDigits,ie as fixTypo,Z as getUnbalancedErrors,A as handleFootnoteFusion,F as handleFootnoteSelection,C as handleStandaloneFootnotes,V as hasInvalidFootnotes,te as isBalanced,m as normalizeArabicText,U as processTextAlignment,S as tokenizeText};
1
+ var u={arabicCharacters:/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,diacritics:/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[ุŒ.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,tatweel:/\u0640/g,whitespace:/\s+/},g=e=>e.replace(u.tatweel,"").replace(u.diacritics,"").trim(),T=e=>{let t=e.match(u.arabicDigits);return t?t[0]:""},F=(e,t=[])=>{let r=e;for(let n of t){let o=new RegExp(n,"g");r=r.replace(o,` ${n} `)}return r.trim().split(u.whitespace).filter(Boolean)},S=(e,t,r)=>{let n=u.footnoteStandalone.test(t),o=u.footnoteEmbedded.test(r),s=u.footnoteStandalone.test(r),c=u.footnoteEmbedded.test(t),i=T(t),a=T(r);return n&&o&&i===a?(e[e.length-1]=r,!0):!!(c&&s&&i===a)},E=(e,t)=>{let r=u.footnoteEmbedded.test(e),n=u.footnoteEmbedded.test(t);return r&&!n?[e]:n&&!r?[t]:r&&n?[e.length<=t.length?e:t]:null},N=(e,t)=>{let r=u.footnoteStandalone.test(e),n=u.footnoteStandalone.test(t);return r&&!n?[e,t]:n&&!r?[t,e]:r&&n?[e.length<=t.length?e:t]:null};var b={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},I=(e,t)=>{let r=e.length,n=t.length;if(r===0)return n;if(n===0)return r;let[o,s]=r<=n?[e,t]:[t,e],c=o.length,i=s.length,a=Array.from({length:c+1},(f,d)=>d);for(let f=1;f<=i;f++){let d=[f];for(let p=1;p<=c;p++){let C=s[f-1]===o[p-1]?0:1,l=Math.min(a[p]+1,d[p-1]+1,a[p-1]+C);d.push(l)}a=d}return a[c]},x=(e,t)=>{let r=Math.max(e.length,t.length)||1,n=I(e,t);return(r-n)/r},R=(e,t,r=.6)=>{let n=g(e),o=g(t);return x(n,o)>=r},v=(e,t,r,n)=>{let o=g(e),s=g(t);if(o===s)return b.PERFECT_MATCH;let 
c=r.includes(e)||r.includes(t),i=x(o,s)>=n;return c||i?b.SOFT_MATCH:b.MISMATCH_PENALTY},_=(e,t,r)=>{let n=[],o=t.length,s=r.length;for(;o>0||s>0;)switch(e[o][s].direction){case"diagonal":n.push([t[--o],r[--s]]);break;case"left":n.push([null,r[--s]]);break;case"up":n.push([t[--o],null]);break;default:throw new Error("Invalid alignment direction")}return n.reverse()},B=(e,t,r,n)=>{let o=e.length,s=t.length,c=Array.from({length:o+1},()=>Array.from({length:s+1},()=>({direction:null,score:0})));for(let i=1;i<=o;i++)c[i][0]={direction:"up",score:i*b.GAP_PENALTY};for(let i=1;i<=s;i++)c[0][i]={direction:"left",score:i*b.GAP_PENALTY};for(let i=1;i<=o;i++)for(let a=1;a<=s;a++){let f=v(e[i-1],t[a-1],r,n),d=c[i-1][a-1].score+f,p=c[i-1][a].score+b.GAP_PENALTY,C=c[i][a-1].score+b.GAP_PENALTY,l=Math.max(d,p,C),m="left";l===d?m="diagonal":l===p&&(m="up"),c[i][a]={direction:m,score:l}}return _(c,e,t)};var ae=(e,t)=>{let r=[],n=0;for(let o of e){if(n>=t.length)break;if(o){let{result:s,segmentsConsumed:c}=q(o,t,n);s&&r.push(s),n+=c}else r.push(t[n]),n++}return n<t.length&&r.push(...t.slice(n)),r},$=(e,t,r)=>{let n=`${t} ${r}`,o=`${r} ${t}`,s=g(e),c=x(s,g(n)),i=x(s,g(o));return c>=i?n:o},q=(e,t,r)=>{let n=t[r];if(R(e,n))return{result:n,segmentsConsumed:1};let o=t[r],s=t[r+1];return!o||!s?o?{result:o,segmentsConsumed:1}:{result:"",segmentsConsumed:0}:{result:$(e,o,s),segmentsConsumed:2}};var P=e=>{let t=[],r=0,n=-1;for(let s=0;s<e.length;s++)e[s]==='"'&&(r++,n=s);let o=r%2===0;return!o&&n!==-1&&t.push({char:'"',index:n,reason:"unmatched",type:"quote"}),{errors:t,isBalanced:o}},D={"\xAB":"\xBB","(":")","[":"]","{":"}"},H=new Set(["\xAB","(","[","{"]),L=new Set(["\xBB",")","]","}"]),M=e=>{let t=[],r=[];for(let n=0;n<e.length;n++){let o=e[n];if(H.has(o))r.push({char:o,index:n});else if(L.has(o)){let 
s=r.pop();s?D[s.char]!==o&&(t.push({char:s.char,index:s.index,reason:"mismatched",type:"bracket"}),t.push({char:o,index:n,reason:"mismatched",type:"bracket"})):t.push({char:o,index:n,reason:"unmatched",type:"bracket"})}}return r.forEach(({char:n,index:o})=>{t.push({char:n,index:o,reason:"unclosed",type:"bracket"})}),{errors:t,isBalanced:t.length===0}},O=e=>{let t=P(e),r=M(e);return{errors:[...t.errors,...r.errors].sort((n,o)=>n.index-o.index),isBalanced:t.isBalanced&&r.isBalanced}},le=e=>{let t=[],r=e.split(`
2
+ `),n=0;return r.forEach((o,s)=>{if(o.length>10){let c=O(o);c.isBalanced||c.errors.forEach(i=>{t.push({absoluteIndex:n+i.index,char:i.char,reason:i.reason,type:i.type})})}n+=o.length+(s<r.length-1?1:0)}),t},fe=e=>P(e).isBalanced,de=e=>M(e).isBalanced,ge=e=>O(e).isBalanced;var V="()",j=e=>u.invalidReferenceRegex.test(e),Y=new Intl.NumberFormat("ar-SA"),G=e=>Y.format(e),A=e=>({1:"\u0661",9:"\u0669",".":"\u0660",O:"\u0665",o:"\u0665",V:"\u0667",v:"\u0667"})[e]||e,K=e=>{let t={"\u0660":"0","\u0661":"1","\u0662":"2","\u0663":"3","\u0664":"4","\u0665":"5","\u0666":"6","\u0667":"7","\u0668":"8","\u0669":"9"},r=e.replace(/[()]/g,""),n="";for(let s of r)n+=t[s];let o=parseInt(n,10);return isNaN(o)?0:o},w=e=>{let t=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(u.arabicReferenceRegex)||[]),r=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(u.ocrConfusedReferenceRegex)||[]),n=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(u.arabicFootnoteReferenceRegex)||[]),o=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(u.ocrConfusedFootnoteReferenceRegex)||[]),s=r.map(i=>i.replace(/[.1OV9]/g,a=>A(a))),c=o.map(i=>i.replace(/[.1OV9]/g,a=>A(a)));return{bodyReferences:[...t,...s],footnoteReferences:[...n,...c],ocrConfusedInBody:r,ocrConfusedInFootnotes:o}},Z=(e,t)=>{if(e.some(s=>j(s.text)))return!0;let n=new Set(t.bodyReferences),o=new Set(t.footnoteReferences);if(n.size!==o.size)return!0;for(let s of n)if(!o.has(s))return!0;return!1},he=e=>{let t=w(e);if(!Z(e,t))return e;let r=e.map(l=>{let m=l.text,y=/\([.1OV9]+\)/g;return m=m.replace(y,h=>h.replace(/[.1OV9]/g,z=>A(z))),{...l,text:m}}),n=w(r),o=new Set(n.bodyReferences),s=new Set(n.footnoteReferences),c=[...new Set(n.bodyReferences)],i=[...new Set(n.footnoteReferences)],a=c.filter(l=>!s.has(l)),f=i.filter(l=>!o.has(l)),d=[...o,...s],C={count:(d.length>0?Math.max(0,...d.map(l=>K(l))):0)+1};return r.map(l=>{if(!l.text.includes(V))return l;let m=l.text;return m=m.replace(/\(\)/g,()=>{if(l.isFootnote){let h=a.shift();if(h)return 
h}else{let h=f.shift();if(h)return h}let y=`(${G(C.count)})`;return C.count++,y}),{...l,text:m}})};var Ce=e=>{if(!e||e.trim().length===0)return!0;let t=e.trim(),r=t.length;if(r<2||W(t))return!0;let n=Q(t);if(U(n,r))return!0;let o=u.arabicCharacters.test(t);return!o&&/[a-zA-Z]/.test(t)?!0:o?!k(n,r):J(n,r,t)};function Q(e){let t={arabicCount:0,charFreq:new Map,digitCount:0,latinCount:0,punctuationCount:0,spaceCount:0,symbolCount:0},r=Array.from(e);for(let n of r)t.charFreq.set(n,(t.charFreq.get(n)||0)+1),u.arabicCharacters.test(n)?t.arabicCount++:/\d/.test(n)?t.digitCount++:/[a-zA-Z]/.test(n)?t.latinCount++:/\s/.test(n)?t.spaceCount++:/[.,;:()[\]{}"""''`]/.test(n)?t.punctuationCount++:t.symbolCount++;return t}function U(e,t){let r=0,n=["!",".","-","=","_"];for(let[o,s]of e.charFreq)s>=5&&n.includes(o)&&(r+=s);return r/t>.4}function W(e){return[/^[-=_โ”โ‰บโ‰ป\s]*$/,/^[.\s]*$/,/^[!\s]*$/,/^[A-Z\s]*$/,/^[-\d\s]*$/,/^\d+\s*$/,/^[A-Z]\s*$/,/^[โ€”\s]*$/,/^[เฅเคฐ\s-]*$/].some(r=>r.test(e))}function J(e,t,r){let n=e.arabicCount+e.latinCount+e.digitCount;return n===0||X(e,n,t)?!0:/[ู -ูฉ]/.test(r)&&e.digitCount>=3?!1:(e.symbolCount+Math.max(0,e.punctuationCount-5))/Math.max(n,1)>2||t<=5&&e.arabicCount===0&&!(/^\d+$/.test(r)&&e.digitCount>=3)?!0:/^\d{3,4}$/.test(r)?!1:t<=10}function X(e,t,r){let{arabicCount:n,spaceCount:o}=e;return o>0&&t===o+1&&t<=5||r<=10&&o>=2&&n===0||o/r>.6}function k(e,t){return e.arabicCount>=3||e.arabicCount>=1&&e.digitCount>0&&t<=20||e.arabicCount>=2&&e.punctuationCount<=2&&t<=10||e.arabicCount>=1&&t<=5&&e.punctuationCount<=1}var ee=(e,t,{similarityThreshold:r,typoSymbols:n})=>{if(e===null)return[t];if(t===null)return[e];if(g(e)===g(t))return[e];let o=E(e,t);if(o)return o;let s=N(e,t);if(s)return s;if(n.includes(e)||n.includes(t)){let f=n.find(d=>d===e||d===t);return f?[f]:[e]}let c=g(e),i=g(t);return[x(c,i)>r?e:t]},te=(e,t)=>{if(e.length===0)return e;let r=[];for(let n of e){if(r.length===0){r.push(n);continue}let 
o=r.at(-1);if(R(o,n,t)){n.length<o.length&&(r[r.length-1]=n);continue}S(r,o,n)||r.push(n)}return r},ne=(e,t,r)=>{let n=F(e,r.typoSymbols),o=F(t,r.typoSymbols),c=B(n,o,r.typoSymbols,r.similarityThreshold).flatMap(([a,f])=>ee(a,f,r));return te(c,r.highSimilarityThreshold).join(" ")},Ae=(e,t,{highSimilarityThreshold:r=.8,similarityThreshold:n=.6,typoSymbols:o})=>ne(e,t,{highSimilarityThreshold:r,similarityThreshold:n,typoSymbols:o});export{D as BRACKETS,L as CLOSE_BRACKETS,H as OPEN_BRACKETS,u as PATTERNS,ae as alignTextSegments,B as alignTokenSequences,Q as analyzeCharacterStats,de as areBracketsBalanced,fe as areQuotesBalanced,R as areSimilarAfterNormalization,_ as backtrackAlignment,v as calculateAlignmentScore,I as calculateLevenshteinDistance,x as calculateSimilarity,O as checkBalance,he as correctReferences,T as extractDigits,Ae as fixTypo,le as getUnbalancedErrors,S as handleFootnoteFusion,E as handleFootnoteSelection,N as handleStandaloneFootnotes,U as hasExcessiveRepetition,j as hasInvalidFootnotes,Ce as isArabicTextNoise,ge as isBalanced,W as isBasicNoisePattern,J as isNonArabicNoise,X as isSpacingNoise,k as isValidArabicContent,g as normalizeArabicText,ne as processTextAlignment,F as tokenizeText};
3
3
  //# sourceMappingURL=index.js.map
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/textUtils.ts","../src/similarity.ts","../src/balance.ts","../src/footnotes.ts","../src/index.ts"],"sourcesContent":["/**\n * Collection of regex patterns used throughout the library for text processing\n */\nexport const PATTERNS = {\n /** Matches Arabic-Indic digits (ู -ูฉ) and Western digits (0-9) */\n arabicDigits: /[0-9\\u0660-\\u0669]+/,\n\n /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\\([\\u0660-\\u0669]+\\) */\n arabicFootnoteReferenceRegex: /^\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ู -ูฉ) */\n arabicLettersAndDigits: /[0-9\\u0621-\\u063A\\u0641-\\u064A\\u0660-\\u0669]+/g,\n\n /** Matches Arabic punctuation marks and whitespace characters */\n arabicPunctuationAndWhitespace: /[\\s\\u060C\\u061B\\u061F\\u06D4]+/,\n\n /** Matches footnote references with Arabic-Indic digits in parentheses: \\([\\u0660-\\u0669]+\\) */\n arabicReferenceRegex: /\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic diacritical marks (harakat, tanween, etc.) 
*/\n diacritics: /[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]/g,\n\n /** Matches embedded footnotes within text: \\([0-9\\u0660-\\u0669]+\\) */\n footnoteEmbedded: /\\([0-9\\u0660-\\u0669]+\\)/,\n\n /** Matches standalone footnote markers at line start/end: ^\\(?[0-9\\u0660-\\u0669]+\\)?[ุŒ.]?$ */\n footnoteStandalone: /^\\(?[0-9\\u0660-\\u0669]+\\)?[ุŒ.]?$/,\n\n /** Matches invalid/problematic footnote references: empty \"()\" or OCR-confused endings */\n invalidReferenceRegex: /\\(\\)|\\([.1OV9]+\\)/g, // Combined pattern for detecting any invalid/problematic references\n\n /** Matches OCR-confused footnote references at line start with characters like .1OV9 */\n ocrConfusedFootnoteReferenceRegex: /^\\([.1OV9]+\\)/g,\n\n /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */\n ocrConfusedReferenceRegex: /\\([.1OV9]+\\)/g,\n\n /** Matches Arabic tatweel (kashida) character used for text stretching */\n tatweel: /\\u0640/g,\n\n /** Matches one or more whitespace characters */\n whitespace: /\\s+/,\n};\n\n/**\n * Normalizes Arabic text by removing diacritics, and tatweel marks.\n * This normalization enables better text comparison by focusing on core characters\n * while ignoring decorative elements that don't affect meaning.\n *\n * @param text - Arabic text to normalize\n * @returns Normalized text with diacritics, tatweel, and basic tags removed\n * @example\n * normalizeArabicText('ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’') // Returns 'ุงู„ุณู„ุงู… ุนู„ูŠูƒู…'\n */\nexport const normalizeArabicText = (text: string): string => {\n return text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();\n};\n\n/**\n * Extracts the first sequence of Arabic or Western digits from text.\n * Used primarily for footnote number comparison to match related footnote elements.\n *\n * @param text - Text containing digits to extract\n * @returns First digit sequence found, or empty string if none found\n * 
@example\n * extractDigits('(ูฅ)ุฃุฎุฑุฌู‡ ุงู„ุจุฎุงุฑูŠ') // Returns 'ูฅ'\n * extractDigits('See note (123)') // Returns '123'\n */\nexport const extractDigits = (text: string): string => {\n const match = text.match(PATTERNS.arabicDigits);\n return match ? match[0] : '';\n};\n\n/**\n * Tokenizes text into individual words while preserving special symbols.\n * Removes HTML tags, adds spacing around preserved symbols to ensure they\n * are tokenized separately, then splits on whitespace.\n *\n * @param text - Text to tokenize\n * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens\n * @returns Array of tokens, or empty array if input is empty/whitespace\n * @example\n * tokenizeText('Hello ๏ทบ world', ['๏ทบ']) // Returns ['Hello', '๏ทบ', 'world']\n */\nexport const tokenizeText = (text: string, preserveSymbols: string[] = []): string[] => {\n let processedText = text;\n\n // Add spaces around each preserve symbol to ensure they're tokenized separately\n for (const symbol of preserveSymbols) {\n const symbolRegex = new RegExp(symbol, 'g');\n processedText = processedText.replace(symbolRegex, ` ${symbol} `);\n }\n\n return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);\n};\n\n/**\n * Handles fusion of standalone and embedded footnotes during token processing.\n * Detects patterns where standalone footnotes should be merged with embedded ones\n * or where trailing standalone footnotes should be skipped.\n *\n * @param result - Current result array being built\n * @param previousToken - The previous token in the sequence\n * @param currentToken - The current token being processed\n * @returns True if the current token was handled (fused or skipped), false otherwise\n * @example\n * // (ูฅ) + (ูฅ)ุฃุฎุฑุฌู‡ โ†’ result gets (ูฅ)ุฃุฎุฑุฌู‡\n * // (ูฅ)ุฃุฎุฑุฌู‡ + (ูฅ) โ†’ (ูฅ) is skipped\n */\nexport const handleFootnoteFusion = (result: string[], previousToken: string, currentToken: string): boolean => {\n const 
prevIsStandalone = PATTERNS.footnoteStandalone.test(previousToken);\n const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);\n const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);\n const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);\n\n const prevDigits = extractDigits(previousToken);\n const currDigits = extractDigits(currentToken);\n\n // Replace standalone with fused version: (ูฅ) + (ูฅ)ุฃุฎุฑุฌู‡ โ†’ (ูฅ)ุฃุฎุฑุฌู‡\n if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {\n result[result.length - 1] = currentToken;\n return true;\n }\n\n // Skip trailing standalone: (ูฅ)ุฃุฎุฑุฌู‡ + (ูฅ) โ†’ (ูฅ)ุฃุฎุฑุฌู‡\n if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) {\n return true;\n }\n\n return false;\n};\n\n/**\n * Handles selection logic for tokens with embedded footnotes during alignment.\n * Prefers tokens that contain embedded footnotes over plain text, and among\n * tokens with embedded footnotes, prefers the shorter one.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleFootnoteSelection('text', '(ูก)text') // Returns ['(ูก)text']\n * handleFootnoteSelection('(ูก)longtext', '(ูก)text') // Returns ['(ูก)text']\n */\nexport const handleFootnoteSelection = (tokenA: string, tokenB: string): null | string[] => {\n const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);\n const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);\n\n if (aHasEmbedded && !bHasEmbedded) return [tokenA];\n if (bHasEmbedded && !aHasEmbedded) return [tokenB];\n if (aHasEmbedded && bHasEmbedded) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n\n/**\n * Handles selection logic for standalone footnote tokens during alignment.\n * Manages cases where one or both tokens are standalone footnotes, preserving\n * both tokens when one is a footnote and the other is regular text.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleStandaloneFootnotes('(ูก)', 'text') // Returns ['(ูก)', 'text']\n * handleStandaloneFootnotes('(ูก)', '(ูข)') // Returns ['(ูก)'] (shorter one)\n */\nexport const handleStandaloneFootnotes = (tokenA: string, tokenB: string): null | string[] => {\n const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);\n const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);\n\n if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];\n if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];\n if (aIsFootnote && bIsFootnote) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n","import { normalizeArabicText } from './textUtils';\n\n// Alignment scoring constants\nconst ALIGNMENT_SCORES = {\n GAP_PENALTY: -1,\n MISMATCH_PENALTY: -2,\n PERFECT_MATCH: 2,\n SOFT_MATCH: 1,\n};\n\n/**\n * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.\n * The Levenshtein distance is the minimum number of single-character edits (insertions,\n * deletions, or substitutions) required to change one string into another.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Minimum edit distance between the two strings\n * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths\n * @example\n * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3\n * calculateLevenshteinDistance('', 'hello') // Returns 5\n */\nexport const calculateLevenshteinDistance = (textA: string, textB: string): number => {\n const lengthA = textA.length;\n const lengthB = textB.length;\n\n if (lengthA === 0) {\n return lengthB;\n }\n\n if (lengthB === 0) {\n return lengthA;\n }\n\n // Use shorter string for the array to optimize space\n const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];\n const shortLen = shorter.length;\n const longLen = longer.length;\n\n let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);\n\n for (let i = 1; i <= longLen; i++) {\n const currentRow = [i];\n\n for (let j = 1; j <= shortLen; j++) {\n const substitutionCost = longer[i - 1] === shorter[j - 1] ? 
0 : 1;\n const minCost = Math.min(\n previousRow[j] + 1, // deletion\n currentRow[j - 1] + 1, // insertion\n previousRow[j - 1] + substitutionCost, // substitution\n );\n currentRow.push(minCost);\n }\n\n previousRow = currentRow;\n }\n\n return previousRow[shortLen];\n};\n\n/**\n * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.\n * Uses Levenshtein distance normalized by the length of the longer string.\n * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)\n * @example\n * calculateSimilarity('hello', 'hello') // Returns 1.0\n * calculateSimilarity('hello', 'help') // Returns 0.6\n */\nexport const calculateSimilarity = (textA: string, textB: string): number => {\n const maxLength = Math.max(textA.length, textB.length) || 1;\n const distance = calculateLevenshteinDistance(textA, textB);\n return (maxLength - distance) / maxLength;\n};\n\n/**\n * Checks if two texts are similar after Arabic normalization.\n * Normalizes both texts by removing diacritics and decorative elements,\n * then compares their similarity against the provided threshold.\n *\n * @param textA - First text to compare\n * @param textB - Second text to compare\n * @param threshold - Similarity threshold (0.0 to 1.0)\n * @returns True if normalized texts meet the similarity threshold\n * @example\n * areSimilarAfterNormalization('ุงู„ุณูŽู‘ู„ุงู…', 'ุงู„ุณู„ุงู…', 0.9) // Returns true\n */\nexport const areSimilarAfterNormalization = (textA: string, textB: string, threshold: number = 0.6): boolean => {\n const normalizedA = normalizeArabicText(textA);\n const normalizedB = normalizeArabicText(textB);\n return calculateSimilarity(normalizedA, normalizedB) >= threshold;\n};\n\n/**\n * Calculates alignment score for two tokens in sequence alignment.\n * Uses 
different scoring criteria: perfect match after normalization gets highest score,\n * typo symbols or highly similar tokens get soft match score, mismatches get penalty.\n *\n * @param tokenA - First token to score\n * @param tokenB - Second token to score\n * @param typoSymbols - Array of special symbols that get preferential treatment\n * @param similarityThreshold - Threshold for considering tokens highly similar\n * @returns Alignment score (higher is better match)\n * @example\n * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)\n * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity\n */\nexport const calculateAlignmentScore = (\n tokenA: string,\n tokenB: string,\n typoSymbols: string[],\n similarityThreshold: number,\n): number => {\n const normalizedA = normalizeArabicText(tokenA);\n const normalizedB = normalizeArabicText(tokenB);\n\n // Perfect match after normalization\n if (normalizedA === normalizedB) {\n return ALIGNMENT_SCORES.PERFECT_MATCH;\n }\n\n // Check if either token is a typo symbol or high similarity\n const isTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);\n const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;\n\n if (isTypoSymbol || isHighlySimilar) {\n return ALIGNMENT_SCORES.SOFT_MATCH;\n }\n\n return ALIGNMENT_SCORES.MISMATCH_PENALTY;\n};\n\ntype AlignedTokenPair = [null | string, null | string];\n\ntype AlignmentCell = {\n direction: 'diagonal' | 'left' | 'up' | null;\n score: number;\n};\n\n/**\n * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.\n * Follows the directional indicators in the matrix to build the sequence of aligned\n * token pairs from the Needleman-Wunsch algorithm.\n *\n * @param matrix - Scoring matrix with directional information from alignment\n * @param tokensA - First sequence of tokens\n * @param tokensB - Second sequence of tokens\n * @returns 
Array of aligned token pairs, where null indicates a gap\n * @throws Error if invalid alignment direction is encountered\n */\nexport const backtrackAlignment = (\n matrix: AlignmentCell[][],\n tokensA: string[],\n tokensB: string[],\n): AlignedTokenPair[] => {\n const alignment: AlignedTokenPair[] = [];\n let i = tokensA.length;\n let j = tokensB.length;\n\n while (i > 0 || j > 0) {\n const currentCell = matrix[i][j];\n\n switch (currentCell.direction) {\n case 'diagonal':\n alignment.push([tokensA[--i], tokensB[--j]]);\n break;\n case 'left':\n alignment.push([null, tokensB[--j]]);\n break;\n case 'up':\n alignment.push([tokensA[--i], null]);\n break;\n default:\n throw new Error('Invalid alignment direction');\n }\n }\n\n return alignment.reverse();\n};\n\n/**\n * Performs global sequence alignment using the Needleman-Wunsch algorithm.\n * Aligns two token sequences to find the optimal pairing that maximizes\n * the total alignment score, handling insertions, deletions, and substitutions.\n *\n * @param tokensA - First sequence of tokens to align\n * @param tokensB - Second sequence of tokens to align\n * @param typoSymbols - Special symbols that affect scoring\n * @param similarityThreshold - Threshold for high similarity scoring\n * @returns Array of aligned token pairs, with null indicating gaps\n * @example\n * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)\n * // Returns [['a', 'a'], ['b', 'c']]\n */\nexport const alignTokenSequences = (\n tokensA: string[],\n tokensB: string[],\n typoSymbols: string[],\n similarityThreshold: number,\n): AlignedTokenPair[] => {\n const lengthA = tokensA.length;\n const lengthB = tokensB.length;\n\n // Initialize scoring matrix\n const scoringMatrix: AlignmentCell[][] = Array.from({ length: lengthA + 1 }, () =>\n Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),\n );\n\n // Initialize first row and column\n for (let i = 1; i <= lengthA; i++) {\n scoringMatrix[i][0] = { direction: 'up', score: 
i * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n for (let j = 1; j <= lengthB; j++) {\n scoringMatrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n\n // Fill scoring matrix\n for (let i = 1; i <= lengthA; i++) {\n for (let j = 1; j <= lengthB; j++) {\n const alignmentScore = calculateAlignmentScore(\n tokensA[i - 1],\n tokensB[j - 1],\n typoSymbols,\n similarityThreshold,\n );\n\n const diagonalScore = scoringMatrix[i - 1][j - 1].score + alignmentScore;\n const upScore = scoringMatrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;\n const leftScore = scoringMatrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;\n\n const bestScore = Math.max(diagonalScore, upScore, leftScore);\n let bestDirection: 'diagonal' | 'left' | 'up' = 'left';\n\n if (bestScore === diagonalScore) {\n bestDirection = 'diagonal';\n } else if (bestScore === upScore) {\n bestDirection = 'up';\n }\n\n scoringMatrix[i][j] = { direction: bestDirection, score: bestScore };\n }\n }\n\n // Backtrack to build alignment\n return backtrackAlignment(scoringMatrix, tokensA, tokensB);\n};\n","/**\n * Represents an error found when checking balance of quotes or brackets in text.\n */\ntype BalanceError = {\n /** The character that caused the error */\n char: string;\n /** The position of the character in the string */\n index: number;\n /** The reason for the error */\n reason: 'mismatched' | 'unclosed' | 'unmatched';\n /** The type of character that caused the error */\n type: 'bracket' | 'quote';\n};\n\n/**\n * Result of a balance check operation.\n */\ntype BalanceResult = {\n /** Array of errors found during balance checking */\n errors: BalanceError[];\n /** Whether the text is properly balanced */\n isBalanced: boolean;\n};\n\n/**\n * Checks if all double quotes in a string are balanced and returns detailed error information.\n *\n * A string has balanced quotes when every opening quote has a corresponding closing quote.\n * This function counts all quote characters and 
determines if there's an even number of them.\n * If there's an odd number, the last quote is marked as unmatched.\n *\n * @param str - The string to check for quote balance\n * @returns An object containing balance status and any errors found\n *\n * @example\n * ```typescript\n * checkQuoteBalance('Hello \"world\"') // { errors: [], isBalanced: true }\n * checkQuoteBalance('Hello \"world') // { errors: [{ char: '\"', index: 6, reason: 'unmatched', type: 'quote' }], isBalanced: false }\n * ```\n */\nconst checkQuoteBalance = (str: string): BalanceResult => {\n const errors: BalanceError[] = [];\n let quoteCount = 0;\n let lastQuoteIndex = -1;\n\n for (let i = 0; i < str.length; i++) {\n if (str[i] === '\"') {\n quoteCount++;\n lastQuoteIndex = i;\n }\n }\n\n const isBalanced = quoteCount % 2 === 0;\n\n if (!isBalanced && lastQuoteIndex !== -1) {\n errors.push({\n char: '\"',\n index: lastQuoteIndex,\n reason: 'unmatched',\n type: 'quote',\n });\n }\n\n return { errors, isBalanced };\n};\n\n/** Mapping of opening brackets to their corresponding closing brackets */\nexport const BRACKETS = { 'ยซ': 'ยป', '(': ')', '[': ']', '{': '}' };\n\n/** Set of all opening bracket characters */\nexport const OPEN_BRACKETS = new Set(['ยซ', '(', '[', '{']);\n\n/** Set of all closing bracket characters */\nexport const CLOSE_BRACKETS = new Set(['ยป', ')', ']', '}']);\n\n/**\n * Checks if all brackets in a string are properly balanced and returns detailed error information.\n *\n * A string has balanced brackets when:\n * - Every opening bracket has a corresponding closing bracket\n * - Brackets are properly nested (no crossing pairs)\n * - Each closing bracket matches the most recent unmatched opening bracket\n *\n * Supports the following bracket pairs: (), [], {}, ยซยป\n *\n * @param str - The string to check for bracket balance\n * @returns An object containing balance status and any errors found\n *\n * @example\n * ```typescript\n * checkBracketBalance('(hello [world])') // { 
errors: [], isBalanced: true }\n * checkBracketBalance('(hello [world)') // { errors: [{ char: '[', index: 7, reason: 'unclosed', type: 'bracket' }], isBalanced: false }\n * checkBracketBalance('(hello ]world[') // { errors: [...], isBalanced: false }\n * ```\n */\nconst checkBracketBalance = (str: string): BalanceResult => {\n const errors: BalanceError[] = [];\n const stack: Array<{ char: string; index: number }> = [];\n\n for (let i = 0; i < str.length; i++) {\n const char = str[i];\n\n if (OPEN_BRACKETS.has(char)) {\n stack.push({ char, index: i });\n } else if (CLOSE_BRACKETS.has(char)) {\n const lastOpen = stack.pop();\n\n if (!lastOpen) {\n errors.push({\n char,\n index: i,\n reason: 'unmatched',\n type: 'bracket',\n });\n } else if (BRACKETS[lastOpen.char as keyof typeof BRACKETS] !== char) {\n errors.push({\n char: lastOpen.char,\n index: lastOpen.index,\n reason: 'mismatched',\n type: 'bracket',\n });\n errors.push({\n char,\n index: i,\n reason: 'mismatched',\n type: 'bracket',\n });\n }\n }\n }\n\n stack.forEach(({ char, index }) => {\n errors.push({\n char,\n index,\n reason: 'unclosed',\n type: 'bracket',\n });\n });\n\n return { errors, isBalanced: errors.length === 0 };\n};\n\n/**\n * Checks if both quotes and brackets are balanced in a string and returns detailed error information.\n *\n * This function combines the results of both quote and bracket balance checking,\n * providing a comprehensive analysis of all balance issues in the text.\n * The errors are sorted by their position in the string for easier debugging.\n *\n * @param str - The string to check for overall balance\n * @returns An object containing combined balance status and all errors found, sorted by position\n *\n * @example\n * ```typescript\n * checkBalance('Hello \"world\" and (test)') // { errors: [], isBalanced: true }\n * checkBalance('Hello \"world and (test') // { errors: [...], isBalanced: false }\n * ```\n */\nexport const checkBalance = (str: string): BalanceResult => 
{\n const quoteResult = checkQuoteBalance(str);\n const bracketResult = checkBracketBalance(str);\n\n return {\n errors: [...quoteResult.errors, ...bracketResult.errors].sort((a, b) => a.index - b.index),\n isBalanced: quoteResult.isBalanced && bracketResult.isBalanced,\n };\n};\n\n/**\n * Enhanced error detection that returns absolute character positions for use with HighlightableTextarea.\n *\n * This interface extends the basic BalanceError to include absolute positioning\n * across multiple lines of text, making it suitable for text editors and\n * syntax highlighters that need precise character positioning.\n */\nexport interface CharacterError {\n /** Absolute character position from the start of the entire text */\n absoluteIndex: number;\n /** The character that caused the error */\n char: string;\n /** The reason for the error */\n reason: 'mismatched' | 'unclosed' | 'unmatched';\n /** The type of character that caused the error */\n type: 'bracket' | 'quote';\n}\n\n/**\n * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.\n *\n * This function processes text line by line, but only checks lines longer than 10 characters\n * for balance issues. 
It returns absolute positions that can be used with text editors\n * or highlighting components that need precise character positioning across the entire text.\n *\n * The absolute index accounts for newline characters between lines, providing accurate\n * positioning for the original text string.\n *\n * @param text - The multi-line text to analyze for balance errors\n * @returns Array of character errors with absolute positioning information\n *\n * @example\n * ```typescript\n * const text = 'Line 1 with \"quote\\nLine 2 with (bracket';\n * const errors = getUnbalancedErrors(text);\n * // Returns errors with absoluteIndex pointing to exact character positions\n * ```\n */\nexport const getUnbalancedErrors = (text: string): CharacterError[] => {\n const characterErrors: CharacterError[] = [];\n const lines = text.split('\\n');\n let absoluteIndex = 0;\n\n lines.forEach((line, lineIndex) => {\n if (line.length > 10) {\n const balanceResult = checkBalance(line);\n if (!balanceResult.isBalanced) {\n balanceResult.errors.forEach((error) => {\n characterErrors.push({\n absoluteIndex: absoluteIndex + error.index,\n char: error.char,\n reason: error.reason,\n type: error.type,\n });\n });\n }\n }\n // Add 1 for the newline character (except for the last line)\n absoluteIndex += line.length + (lineIndex < lines.length - 1 ? 
1 : 0);\n });\n\n return characterErrors;\n};\n\n/**\n * Checks if all double quotes in a string are balanced.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for quote balance\n * @returns True if quotes are balanced, false otherwise\n *\n * @example\n * ```typescript\n * areQuotesBalanced('Hello \"world\"') // true\n * areQuotesBalanced('Hello \"world') // false\n * ```\n */\nexport const areQuotesBalanced = (str: string): boolean => {\n return checkQuoteBalance(str).isBalanced;\n};\n\n/**\n * Checks if all brackets in a string are properly balanced.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for bracket balance\n * @returns True if brackets are balanced, false otherwise\n *\n * @example\n * ```typescript\n * areBracketsBalanced('(hello [world])') // true\n * areBracketsBalanced('(hello [world') // false\n * ```\n */\nexport const areBracketsBalanced = (str: string): boolean => {\n return checkBracketBalance(str).isBalanced;\n};\n\n/**\n * Checks if both quotes and brackets are balanced in a string.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for overall balance\n * @returns True if both quotes and brackets are balanced, false otherwise\n *\n * @example\n * ```typescript\n * isBalanced('Hello \"world\" and (test)') // true\n * isBalanced('Hello \"world and (test') // false\n * ```\n */\nexport const isBalanced = (str: string): boolean => {\n return checkBalance(str).isBalanced;\n};\n","import { PATTERNS } from './textUtils';\n\nconst INVALID_FOOTNOTE = '()';\n\n/**\n * Checks if the given text contains invalid footnote references.\n * Invalid footnotes include empty parentheses \"()\" or OCR-confused characters\n * like \".1OV9\" 
that were misrecognized instead of Arabic numerals.\n *\n * @param text - Text to check for invalid footnote patterns\n * @returns True if text contains invalid footnote references, false otherwise\n * @example\n * hasInvalidFootnotes('This text has ()') // Returns true\n * hasInvalidFootnotes('This text has (ูก)') // Returns false\n * hasInvalidFootnotes('OCR mistake (O)') // Returns true\n */\nexport const hasInvalidFootnotes = (text: string): boolean => {\n return PATTERNS.invalidReferenceRegex.test(text);\n};\n\n// Arabic number formatter instance\nconst arabicFormatter = new Intl.NumberFormat('ar-SA');\n\n/**\n * Converts a number to Arabic-Indic numerals using the Intl.NumberFormat API.\n * Uses the 'ar-SA' locale to ensure proper Arabic numeral formatting.\n *\n * @param num - The number to convert to Arabic numerals\n * @returns String representation using Arabic-Indic digits (ู -ูฉ)\n * @example\n * numberToArabic(123) // Returns 'ูกูขูฃ'\n * numberToArabic(5) // Returns 'ูฅ'\n */\nconst numberToArabic = (num: number): string => {\n return arabicFormatter.format(num);\n};\n\n/**\n * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.\n * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits.\n *\n * @param char - Single character that may be an OCR mistake\n * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists\n * @example\n * ocrToArabic('O') // Returns 'ูฅ' (O often confused with ูฅ)\n * ocrToArabic('1') // Returns 'ูก' (1 often confused with ูก)\n * ocrToArabic('.') // Returns 'ู ' (dot often confused with ู )\n */\nconst ocrToArabic = (char: string): string => {\n const ocrToArabicMap: { [key: string]: string } = {\n '1': 'ูก',\n '9': 'ูฉ',\n '.': 'ู ',\n O: 'ูฅ',\n o: 'ูฅ',\n V: 'ูง',\n v: 'ูง',\n };\n return ocrToArabicMap[char] || char;\n};\n\n/**\n * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number.\n * Removes 
parentheses and converts each Arabic-Indic digit to its Western equivalent.\n *\n * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(ูกูขูฃ)'\n * @returns Parsed number, or 0 if parsing fails\n * @example\n * arabicToNumber('(ูกูขูฃ)') // Returns 123\n * arabicToNumber('(ูฅ)') // Returns 5\n * arabicToNumber('invalid') // Returns 0\n */\nconst arabicToNumber = (arabicStr: string): number => {\n const lookup: { [key: string]: string } = {\n 'ู ': '0',\n 'ูก': '1',\n 'ูข': '2',\n 'ูฃ': '3',\n 'ูค': '4',\n 'ูฅ': '5',\n 'ูฆ': '6',\n 'ูง': '7',\n 'ูจ': '8',\n 'ูฉ': '9',\n };\n const digits = arabicStr.replace(/[()]/g, '');\n let numStr = '';\n for (const char of digits) {\n numStr += lookup[char];\n }\n const parsed = parseInt(numStr, 10);\n return isNaN(parsed) ? 0 : parsed;\n};\n\ntype TextLine = {\n isFootnote?: boolean;\n text: string;\n};\n\n/**\n * Extracts all footnote references from text lines, categorizing them by type and location.\n * Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes.\n *\n * @param lines - Array of text line objects with optional isFootnote flag\n * @returns Object containing categorized reference arrays:\n * - bodyReferences: All valid references found in body text\n * - footnoteReferences: All valid references found in footnotes\n * - ocrConfusedInBody: OCR-confused references in body text (for tracking)\n * - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)\n * @example\n * const lines = [\n * { text: 'Body with (ูก) and (O)', isFootnote: false },\n * { text: '(ูก) Footnote text', isFootnote: true }\n * ];\n * const refs = extractReferences(lines);\n * // refs.bodyReferences contains ['(ูก)', '(ูฅ)'] - OCR 'O' converted to 'ูฅ'\n */\nconst extractReferences = (lines: TextLine[]) => {\n const arabicReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []);\n\n const 
ocrConfusedReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []);\n\n const arabicReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []);\n\n const ocrConfusedReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []);\n\n const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n return {\n bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs],\n footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs],\n ocrConfusedInBody: ocrConfusedReferencesInBody,\n ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes,\n };\n};\n\n/**\n * Determines if footnote reference correction is needed by checking for:\n * 1. Invalid footnote patterns (empty parentheses, OCR mistakes)\n * 2. Mismatched sets of references between body text and footnotes\n * 3. 
Different counts of references in body vs footnotes\n *\n * @param lines - Array of text line objects to analyze\n * @param references - Extracted reference data from extractReferences()\n * @returns True if correction is needed, false if references are already correct\n * @example\n * const lines = [{ text: 'Text with ()', isFootnote: false }];\n * const refs = extractReferences(lines);\n * needsCorrection(lines, refs) // Returns true due to invalid \"()\" reference\n */\nconst needsCorrection = (lines: TextLine[], references: ReturnType<typeof extractReferences>) => {\n const mistakenReferences = lines.some((line) => hasInvalidFootnotes(line.text));\n if (mistakenReferences) return true;\n\n const bodySet = new Set(references.bodyReferences);\n const footnoteSet = new Set(references.footnoteReferences);\n if (bodySet.size !== footnoteSet.size) return true;\n\n // Check if the sets contain the same elements\n for (const ref of bodySet) {\n if (!footnoteSet.has(ref)) {\n return true;\n }\n }\n\n return false;\n};\n\n/**\n * Corrects footnote references in an array of text lines by:\n * 1. Converting OCR-confused characters to proper Arabic numerals\n * 2. Filling in empty \"()\" references with appropriate numbers\n * 3. Ensuring footnote references in body text match those in footnotes\n * 4. 
Generating new reference numbers when needed\n *\n * @param lines - Array of text line objects, each with optional isFootnote flag\n * @returns Array of corrected text lines with proper footnote references\n * @example\n * const lines = [\n * { text: 'Main text with ()', isFootnote: false },\n * { text: '() This is a footnote', isFootnote: true }\n * ];\n * const corrected = correctReferences(lines);\n * // Returns lines with \"()\" replaced by proper Arabic numerals like \"(ูก)\"\n */\nexport const correctReferences = <T extends TextLine>(lines: T[]): T[] => {\n const initialReferences = extractReferences(lines);\n\n if (!needsCorrection(lines, initialReferences)) {\n return lines;\n }\n\n // Pass 1: Sanitize lines by correcting only OCR characters inside reference markers.\n const sanitizedLines = lines.map((line) => {\n let updatedText = line.text;\n // This regex finds the full reference, e.g., \"(O)\" or \"(1)\"\n const ocrRegex = /\\([.1OV9]+\\)/g;\n updatedText = updatedText.replace(ocrRegex, (match) => {\n // This replace acts *inside* the found match, e.g., on \"O\" or \"1\"\n return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));\n });\n return { ...line, text: updatedText };\n });\n\n // Pass 2: Analyze the sanitized lines to get a clear and accurate picture of references.\n const cleanReferences = extractReferences(sanitizedLines);\n\n // Step 3: Create queues of \"unmatched\" references for two-way pairing.\n const bodyRefSet = new Set(cleanReferences.bodyReferences);\n const footnoteRefSet = new Set(cleanReferences.footnoteReferences);\n\n const uniqueBodyRefs = [...new Set(cleanReferences.bodyReferences)];\n const uniqueFootnoteRefs = [...new Set(cleanReferences.footnoteReferences)];\n\n // Queue 1: Body references available for footnotes.\n const bodyRefsForFootnotes = uniqueBodyRefs.filter((ref) => !footnoteRefSet.has(ref));\n // Queue 2: Footnote references available for the body.\n const footnoteRefsForBody = 
uniqueFootnoteRefs.filter((ref) => !bodyRefSet.has(ref));\n\n // Step 4: Determine the starting point for any completely new reference numbers.\n const allRefs = [...bodyRefSet, ...footnoteRefSet];\n const maxRefNum = allRefs.length > 0 ? Math.max(0, ...allRefs.map((ref) => arabicToNumber(ref))) : 0;\n const referenceCounter = { count: maxRefNum + 1 };\n\n // Step 5: Map over the sanitized lines, filling in '()' using the queues.\n return sanitizedLines.map((line) => {\n if (!line.text.includes(INVALID_FOOTNOTE)) {\n return line;\n }\n let updatedText = line.text;\n\n updatedText = updatedText.replace(/\\(\\)/g, () => {\n if (line.isFootnote) {\n const availableRef = bodyRefsForFootnotes.shift();\n if (availableRef) return availableRef;\n } else {\n // It's body text\n const availableRef = footnoteRefsForBody.shift();\n if (availableRef) return availableRef;\n }\n\n // If no available partner reference exists, generate a new one.\n const newRef = `(${numberToArabic(referenceCounter.count)})`;\n referenceCounter.count++;\n return newRef;\n });\n\n return { ...line, text: updatedText };\n });\n};\n","import type { FixTypoOptions } from './types';\n\nimport { alignTokenSequences, areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport {\n handleFootnoteFusion,\n handleFootnoteSelection,\n handleStandaloneFootnotes,\n normalizeArabicText,\n tokenizeText,\n} from './textUtils';\n\n/**\n * Selects the best token(s) from an aligned pair during typo correction.\n * Uses various heuristics including normalization, footnote handling, typo symbols,\n * and similarity scores to determine which token(s) to keep.\n *\n * @param originalToken - Token from the original OCR text (may be null)\n * @param altToken - Token from the alternative OCR text (may be null)\n * @param options - Configuration options including typo symbols and similarity threshold\n * @returns Array of selected tokens (usually contains one token, but may contain multiple)\n */\nconst 
selectBestTokens = (\n originalToken: null | string,\n altToken: null | string,\n { similarityThreshold, typoSymbols }: FixTypoOptions,\n): string[] => {\n // Handle missing tokens\n if (originalToken === null) {\n return [altToken!];\n }\n if (altToken === null) {\n return [originalToken];\n }\n\n // Preserve original if same after normalization (keeps diacritics)\n if (normalizeArabicText(originalToken) === normalizeArabicText(altToken)) {\n return [originalToken];\n }\n\n // Handle embedded footnotes\n const result = handleFootnoteSelection(originalToken, altToken);\n if (result) return result;\n\n // Handle standalone footnotes\n const footnoteResult = handleStandaloneFootnotes(originalToken, altToken);\n if (footnoteResult) return footnoteResult;\n\n // Handle typo symbols - prefer the symbol itself\n if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {\n const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);\n return typoSymbol ? [typoSymbol] : [originalToken];\n }\n\n // Choose based on similarity\n const normalizedOriginal = normalizeArabicText(originalToken);\n const normalizedAlt = normalizeArabicText(altToken);\n const similarity = calculateSimilarity(normalizedOriginal, normalizedAlt);\n\n return [similarity > similarityThreshold ? originalToken : altToken];\n};\n\n/**\n * Removes duplicate tokens and handles footnote fusion in post-processing.\n * Identifies and removes tokens that are highly similar while preserving\n * important variations. 
Also handles special cases like footnote merging.\n *\n * @param tokens - Array of tokens to process\n * @param highSimilarityThreshold - Threshold for detecting duplicates (0.0 to 1.0)\n * @returns Array of tokens with duplicates removed and footnotes fused\n */\nconst removeDuplicateTokens = (tokens: string[], highSimilarityThreshold: number): string[] => {\n if (tokens.length === 0) {\n return tokens;\n }\n\n const result: string[] = [];\n\n for (const currentToken of tokens) {\n if (result.length === 0) {\n result.push(currentToken);\n continue;\n }\n\n const previousToken = result.at(-1)!;\n\n // Handle ordinary echoes (similar tokens)\n if (areSimilarAfterNormalization(previousToken, currentToken, highSimilarityThreshold)) {\n // Keep the shorter version\n if (currentToken.length < previousToken.length) {\n result[result.length - 1] = currentToken;\n }\n continue;\n }\n\n // Handle footnote fusion cases\n if (handleFootnoteFusion(result, previousToken, currentToken)) {\n continue;\n }\n\n result.push(currentToken);\n }\n\n return result;\n};\n\n/**\n * Processes text alignment between original and alternate OCR results to fix typos.\n * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,\n * then selects the best tokens and performs post-processing.\n *\n * @param originalText - Original OCR text that may contain typos\n * @param altText - Reference text from alternate OCR for comparison\n * @param options - Configuration options for alignment and selection\n * @returns Corrected text with typos fixed\n */\nexport const processTextAlignment = (originalText: string, altText: string, options: FixTypoOptions): string => {\n const originalTokens = tokenizeText(originalText, options.typoSymbols);\n const altTokens = tokenizeText(altText, options.typoSymbols);\n\n // Align token sequences\n const alignedPairs = alignTokenSequences(\n originalTokens,\n altTokens,\n options.typoSymbols,\n options.similarityThreshold,\n );\n\n // Select best tokens 
from each aligned pair\n const mergedTokens = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));\n\n // Remove duplicates and handle post-processing\n const finalTokens = removeDuplicateTokens(mergedTokens, options.highSimilarityThreshold);\n\n return finalTokens.join(' ');\n};\n\nexport const fixTypo = (\n original: string,\n correction: string,\n {\n highSimilarityThreshold = 0.8,\n similarityThreshold = 0.6,\n typoSymbols,\n }: Partial<FixTypoOptions> & Pick<FixTypoOptions, 'typoSymbols'>,\n) => {\n return processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });\n};\n\nexport * from './balance';\nexport * from './footnotes';\nexport * from './similarity';\nexport * from './textUtils';\n"],"mappings":"AAGO,IAAMA,EAAW,CAEpB,aAAc,sBAGd,6BAA8B,yBAG9B,uBAAwB,iDAGxB,+BAAgC,gCAGhC,qBAAsB,wBAGtB,WAAY,mDAGZ,iBAAkB,0BAGlB,mBAAoB,mCAGpB,sBAAuB,qBAGvB,kCAAmC,iBAGnC,0BAA2B,gBAG3B,QAAS,UAGT,WAAY,KAChB,EAYaC,EAAuBC,GACzBA,EAAK,QAAQF,EAAS,QAAS,EAAE,EAAE,QAAQA,EAAS,WAAY,EAAE,EAAE,KAAK,EAavEG,EAAiBD,GAAyB,CACnD,IAAME,EAAQF,EAAK,MAAMF,EAAS,YAAY,EAC9C,OAAOI,EAAQA,EAAM,CAAC,EAAI,EAC9B,EAaaC,EAAe,CAACH,EAAcI,EAA4B,CAAC,IAAgB,CACpF,IAAIC,EAAgBL,EAGpB,QAAWM,KAAUF,EAAiB,CAClC,IAAMG,EAAc,IAAI,OAAOD,EAAQ,GAAG,EAC1CD,EAAgBA,EAAc,QAAQE,EAAa,IAAID,CAAM,GAAG,CACpE,CAEA,OAAOD,EAAc,KAAK,EAAE,MAAMP,EAAS,UAAU,EAAE,OAAO,OAAO,CACzE,EAeaU,EAAuB,CAACC,EAAkBC,EAAuBC,IAAkC,CAC5G,IAAMC,EAAmBd,EAAS,mBAAmB,KAAKY,CAAa,EACjEG,EAAkBf,EAAS,iBAAiB,KAAKa,CAAY,EAC7DG,EAAmBhB,EAAS,mBAAmB,KAAKa,CAAY,EAChEI,EAAkBjB,EAAS,iBAAiB,KAAKY,CAAa,EAE9DM,EAAaf,EAAcS,CAAa,EACxCO,EAAahB,EAAcU,CAAY,EAG7C,OAAIC,GAAoBC,GAAmBG,IAAeC,GACtDR,EAAOA,EAAO,OAAS,CAAC,EAAIE,EACrB,IAIP,GAAAI,GAAmBD,GAAoBE,IAAeC,EAK9D,EAcaC,EAA0B,CAACC,EAAgBC,IAAoC,CACxF,IAAMC,EAAevB,EAAS,iBAAiB,KAAKqB,CAAM,EACpDG,EAAexB,EAAS,iBAAiB,KAAKsB,CAAM,EAE1D,OAAIC,GAAgB,CAACC,EAAqB,CAACH,CAAM,EAC7CG,GAAgB,CAACD,EAAqB,CAACD,CAAM,EAC7CC,GAAgBC,EACT,CAACH,EAAO,QAAUC,EAAO,OAASD,EAASC,CAA
M,EAGrD,IACX,EAcaG,EAA4B,CAACJ,EAAgBC,IAAoC,CAC1F,IAAMI,EAAc1B,EAAS,mBAAmB,KAAKqB,CAAM,EACrDM,EAAc3B,EAAS,mBAAmB,KAAKsB,CAAM,EAE3D,OAAII,GAAe,CAACC,EAAoB,CAACN,EAAQC,CAAM,EACnDK,GAAe,CAACD,EAAoB,CAACJ,EAAQD,CAAM,EACnDK,GAAeC,EACR,CAACN,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,ECjLA,IAAMM,EAAmB,CACrB,YAAa,GACb,iBAAkB,GAClB,cAAe,EACf,WAAY,CAChB,EAeaC,EAA+B,CAACC,EAAeC,IAA0B,CAClF,IAAMC,EAAUF,EAAM,OAChBG,EAAUF,EAAM,OAEtB,GAAIC,IAAY,EACZ,OAAOC,EAGX,GAAIA,IAAY,EACZ,OAAOD,EAIX,GAAM,CAACE,EAASC,CAAM,EAAIH,GAAWC,EAAU,CAACH,EAAOC,CAAK,EAAI,CAACA,EAAOD,CAAK,EACvEM,EAAWF,EAAQ,OACnBG,EAAUF,EAAO,OAEnBG,EAAc,MAAM,KAAK,CAAE,OAAQF,EAAW,CAAE,EAAG,CAACG,EAAGC,IAAUA,CAAK,EAE1E,QAASC,EAAI,EAAGA,GAAKJ,EAASI,IAAK,CAC/B,IAAMC,EAAa,CAACD,CAAC,EAErB,QAASE,EAAI,EAAGA,GAAKP,EAAUO,IAAK,CAChC,IAAMC,EAAmBT,EAAOM,EAAI,CAAC,IAAMP,EAAQS,EAAI,CAAC,EAAI,EAAI,EAC1DE,EAAU,KAAK,IACjBP,EAAYK,CAAC,EAAI,EACjBD,EAAWC,EAAI,CAAC,EAAI,EACpBL,EAAYK,EAAI,CAAC,EAAIC,CACzB,EACAF,EAAW,KAAKG,CAAO,CAC3B,CAEAP,EAAcI,CAClB,CAEA,OAAOJ,EAAYF,CAAQ,CAC/B,EAcaU,EAAsB,CAAChB,EAAeC,IAA0B,CACzE,IAAMgB,EAAY,KAAK,IAAIjB,EAAM,OAAQC,EAAM,MAAM,GAAK,EACpDiB,EAAWnB,EAA6BC,EAAOC,CAAK,EAC1D,OAAQgB,EAAYC,GAAYD,CACpC,EAcaE,EAA+B,CAACnB,EAAeC,EAAemB,EAAoB,KAAiB,CAC5G,IAAMC,EAAcC,EAAoBtB,CAAK,EACvCuB,EAAcD,EAAoBrB,CAAK,EAC7C,OAAOe,EAAoBK,EAAaE,CAAW,GAAKH,CAC5D,EAgBaI,EAA0B,CACnCC,EACAC,EACAC,EACAC,IACS,CACT,IAAMP,EAAcC,EAAoBG,CAAM,EACxCF,EAAcD,EAAoBI,CAAM,EAG9C,GAAIL,IAAgBE,EAChB,OAAOzB,EAAiB,cAI5B,IAAM+B,EAAeF,EAAY,SAASF,CAAM,GAAKE,EAAY,SAASD,CAAM,EAC1EI,EAAkBd,EAAoBK,EAAaE,CAAW,GAAKK,EAEzE,OAAIC,GAAgBC,EACThC,EAAiB,WAGrBA,EAAiB,gBAC5B,EAoBaiC,EAAqB,CAC9BC,EACAC,EACAC,IACqB,CACrB,IAAMC,EAAgC,CAAC,EACnCxB,EAAIsB,EAAQ,OACZpB,EAAIqB,EAAQ,OAEhB,KAAOvB,EAAI,GAAKE,EAAI,GAGhB,OAFoBmB,EAAOrB,CAAC,EAAEE,CAAC,EAEX,UAAW,CAC3B,IAAK,WACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAGuB,EAAQ,EAAErB,CAAC,CAAC,CAAC,EAC3C,MACJ,IAAK,OACDsB,EAAU,KAAK,CAAC,KAAMD,EAAQ,EAAErB,CAAC,CAAC,CAAC,EACnC,MACJ,IAAK,KACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAG,IAAI,CAAC,EAC
nC,MACJ,QACI,MAAM,IAAI,MAAM,6BAA6B,CACrD,CAGJ,OAAOwB,EAAU,QAAQ,CAC7B,EAgBaC,EAAsB,CAC/BH,EACAC,EACAP,EACAC,IACqB,CACrB,IAAM1B,EAAU+B,EAAQ,OAClB9B,EAAU+B,EAAQ,OAGlBG,EAAmC,MAAM,KAAK,CAAE,OAAQnC,EAAU,CAAE,EAAG,IACzE,MAAM,KAAK,CAAE,OAAQC,EAAU,CAAE,EAAG,KAAO,CAAE,UAAW,KAAM,MAAO,CAAE,EAAE,CAC7E,EAGA,QAASQ,EAAI,EAAGA,GAAKT,EAASS,IAC1B0B,EAAc1B,CAAC,EAAE,CAAC,EAAI,CAAE,UAAW,KAAM,MAAOA,EAAIb,EAAiB,WAAY,EAErF,QAASe,EAAI,EAAGA,GAAKV,EAASU,IAC1BwB,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW,OAAQ,MAAOA,EAAIf,EAAiB,WAAY,EAIvF,QAASa,EAAI,EAAGA,GAAKT,EAASS,IAC1B,QAASE,EAAI,EAAGA,GAAKV,EAASU,IAAK,CAC/B,IAAMyB,EAAiBd,EACnBS,EAAQtB,EAAI,CAAC,EACbuB,EAAQrB,EAAI,CAAC,EACbc,EACAC,CACJ,EAEMW,EAAgBF,EAAc1B,EAAI,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQyB,EACpDE,EAAUH,EAAc1B,EAAI,CAAC,EAAEE,CAAC,EAAE,MAAQf,EAAiB,YAC3D2C,EAAYJ,EAAc1B,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQf,EAAiB,YAE7D4C,EAAY,KAAK,IAAIH,EAAeC,EAASC,CAAS,EACxDE,EAA4C,OAE5CD,IAAcH,EACdI,EAAgB,WACTD,IAAcF,IACrBG,EAAgB,MAGpBN,EAAc1B,CAAC,EAAEE,CAAC,EAAI,CAAE,UAAW8B,EAAe,MAAOD,CAAU,CACvE,CAIJ,OAAOX,EAAmBM,EAAeJ,EAASC,CAAO,CAC7D,ECjNA,IAAMU,EAAqBC,GAA+B,CACtD,IAAMC,EAAyB,CAAC,EAC5BC,EAAa,EACbC,EAAiB,GAErB,QAASC,EAAI,EAAGA,EAAIJ,EAAI,OAAQI,IACxBJ,EAAII,CAAC,IAAM,MACXF,IACAC,EAAiBC,GAIzB,IAAMC,EAAaH,EAAa,IAAM,EAEtC,MAAI,CAACG,GAAcF,IAAmB,IAClCF,EAAO,KAAK,CACR,KAAM,IACN,MAAOE,EACP,OAAQ,YACR,KAAM,OACV,CAAC,EAGE,CAAE,OAAAF,EAAQ,WAAAI,CAAW,CAChC,EAGaC,EAAW,CAAE,OAAK,OAAK,IAAK,IAAK,IAAK,IAAK,IAAK,GAAI,EAGpDC,EAAgB,IAAI,IAAI,CAAC,OAAK,IAAK,IAAK,GAAG,CAAC,EAG5CC,EAAiB,IAAI,IAAI,CAAC,OAAK,IAAK,IAAK,GAAG,CAAC,EAsBpDC,EAAuBT,GAA+B,CACxD,IAAMC,EAAyB,CAAC,EAC1BS,EAAgD,CAAC,EAEvD,QAASN,EAAI,EAAGA,EAAIJ,EAAI,OAAQI,IAAK,CACjC,IAAMO,EAAOX,EAAII,CAAC,EAElB,GAAIG,EAAc,IAAII,CAAI,EACtBD,EAAM,KAAK,CAAE,KAAAC,EAAM,MAAOP,CAAE,CAAC,UACtBI,EAAe,IAAIG,CAAI,EAAG,CACjC,IAAMC,EAAWF,EAAM,IAAI,EAEtBE,EAOMN,EAASM,EAAS,IAA6B,IAAMD,IAC5DV,EAAO,KAAK,CACR,KAAMW,EAAS,KACf,MAAOA,EAAS,MAChB,OAAQ,aACR,KAAM,SACV,CAAC,EACDX,EAAO,KAAK,CACR,KAAAU,EACA,MAAOP,EACP,OAAQ,aACR,KAAM,SACV,CAAC,GAlBDH,EAAO,K
AAK,CACR,KAAAU,EACA,MAAOP,EACP,OAAQ,YACR,KAAM,SACV,CAAC,CAeT,CACJ,CAEA,OAAAM,EAAM,QAAQ,CAAC,CAAE,KAAAC,EAAM,MAAAE,CAAM,IAAM,CAC/BZ,EAAO,KAAK,CACR,KAAAU,EACA,MAAAE,EACA,OAAQ,WACR,KAAM,SACV,CAAC,CACL,CAAC,EAEM,CAAE,OAAAZ,EAAQ,WAAYA,EAAO,SAAW,CAAE,CACrD,EAkBaa,EAAgBd,GAA+B,CACxD,IAAMe,EAAchB,EAAkBC,CAAG,EACnCgB,EAAgBP,EAAoBT,CAAG,EAE7C,MAAO,CACH,OAAQ,CAAC,GAAGe,EAAY,OAAQ,GAAGC,EAAc,MAAM,EAAE,KAAK,CAACC,EAAGC,IAAMD,EAAE,MAAQC,EAAE,KAAK,EACzF,WAAYH,EAAY,YAAcC,EAAc,UACxD,CACJ,EAwCaG,EAAuBC,GAAmC,CACnE,IAAMC,EAAoC,CAAC,EACrCC,EAAQF,EAAK,MAAM;AAAA,CAAI,EACzBG,EAAgB,EAEpB,OAAAD,EAAM,QAAQ,CAACE,EAAMC,IAAc,CAC/B,GAAID,EAAK,OAAS,GAAI,CAClB,IAAME,EAAgBZ,EAAaU,CAAI,EAClCE,EAAc,YACfA,EAAc,OAAO,QAASC,GAAU,CACpCN,EAAgB,KAAK,CACjB,cAAeE,EAAgBI,EAAM,MACrC,KAAMA,EAAM,KACZ,OAAQA,EAAM,OACd,KAAMA,EAAM,IAChB,CAAC,CACL,CAAC,CAET,CAEAJ,GAAiBC,EAAK,QAAUC,EAAYH,EAAM,OAAS,EAAI,EAAI,EACvE,CAAC,EAEMD,CACX,EAiBaO,EAAqB5B,GACvBD,EAAkBC,CAAG,EAAE,WAkBrB6B,GAAuB7B,GACzBS,EAAoBT,CAAG,EAAE,WAkBvBK,GAAcL,GAChBc,EAAad,CAAG,EAAE,WC7R7B,IAAM8B,EAAmB,KAcZC,EAAuBC,GACzBC,EAAS,sBAAsB,KAAKD,CAAI,EAI7CE,EAAkB,IAAI,KAAK,aAAa,OAAO,EAY/CC,EAAkBC,GACbF,EAAgB,OAAOE,CAAG,EAc/BC,EAAeC,IACiC,CAC9C,EAAK,SACL,EAAK,SACL,IAAK,SACL,EAAG,SACH,EAAG,SACH,EAAG,SACH,EAAG,QACP,GACsBA,CAAI,GAAKA,EAc7BC,EAAkBC,GAA8B,CAClD,IAAMC,EAAoC,CACtC,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,GACT,EACMC,EAASF,EAAU,QAAQ,QAAS,EAAE,EACxCG,EAAS,GACb,QAAWL,KAAQI,EACfC,GAAUF,EAAOH,CAAI,EAEzB,IAAMM,EAAS,SAASD,EAAQ,EAAE,EAClC,OAAO,MAAMC,CAAM,EAAI,EAAIA,CAC/B,EAyBMC,EAAqBC,GAAsB,CAC7C,IAAMC,EAAyBD,EAC1B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,oBAAoB,GAAK,CAAC,CAAC,EAE/DgB,EAA8BH,EAC/B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,yBAAyB,GAAK,CAAC,CAAC,EAEpEiB,EAA8BJ,EAC/B,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,4BAA4B,GAAK,CAAC,CAAC,EAEvEkB,EAAmCL,EACpC,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,iCAAiC,GAAK,CAAC,CAAC
,EAE5EmB,EAAuBH,EAA4B,IAAKI,GAC1DA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEMgB,EAA2BH,EAAiC,IAAKE,GACnEA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEA,MAAO,CACH,eAAgB,CAAC,GAAGS,EAAwB,GAAGK,CAAoB,EACnE,mBAAoB,CAAC,GAAGF,EAA6B,GAAGI,CAAwB,EAChF,kBAAmBL,EACnB,uBAAwBE,CAC5B,CACJ,EAgBMI,EAAkB,CAACT,EAAmBU,IAAqD,CAE7F,GAD2BV,EAAM,KAAMW,GAAS1B,EAAoB0B,EAAK,IAAI,CAAC,EACtD,MAAO,GAE/B,IAAMC,EAAU,IAAI,IAAIF,EAAW,cAAc,EAC3CG,EAAc,IAAI,IAAIH,EAAW,kBAAkB,EACzD,GAAIE,EAAQ,OAASC,EAAY,KAAM,MAAO,GAG9C,QAAWN,KAAOK,EACd,GAAI,CAACC,EAAY,IAAIN,CAAG,EACpB,MAAO,GAIf,MAAO,EACX,EAmBaO,GAAyCd,GAAoB,CACtE,IAAMe,EAAoBhB,EAAkBC,CAAK,EAEjD,GAAI,CAACS,EAAgBT,EAAOe,CAAiB,EACzC,OAAOf,EAIX,IAAMgB,EAAiBhB,EAAM,IAAKW,GAAS,CACvC,IAAIM,EAAcN,EAAK,KAEjBO,EAAW,gBACjB,OAAAD,EAAcA,EAAY,QAAQC,EAAWC,GAElCA,EAAM,QAAQ,WAAa3B,GAASD,EAAYC,CAAI,CAAC,CAC/D,EACM,CAAE,GAAGmB,EAAM,KAAMM,CAAY,CACxC,CAAC,EAGKG,EAAkBrB,EAAkBiB,CAAc,EAGlDK,EAAa,IAAI,IAAID,EAAgB,cAAc,EACnDE,EAAiB,IAAI,IAAIF,EAAgB,kBAAkB,EAE3DG,EAAiB,CAAC,GAAG,IAAI,IAAIH,EAAgB,cAAc,CAAC,EAC5DI,EAAqB,CAAC,GAAG,IAAI,IAAIJ,EAAgB,kBAAkB,CAAC,EAGpEK,EAAuBF,EAAe,OAAQhB,GAAQ,CAACe,EAAe,IAAIf,CAAG,CAAC,EAE9EmB,EAAsBF,EAAmB,OAAQjB,GAAQ,CAACc,EAAW,IAAId,CAAG,CAAC,EAG7EoB,EAAU,CAAC,GAAGN,EAAY,GAAGC,CAAc,EAE3CM,EAAmB,CAAE,OADTD,EAAQ,OAAS,EAAI,KAAK,IAAI,EAAG,GAAGA,EAAQ,IAAKpB,GAAQd,EAAec,CAAG,CAAC,CAAC,EAAI,GACrD,CAAE,EAGhD,OAAOS,EAAe,IAAKL,GAAS,CAChC,GAAI,CAACA,EAAK,KAAK,SAAS3B,CAAgB,EACpC,OAAO2B,EAEX,IAAIM,EAAcN,EAAK,KAEvB,OAAAM,EAAcA,EAAY,QAAQ,QAAS,IAAM,CAC7C,GAAIN,EAAK,WAAY,CACjB,IAAMkB,EAAeJ,EAAqB,MAAM,EAChD,GAAII,EAAc,OAAOA,CAC7B,KAAO,CAEH,IAAMA,EAAeH,EAAoB,MAAM,EAC/C,GAAIG,EAAc,OAAOA,CAC7B,CAGA,IAAMC,EAAS,IAAIzC,EAAeuC,EAAiB,KAAK,CAAC,IACzD,OAAAA,EAAiB,QACVE,CACX,CAAC,EAEM,CAAE,GAAGnB,EAAM,KAAMM,CAAY,CACxC,CAAC,CACL,EClPA,IAAMc,EAAmB,CACrBC,EACAC,EACA,CAAE,oBAAAC,EAAqB,YAAAC,CAAY,IACxB,CAEX,GAAIH,IAAkB,KAClB,MAAO,CAACC,CAAS,EAErB,GAAIA,IAAa,KACb,MAAO,CAACD,CAAa,EAIzB,GAAII,EAAoBJ,CAAa,IAAMI,EAAoBH,CAAQ,EACnE,MAAO,CAACD,CAAa,EAIzB,IAAMK,EAASC,EAAwBN,EAA
eC,CAAQ,EAC9D,GAAII,EAAQ,OAAOA,EAGnB,IAAME,EAAiBC,EAA0BR,EAAeC,CAAQ,EACxE,GAAIM,EAAgB,OAAOA,EAG3B,GAAIJ,EAAY,SAASH,CAAa,GAAKG,EAAY,SAASF,CAAQ,EAAG,CACvE,IAAMQ,EAAaN,EAAY,KAAMO,GAAWA,IAAWV,GAAiBU,IAAWT,CAAQ,EAC/F,OAAOQ,EAAa,CAACA,CAAU,EAAI,CAACT,CAAa,CACrD,CAGA,IAAMW,EAAqBP,EAAoBJ,CAAa,EACtDY,EAAgBR,EAAoBH,CAAQ,EAGlD,MAAO,CAFYY,EAAoBF,EAAoBC,CAAa,EAEnDV,EAAsBF,EAAgBC,CAAQ,CACvE,EAWMa,EAAwB,CAACC,EAAkBC,IAA8C,CAC3F,GAAID,EAAO,SAAW,EAClB,OAAOA,EAGX,IAAMV,EAAmB,CAAC,EAE1B,QAAWY,KAAgBF,EAAQ,CAC/B,GAAIV,EAAO,SAAW,EAAG,CACrBA,EAAO,KAAKY,CAAY,EACxB,QACJ,CAEA,IAAMC,EAAgBb,EAAO,GAAG,EAAE,EAGlC,GAAIc,EAA6BD,EAAeD,EAAcD,CAAuB,EAAG,CAEhFC,EAAa,OAASC,EAAc,SACpCb,EAAOA,EAAO,OAAS,CAAC,EAAIY,GAEhC,QACJ,CAGIG,EAAqBf,EAAQa,EAAeD,CAAY,GAI5DZ,EAAO,KAAKY,CAAY,CAC5B,CAEA,OAAOZ,CACX,EAYagB,EAAuB,CAACC,EAAsBC,EAAiBC,IAAoC,CAC5G,IAAMC,EAAiBC,EAAaJ,EAAcE,EAAQ,WAAW,EAC/DG,EAAYD,EAAaH,EAASC,EAAQ,WAAW,EAWrDI,EAReC,EACjBJ,EACAE,EACAH,EAAQ,YACRA,EAAQ,mBACZ,EAGkC,QAAQ,CAAC,CAACM,EAAUC,CAAG,IAAMhC,EAAiB+B,EAAUC,EAAKP,CAAO,CAAC,EAKvG,OAFoBV,EAAsBc,EAAcJ,EAAQ,uBAAuB,EAEpE,KAAK,GAAG,CAC/B,EAEaQ,GAAU,CACnBF,EACAG,EACA,CACI,wBAAAjB,EAA0B,GAC1B,oBAAAd,EAAsB,GACtB,YAAAC,CACJ,IAEOkB,EAAqBS,EAAUG,EAAY,CAAE,wBAAAjB,EAAyB,oBAAAd,EAAqB,YAAAC,CAAY,CAAC","names":["PATTERNS","normalizeArabicText","text","extractDigits","match","tokenizeText","preserveSymbols","processedText","symbol","symbolRegex","handleFootnoteFusion","result","previousToken","currentToken","prevIsStandalone","currHasEmbedded","currIsStandalone","prevHasEmbedded","prevDigits","currDigits","handleFootnoteSelection","tokenA","tokenB","aHasEmbedded","bHasEmbedded","handleStandaloneFootnotes","aIsFootnote","bIsFootnote","ALIGNMENT_SCORES","calculateLevenshteinDistance","textA","textB","lengthA","lengthB","shorter","longer","shortLen","longLen","previousRow","_","index","i","currentRow","j","substitutionCost","minCost","calculateSimilarity","maxLength","distance","areSimilarAfterNormalization","threshold","normalizedA","normalizeArabicText","normaliz
edB","calculateAlignmentScore","tokenA","tokenB","typoSymbols","similarityThreshold","isTypoSymbol","isHighlySimilar","backtrackAlignment","matrix","tokensA","tokensB","alignment","alignTokenSequences","scoringMatrix","alignmentScore","diagonalScore","upScore","leftScore","bestScore","bestDirection","checkQuoteBalance","str","errors","quoteCount","lastQuoteIndex","i","isBalanced","BRACKETS","OPEN_BRACKETS","CLOSE_BRACKETS","checkBracketBalance","stack","char","lastOpen","index","checkBalance","quoteResult","bracketResult","a","b","getUnbalancedErrors","text","characterErrors","lines","absoluteIndex","line","lineIndex","balanceResult","error","areQuotesBalanced","areBracketsBalanced","INVALID_FOOTNOTE","hasInvalidFootnotes","text","PATTERNS","arabicFormatter","numberToArabic","num","ocrToArabic","char","arabicToNumber","arabicStr","lookup","digits","numStr","parsed","extractReferences","lines","arabicReferencesInBody","b","ocrConfusedReferencesInBody","arabicReferencesInFootnotes","ocrConfusedReferencesInFootnotes","convertedOcrBodyRefs","ref","convertedOcrFootnoteRefs","needsCorrection","references","line","bodySet","footnoteSet","correctReferences","initialReferences","sanitizedLines","updatedText","ocrRegex","match","cleanReferences","bodyRefSet","footnoteRefSet","uniqueBodyRefs","uniqueFootnoteRefs","bodyRefsForFootnotes","footnoteRefsForBody","allRefs","referenceCounter","availableRef","newRef","selectBestTokens","originalToken","altToken","similarityThreshold","typoSymbols","normalizeArabicText","result","handleFootnoteSelection","footnoteResult","handleStandaloneFootnotes","typoSymbol","symbol","normalizedOriginal","normalizedAlt","calculateSimilarity","removeDuplicateTokens","tokens","highSimilarityThreshold","currentToken","previousToken","areSimilarAfterNormalization","handleFootnoteFusion","processTextAlignment","originalText","altText","options","originalTokens","tokenizeText","altTokens","mergedTokens","alignTokenSequences","original","alt","fixTypo","co
rrection"]}
1
+ {"version":3,"sources":["../src/textUtils.ts","../src/similarity.ts","../src/alignment.ts","../src/balance.ts","../src/footnotes.ts","../src/noise.ts","../src/index.ts"],"sourcesContent":["/**\n * Collection of regex patterns used throughout the library for text processing\n */\nexport const PATTERNS = {\n /** Matches Arabic characters across all Unicode blocks */\n arabicCharacters: /[\\u0600-\\u06FF\\u0750-\\u077F\\u08A0-\\u08FF\\uFB50-\\uFDFF\\uFE70-\\uFEFF]/,\n\n /** Matches Arabic-Indic digits (ู -ูฉ) and Western digits (0-9) */\n arabicDigits: /[0-9\\u0660-\\u0669]+/,\n\n /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\\([\\u0660-\\u0669]+\\) */\n arabicFootnoteReferenceRegex: /^\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ู -ูฉ) */\n arabicLettersAndDigits: /[0-9\\u0621-\\u063A\\u0641-\\u064A\\u0660-\\u0669]+/g,\n\n /** Matches Arabic punctuation marks and whitespace characters */\n arabicPunctuationAndWhitespace: /[\\s\\u060C\\u061B\\u061F\\u06D4]+/,\n\n /** Matches footnote references with Arabic-Indic digits in parentheses: \\([\\u0660-\\u0669]+\\) */\n arabicReferenceRegex: /\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic diacritical marks (harakat, tanween, etc.) 
*/\n diacritics: /[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]/g,\n\n /** Matches embedded footnotes within text: \\([0-9\\u0660-\\u0669]+\\) */\n footnoteEmbedded: /\\([0-9\\u0660-\\u0669]+\\)/,\n\n /** Matches standalone footnote markers at line start/end: ^\\(?[0-9\\u0660-\\u0669]+\\)?[ุŒ.]?$ */\n footnoteStandalone: /^\\(?[0-9\\u0660-\\u0669]+\\)?[ุŒ.]?$/,\n\n /** Matches invalid/problematic footnote references: empty \"()\" or OCR-confused endings */\n invalidReferenceRegex: /\\(\\)|\\([.1OV9]+\\)/g, // Combined pattern for detecting any invalid/problematic references\n\n /** Matches OCR-confused footnote references at line start with characters like .1OV9 */\n ocrConfusedFootnoteReferenceRegex: /^\\([.1OV9]+\\)/g,\n\n /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */\n ocrConfusedReferenceRegex: /\\([.1OV9]+\\)/g,\n\n /** Matches Arabic tatweel (kashida) character used for text stretching */\n tatweel: /\\u0640/g,\n\n /** Matches one or more whitespace characters */\n whitespace: /\\s+/,\n};\n\n/**\n * Normalizes Arabic text by removing diacritics, and tatweel marks.\n * This normalization enables better text comparison by focusing on core characters\n * while ignoring decorative elements that don't affect meaning.\n *\n * @param text - Arabic text to normalize\n * @returns Normalized text with diacritics, tatweel, and basic tags removed\n * @example\n * normalizeArabicText('ุงูŽู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’') // Returns 'ุงู„ุณู„ุงู… ุนู„ูŠูƒู…'\n */\nexport const normalizeArabicText = (text: string): string => {\n return text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();\n};\n\n/**\n * Extracts the first sequence of Arabic or Western digits from text.\n * Used primarily for footnote number comparison to match related footnote elements.\n *\n * @param text - Text containing digits to extract\n * @returns First digit sequence found, or empty string if none found\n * 
@example\n * extractDigits('(ูฅ)ุฃุฎุฑุฌู‡ ุงู„ุจุฎุงุฑูŠ') // Returns 'ูฅ'\n * extractDigits('See note (123)') // Returns '123'\n */\nexport const extractDigits = (text: string): string => {\n const match = text.match(PATTERNS.arabicDigits);\n return match ? match[0] : '';\n};\n\n/**\n * Tokenizes text into individual words while preserving special symbols.\n * Removes HTML tags, adds spacing around preserved symbols to ensure they\n * are tokenized separately, then splits on whitespace.\n *\n * @param text - Text to tokenize\n * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens\n * @returns Array of tokens, or empty array if input is empty/whitespace\n * @example\n * tokenizeText('Hello ๏ทบ world', ['๏ทบ']) // Returns ['Hello', '๏ทบ', 'world']\n */\nexport const tokenizeText = (text: string, preserveSymbols: string[] = []): string[] => {\n let processedText = text;\n\n // Add spaces around each preserve symbol to ensure they're tokenized separately\n for (const symbol of preserveSymbols) {\n const symbolRegex = new RegExp(symbol, 'g');\n processedText = processedText.replace(symbolRegex, ` ${symbol} `);\n }\n\n return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);\n};\n\n/**\n * Handles fusion of standalone and embedded footnotes during token processing.\n * Detects patterns where standalone footnotes should be merged with embedded ones\n * or where trailing standalone footnotes should be skipped.\n *\n * @param result - Current result array being built\n * @param previousToken - The previous token in the sequence\n * @param currentToken - The current token being processed\n * @returns True if the current token was handled (fused or skipped), false otherwise\n * @example\n * // (ูฅ) + (ูฅ)ุฃุฎุฑุฌู‡ โ†’ result gets (ูฅ)ุฃุฎุฑุฌู‡\n * // (ูฅ)ุฃุฎุฑุฌู‡ + (ูฅ) โ†’ (ูฅ) is skipped\n */\nexport const handleFootnoteFusion = (result: string[], previousToken: string, currentToken: string): boolean => {\n const 
prevIsStandalone = PATTERNS.footnoteStandalone.test(previousToken);\n const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);\n const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);\n const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);\n\n const prevDigits = extractDigits(previousToken);\n const currDigits = extractDigits(currentToken);\n\n // Replace standalone with fused version: (ูฅ) + (ูฅ)ุฃุฎุฑุฌู‡ โ†’ (ูฅ)ุฃุฎุฑุฌู‡\n if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {\n result[result.length - 1] = currentToken;\n return true;\n }\n\n // Skip trailing standalone: (ูฅ)ุฃุฎุฑุฌู‡ + (ูฅ) โ†’ (ูฅ)ุฃุฎุฑุฌู‡\n if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) {\n return true;\n }\n\n return false;\n};\n\n/**\n * Handles selection logic for tokens with embedded footnotes during alignment.\n * Prefers tokens that contain embedded footnotes over plain text, and among\n * tokens with embedded footnotes, prefers the shorter one.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleFootnoteSelection('text', '(ูก)text') // Returns ['(ูก)text']\n * handleFootnoteSelection('(ูก)longtext', '(ูก)text') // Returns ['(ูก)text']\n */\nexport const handleFootnoteSelection = (tokenA: string, tokenB: string): null | string[] => {\n const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);\n const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);\n\n if (aHasEmbedded && !bHasEmbedded) return [tokenA];\n if (bHasEmbedded && !aHasEmbedded) return [tokenB];\n if (aHasEmbedded && bHasEmbedded) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n\n/**\n * Handles selection logic for standalone footnote tokens during alignment.\n * Manages cases where one or both tokens are standalone footnotes, preserving\n * both tokens when one is a footnote and the other is regular text.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleStandaloneFootnotes('(ูก)', 'text') // Returns ['(ูก)', 'text']\n * handleStandaloneFootnotes('(ูก)', '(ูข)') // Returns ['(ูก)'] (shorter one)\n */\nexport const handleStandaloneFootnotes = (tokenA: string, tokenB: string): null | string[] => {\n const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);\n const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);\n\n if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];\n if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];\n if (aIsFootnote && bIsFootnote) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n","import { normalizeArabicText } from './textUtils';\n\n// Alignment scoring constants\nconst ALIGNMENT_SCORES = {\n GAP_PENALTY: -1,\n MISMATCH_PENALTY: -2,\n PERFECT_MATCH: 2,\n SOFT_MATCH: 1,\n};\n\n/**\n * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.\n * The Levenshtein distance is the minimum number of single-character edits (insertions,\n * deletions, or substitutions) required to change one string into another.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Minimum edit distance between the two strings\n * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths\n * @example\n * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3\n * calculateLevenshteinDistance('', 'hello') // Returns 5\n */\nexport const calculateLevenshteinDistance = (textA: string, textB: string): number => {\n const lengthA = textA.length;\n const lengthB = textB.length;\n\n if (lengthA === 0) {\n return lengthB;\n }\n\n if (lengthB === 0) {\n return lengthA;\n }\n\n // Use shorter string for the array to optimize space\n const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];\n const shortLen = shorter.length;\n const longLen = longer.length;\n\n let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);\n\n for (let i = 1; i <= longLen; i++) {\n const currentRow = [i];\n\n for (let j = 1; j <= shortLen; j++) {\n const substitutionCost = longer[i - 1] === shorter[j - 1] ? 
0 : 1;\n const minCost = Math.min(\n previousRow[j] + 1, // deletion\n currentRow[j - 1] + 1, // insertion\n previousRow[j - 1] + substitutionCost, // substitution\n );\n currentRow.push(minCost);\n }\n\n previousRow = currentRow;\n }\n\n return previousRow[shortLen];\n};\n\n/**\n * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.\n * Uses Levenshtein distance normalized by the length of the longer string.\n * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)\n * @example\n * calculateSimilarity('hello', 'hello') // Returns 1.0\n * calculateSimilarity('hello', 'help') // Returns 0.6\n */\nexport const calculateSimilarity = (textA: string, textB: string): number => {\n const maxLength = Math.max(textA.length, textB.length) || 1;\n const distance = calculateLevenshteinDistance(textA, textB);\n return (maxLength - distance) / maxLength;\n};\n\n/**\n * Checks if two texts are similar after Arabic normalization.\n * Normalizes both texts by removing diacritics and decorative elements,\n * then compares their similarity against the provided threshold.\n *\n * @param textA - First text to compare\n * @param textB - Second text to compare\n * @param threshold - Similarity threshold (0.0 to 1.0)\n * @returns True if normalized texts meet the similarity threshold\n * @example\n * areSimilarAfterNormalization('ุงู„ุณูŽู‘ู„ุงู…', 'ุงู„ุณู„ุงู…', 0.9) // Returns true\n */\nexport const areSimilarAfterNormalization = (textA: string, textB: string, threshold: number = 0.6): boolean => {\n const normalizedA = normalizeArabicText(textA);\n const normalizedB = normalizeArabicText(textB);\n return calculateSimilarity(normalizedA, normalizedB) >= threshold;\n};\n\n/**\n * Calculates alignment score for two tokens in sequence alignment.\n * Uses 
different scoring criteria: perfect match after normalization gets highest score,\n * typo symbols or highly similar tokens get soft match score, mismatches get penalty.\n *\n * @param tokenA - First token to score\n * @param tokenB - Second token to score\n * @param typoSymbols - Array of special symbols that get preferential treatment\n * @param similarityThreshold - Threshold for considering tokens highly similar\n * @returns Alignment score (higher is better match)\n * @example\n * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)\n * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity\n */\nexport const calculateAlignmentScore = (\n tokenA: string,\n tokenB: string,\n typoSymbols: string[],\n similarityThreshold: number,\n): number => {\n const normalizedA = normalizeArabicText(tokenA);\n const normalizedB = normalizeArabicText(tokenB);\n\n // Perfect match after normalization\n if (normalizedA === normalizedB) {\n return ALIGNMENT_SCORES.PERFECT_MATCH;\n }\n\n // Check if either token is a typo symbol or high similarity\n const isTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);\n const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;\n\n if (isTypoSymbol || isHighlySimilar) {\n return ALIGNMENT_SCORES.SOFT_MATCH;\n }\n\n return ALIGNMENT_SCORES.MISMATCH_PENALTY;\n};\n\ntype AlignedTokenPair = [null | string, null | string];\n\ntype AlignmentCell = {\n direction: 'diagonal' | 'left' | 'up' | null;\n score: number;\n};\n\n/**\n * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.\n * Follows the directional indicators in the matrix to build the sequence of aligned\n * token pairs from the Needleman-Wunsch algorithm.\n *\n * @param matrix - Scoring matrix with directional information from alignment\n * @param tokensA - First sequence of tokens\n * @param tokensB - Second sequence of tokens\n * @returns 
Array of aligned token pairs, where null indicates a gap\n * @throws Error if invalid alignment direction is encountered\n */\nexport const backtrackAlignment = (\n matrix: AlignmentCell[][],\n tokensA: string[],\n tokensB: string[],\n): AlignedTokenPair[] => {\n const alignment: AlignedTokenPair[] = [];\n let i = tokensA.length;\n let j = tokensB.length;\n\n while (i > 0 || j > 0) {\n const currentCell = matrix[i][j];\n\n switch (currentCell.direction) {\n case 'diagonal':\n alignment.push([tokensA[--i], tokensB[--j]]);\n break;\n case 'left':\n alignment.push([null, tokensB[--j]]);\n break;\n case 'up':\n alignment.push([tokensA[--i], null]);\n break;\n default:\n throw new Error('Invalid alignment direction');\n }\n }\n\n return alignment.reverse();\n};\n\n/**\n * Performs global sequence alignment using the Needleman-Wunsch algorithm.\n * Aligns two token sequences to find the optimal pairing that maximizes\n * the total alignment score, handling insertions, deletions, and substitutions.\n *\n * @param tokensA - First sequence of tokens to align\n * @param tokensB - Second sequence of tokens to align\n * @param typoSymbols - Special symbols that affect scoring\n * @param similarityThreshold - Threshold for high similarity scoring\n * @returns Array of aligned token pairs, with null indicating gaps\n * @example\n * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)\n * // Returns [['a', 'a'], ['b', 'c']]\n */\nexport const alignTokenSequences = (\n tokensA: string[],\n tokensB: string[],\n typoSymbols: string[],\n similarityThreshold: number,\n): AlignedTokenPair[] => {\n const lengthA = tokensA.length;\n const lengthB = tokensB.length;\n\n // Initialize scoring matrix\n const scoringMatrix: AlignmentCell[][] = Array.from({ length: lengthA + 1 }, () =>\n Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),\n );\n\n // Initialize first row and column\n for (let i = 1; i <= lengthA; i++) {\n scoringMatrix[i][0] = { direction: 'up', score: 
i * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n for (let j = 1; j <= lengthB; j++) {\n scoringMatrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n\n // Fill scoring matrix\n for (let i = 1; i <= lengthA; i++) {\n for (let j = 1; j <= lengthB; j++) {\n const alignmentScore = calculateAlignmentScore(\n tokensA[i - 1],\n tokensB[j - 1],\n typoSymbols,\n similarityThreshold,\n );\n\n const diagonalScore = scoringMatrix[i - 1][j - 1].score + alignmentScore;\n const upScore = scoringMatrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;\n const leftScore = scoringMatrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;\n\n const bestScore = Math.max(diagonalScore, upScore, leftScore);\n let bestDirection: 'diagonal' | 'left' | 'up' = 'left';\n\n if (bestScore === diagonalScore) {\n bestDirection = 'diagonal';\n } else if (bestScore === upScore) {\n bestDirection = 'up';\n }\n\n scoringMatrix[i][j] = { direction: bestDirection, score: bestScore };\n }\n }\n\n // Backtrack to build alignment\n return backtrackAlignment(scoringMatrix, tokensA, tokensB);\n};\n","import { areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport { normalizeArabicText } from './textUtils';\n\n/**\n * Aligns split text segments to match target lines by finding the best order.\n *\n * This function handles cases where text lines have been split into segments\n * and need to be merged back together in the correct order. 
It compares\n * different arrangements of the segments against target lines to find the\n * best match based on similarity scores.\n *\n * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment\n * @param segmentLines - Array of text segments that may represent split versions of target lines.\n * @returns Array of aligned text lines\n */\nexport const alignTextSegments = (targetLines: string[], segmentLines: string[]) => {\n const alignedLines: string[] = [];\n let segmentIndex = 0;\n\n for (const targetLine of targetLines) {\n if (segmentIndex >= segmentLines.length) {\n break;\n }\n\n if (targetLine) {\n // Process line that needs alignment\n const { result, segmentsConsumed } = processAlignmentTarget(targetLine, segmentLines, segmentIndex);\n\n if (result) {\n alignedLines.push(result);\n }\n segmentIndex += segmentsConsumed;\n } else {\n // For lines that don't need alignment, use one-to-one correspondence\n alignedLines.push(segmentLines[segmentIndex]);\n segmentIndex++;\n }\n }\n\n // Add any remaining segments that were not processed\n if (segmentIndex < segmentLines.length) {\n alignedLines.push(...segmentLines.slice(segmentIndex));\n }\n\n return alignedLines;\n};\n\n/**\n * Tries to merge two segments in both possible orders and returns the best match.\n */\nconst findBestSegmentMerge = (targetLine: string, partA: string, partB: string) => {\n const mergedForward = `${partA} ${partB}`;\n const mergedReversed = `${partB} ${partA}`;\n\n const normalizedTarget = normalizeArabicText(targetLine);\n const scoreForward = calculateSimilarity(normalizedTarget, normalizeArabicText(mergedForward));\n const scoreReversed = calculateSimilarity(normalizedTarget, normalizeArabicText(mergedReversed));\n\n return scoreForward >= scoreReversed ? 
mergedForward : mergedReversed;\n};\n\n/**\n * Processes a single target line that needs alignment.\n */\nconst processAlignmentTarget = (targetLine: string, segmentLines: string[], segmentIndex: number) => {\n const currentSegment = segmentLines[segmentIndex];\n\n // First, check if the current segment is already a good match\n if (areSimilarAfterNormalization(targetLine, currentSegment)) {\n return { result: currentSegment, segmentsConsumed: 1 };\n }\n\n // If not a direct match, try to merge two segments\n const partA = segmentLines[segmentIndex];\n const partB = segmentLines[segmentIndex + 1];\n\n // Ensure we have two parts to merge\n if (!partA || !partB) {\n return partA ? { result: partA, segmentsConsumed: 1 } : { result: '', segmentsConsumed: 0 };\n }\n\n const bestMerge = findBestSegmentMerge(targetLine, partA, partB);\n return { result: bestMerge, segmentsConsumed: 2 };\n};\n","/**\n * Represents an error found when checking balance of quotes or brackets in text.\n */\ntype BalanceError = {\n /** The character that caused the error */\n char: string;\n /** The position of the character in the string */\n index: number;\n /** The reason for the error */\n reason: 'mismatched' | 'unclosed' | 'unmatched';\n /** The type of character that caused the error */\n type: 'bracket' | 'quote';\n};\n\n/**\n * Result of a balance check operation.\n */\ntype BalanceResult = {\n /** Array of errors found during balance checking */\n errors: BalanceError[];\n /** Whether the text is properly balanced */\n isBalanced: boolean;\n};\n\n/**\n * Checks if all double quotes in a string are balanced and returns detailed error information.\n *\n * A string has balanced quotes when every opening quote has a corresponding closing quote.\n * This function counts all quote characters and determines if there's an even number of them.\n * If there's an odd number, the last quote is marked as unmatched.\n *\n * @param str - The string to check for quote balance\n * @returns An object 
containing balance status and any errors found\n *\n * @example\n * ```typescript\n * checkQuoteBalance('Hello \"world\"') // { errors: [], isBalanced: true }\n * checkQuoteBalance('Hello \"world') // { errors: [{ char: '\"', index: 6, reason: 'unmatched', type: 'quote' }], isBalanced: false }\n * ```\n */\nconst checkQuoteBalance = (str: string): BalanceResult => {\n const errors: BalanceError[] = [];\n let quoteCount = 0;\n let lastQuoteIndex = -1;\n\n for (let i = 0; i < str.length; i++) {\n if (str[i] === '\"') {\n quoteCount++;\n lastQuoteIndex = i;\n }\n }\n\n const isBalanced = quoteCount % 2 === 0;\n\n if (!isBalanced && lastQuoteIndex !== -1) {\n errors.push({\n char: '\"',\n index: lastQuoteIndex,\n reason: 'unmatched',\n type: 'quote',\n });\n }\n\n return { errors, isBalanced };\n};\n\n/** Mapping of opening brackets to their corresponding closing brackets */\nexport const BRACKETS = { 'ยซ': 'ยป', '(': ')', '[': ']', '{': '}' };\n\n/** Set of all opening bracket characters */\nexport const OPEN_BRACKETS = new Set(['ยซ', '(', '[', '{']);\n\n/** Set of all closing bracket characters */\nexport const CLOSE_BRACKETS = new Set(['ยป', ')', ']', '}']);\n\n/**\n * Checks if all brackets in a string are properly balanced and returns detailed error information.\n *\n * A string has balanced brackets when:\n * - Every opening bracket has a corresponding closing bracket\n * - Brackets are properly nested (no crossing pairs)\n * - Each closing bracket matches the most recent unmatched opening bracket\n *\n * Supports the following bracket pairs: (), [], {}, ยซยป\n *\n * @param str - The string to check for bracket balance\n * @returns An object containing balance status and any errors found\n *\n * @example\n * ```typescript\n * checkBracketBalance('(hello [world])') // { errors: [], isBalanced: true }\n * checkBracketBalance('(hello [world)') // { errors: [{ char: '[', index: 7, reason: 'unclosed', type: 'bracket' }], isBalanced: false }\n * 
checkBracketBalance('(hello ]world[') // { errors: [...], isBalanced: false }\n * ```\n */\nconst checkBracketBalance = (str: string): BalanceResult => {\n const errors: BalanceError[] = [];\n const stack: Array<{ char: string; index: number }> = [];\n\n for (let i = 0; i < str.length; i++) {\n const char = str[i];\n\n if (OPEN_BRACKETS.has(char)) {\n stack.push({ char, index: i });\n } else if (CLOSE_BRACKETS.has(char)) {\n const lastOpen = stack.pop();\n\n if (!lastOpen) {\n errors.push({\n char,\n index: i,\n reason: 'unmatched',\n type: 'bracket',\n });\n } else if (BRACKETS[lastOpen.char as keyof typeof BRACKETS] !== char) {\n errors.push({\n char: lastOpen.char,\n index: lastOpen.index,\n reason: 'mismatched',\n type: 'bracket',\n });\n errors.push({\n char,\n index: i,\n reason: 'mismatched',\n type: 'bracket',\n });\n }\n }\n }\n\n stack.forEach(({ char, index }) => {\n errors.push({\n char,\n index,\n reason: 'unclosed',\n type: 'bracket',\n });\n });\n\n return { errors, isBalanced: errors.length === 0 };\n};\n\n/**\n * Checks if both quotes and brackets are balanced in a string and returns detailed error information.\n *\n * This function combines the results of both quote and bracket balance checking,\n * providing a comprehensive analysis of all balance issues in the text.\n * The errors are sorted by their position in the string for easier debugging.\n *\n * @param str - The string to check for overall balance\n * @returns An object containing combined balance status and all errors found, sorted by position\n *\n * @example\n * ```typescript\n * checkBalance('Hello \"world\" and (test)') // { errors: [], isBalanced: true }\n * checkBalance('Hello \"world and (test') // { errors: [...], isBalanced: false }\n * ```\n */\nexport const checkBalance = (str: string): BalanceResult => {\n const quoteResult = checkQuoteBalance(str);\n const bracketResult = checkBracketBalance(str);\n\n return {\n errors: [...quoteResult.errors, 
...bracketResult.errors].sort((a, b) => a.index - b.index),\n isBalanced: quoteResult.isBalanced && bracketResult.isBalanced,\n };\n};\n\n/**\n * Enhanced error detection that returns absolute character positions for use with HighlightableTextarea.\n *\n * This interface extends the basic BalanceError to include absolute positioning\n * across multiple lines of text, making it suitable for text editors and\n * syntax highlighters that need precise character positioning.\n */\nexport interface CharacterError {\n /** Absolute character position from the start of the entire text */\n absoluteIndex: number;\n /** The character that caused the error */\n char: string;\n /** The reason for the error */\n reason: 'mismatched' | 'unclosed' | 'unmatched';\n /** The type of character that caused the error */\n type: 'bracket' | 'quote';\n}\n\n/**\n * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.\n *\n * This function processes text line by line, but only checks lines longer than 10 characters\n * for balance issues. 
It returns absolute positions that can be used with text editors\n * or highlighting components that need precise character positioning across the entire text.\n *\n * The absolute index accounts for newline characters between lines, providing accurate\n * positioning for the original text string.\n *\n * @param text - The multi-line text to analyze for balance errors\n * @returns Array of character errors with absolute positioning information\n *\n * @example\n * ```typescript\n * const text = 'Line 1 with \"quote\\nLine 2 with (bracket';\n * const errors = getUnbalancedErrors(text);\n * // Returns errors with absoluteIndex pointing to exact character positions\n * ```\n */\nexport const getUnbalancedErrors = (text: string): CharacterError[] => {\n const characterErrors: CharacterError[] = [];\n const lines = text.split('\\n');\n let absoluteIndex = 0;\n\n lines.forEach((line, lineIndex) => {\n if (line.length > 10) {\n const balanceResult = checkBalance(line);\n if (!balanceResult.isBalanced) {\n balanceResult.errors.forEach((error) => {\n characterErrors.push({\n absoluteIndex: absoluteIndex + error.index,\n char: error.char,\n reason: error.reason,\n type: error.type,\n });\n });\n }\n }\n // Add 1 for the newline character (except for the last line)\n absoluteIndex += line.length + (lineIndex < lines.length - 1 ? 
1 : 0);\n });\n\n return characterErrors;\n};\n\n/**\n * Checks if all double quotes in a string are balanced.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for quote balance\n * @returns True if quotes are balanced, false otherwise\n *\n * @example\n * ```typescript\n * areQuotesBalanced('Hello \"world\"') // true\n * areQuotesBalanced('Hello \"world') // false\n * ```\n */\nexport const areQuotesBalanced = (str: string): boolean => {\n return checkQuoteBalance(str).isBalanced;\n};\n\n/**\n * Checks if all brackets in a string are properly balanced.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for bracket balance\n * @returns True if brackets are balanced, false otherwise\n *\n * @example\n * ```typescript\n * areBracketsBalanced('(hello [world])') // true\n * areBracketsBalanced('(hello [world') // false\n * ```\n */\nexport const areBracketsBalanced = (str: string): boolean => {\n return checkBracketBalance(str).isBalanced;\n};\n\n/**\n * Checks if both quotes and brackets are balanced in a string.\n *\n * This is a convenience function that returns only the boolean result\n * without detailed error information.\n *\n * @param str - The string to check for overall balance\n * @returns True if both quotes and brackets are balanced, false otherwise\n *\n * @example\n * ```typescript\n * isBalanced('Hello \"world\" and (test)') // true\n * isBalanced('Hello \"world and (test') // false\n * ```\n */\nexport const isBalanced = (str: string): boolean => {\n return checkBalance(str).isBalanced;\n};\n","import { PATTERNS } from './textUtils';\n\nconst INVALID_FOOTNOTE = '()';\n\n/**\n * Checks if the given text contains invalid footnote references.\n * Invalid footnotes include empty parentheses \"()\" or OCR-confused characters\n * like \".1OV9\" 
that were misrecognized instead of Arabic numerals.\n *\n * @param text - Text to check for invalid footnote patterns\n * @returns True if text contains invalid footnote references, false otherwise\n * @example\n * hasInvalidFootnotes('This text has ()') // Returns true\n * hasInvalidFootnotes('This text has (ูก)') // Returns false\n * hasInvalidFootnotes('OCR mistake (O)') // Returns true\n */\nexport const hasInvalidFootnotes = (text: string): boolean => {\n return PATTERNS.invalidReferenceRegex.test(text);\n};\n\n// Arabic number formatter instance\nconst arabicFormatter = new Intl.NumberFormat('ar-SA');\n\n/**\n * Converts a number to Arabic-Indic numerals using the Intl.NumberFormat API.\n * Uses the 'ar-SA' locale to ensure proper Arabic numeral formatting.\n *\n * @param num - The number to convert to Arabic numerals\n * @returns String representation using Arabic-Indic digits (ู -ูฉ)\n * @example\n * numberToArabic(123) // Returns 'ูกูขูฃ'\n * numberToArabic(5) // Returns 'ูฅ'\n */\nconst numberToArabic = (num: number): string => {\n return arabicFormatter.format(num);\n};\n\n/**\n * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.\n * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits.\n *\n * @param char - Single character that may be an OCR mistake\n * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists\n * @example\n * ocrToArabic('O') // Returns 'ูฅ' (O often confused with ูฅ)\n * ocrToArabic('1') // Returns 'ูก' (1 often confused with ูก)\n * ocrToArabic('.') // Returns 'ู ' (dot often confused with ู )\n */\nconst ocrToArabic = (char: string): string => {\n const ocrToArabicMap: { [key: string]: string } = {\n '1': 'ูก',\n '9': 'ูฉ',\n '.': 'ู ',\n O: 'ูฅ',\n o: 'ูฅ',\n V: 'ูง',\n v: 'ูง',\n };\n return ocrToArabicMap[char] || char;\n};\n\n/**\n * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number.\n * Removes 
parentheses and converts each Arabic-Indic digit to its Western equivalent.\n *\n * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(ูกูขูฃ)'\n * @returns Parsed number, or 0 if parsing fails\n * @example\n * arabicToNumber('(ูกูขูฃ)') // Returns 123\n * arabicToNumber('(ูฅ)') // Returns 5\n * arabicToNumber('invalid') // Returns 0\n */\nconst arabicToNumber = (arabicStr: string): number => {\n const lookup: { [key: string]: string } = {\n 'ู ': '0',\n 'ูก': '1',\n 'ูข': '2',\n 'ูฃ': '3',\n 'ูค': '4',\n 'ูฅ': '5',\n 'ูฆ': '6',\n 'ูง': '7',\n 'ูจ': '8',\n 'ูฉ': '9',\n };\n const digits = arabicStr.replace(/[()]/g, '');\n let numStr = '';\n for (const char of digits) {\n numStr += lookup[char];\n }\n const parsed = parseInt(numStr, 10);\n return isNaN(parsed) ? 0 : parsed;\n};\n\ntype TextLine = {\n isFootnote?: boolean;\n text: string;\n};\n\n/**\n * Extracts all footnote references from text lines, categorizing them by type and location.\n * Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes.\n *\n * @param lines - Array of text line objects with optional isFootnote flag\n * @returns Object containing categorized reference arrays:\n * - bodyReferences: All valid references found in body text\n * - footnoteReferences: All valid references found in footnotes\n * - ocrConfusedInBody: OCR-confused references in body text (for tracking)\n * - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)\n * @example\n * const lines = [\n * { text: 'Body with (ูก) and (O)', isFootnote: false },\n * { text: '(ูก) Footnote text', isFootnote: true }\n * ];\n * const refs = extractReferences(lines);\n * // refs.bodyReferences contains ['(ูก)', '(ูฅ)'] - OCR 'O' converted to 'ูฅ'\n */\nconst extractReferences = (lines: TextLine[]) => {\n const arabicReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []);\n\n const 
ocrConfusedReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []);\n\n const arabicReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []);\n\n const ocrConfusedReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []);\n\n const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n return {\n bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs],\n footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs],\n ocrConfusedInBody: ocrConfusedReferencesInBody,\n ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes,\n };\n};\n\n/**\n * Determines if footnote reference correction is needed by checking for:\n * 1. Invalid footnote patterns (empty parentheses, OCR mistakes)\n * 2. Mismatched sets of references between body text and footnotes\n * 3. 
Different counts of references in body vs footnotes\n *\n * @param lines - Array of text line objects to analyze\n * @param references - Extracted reference data from extractReferences()\n * @returns True if correction is needed, false if references are already correct\n * @example\n * const lines = [{ text: 'Text with ()', isFootnote: false }];\n * const refs = extractReferences(lines);\n * needsCorrection(lines, refs) // Returns true due to invalid \"()\" reference\n */\nconst needsCorrection = (lines: TextLine[], references: ReturnType<typeof extractReferences>) => {\n const mistakenReferences = lines.some((line) => hasInvalidFootnotes(line.text));\n if (mistakenReferences) return true;\n\n const bodySet = new Set(references.bodyReferences);\n const footnoteSet = new Set(references.footnoteReferences);\n if (bodySet.size !== footnoteSet.size) return true;\n\n // Check if the sets contain the same elements\n for (const ref of bodySet) {\n if (!footnoteSet.has(ref)) {\n return true;\n }\n }\n\n return false;\n};\n\n/**\n * Corrects footnote references in an array of text lines by:\n * 1. Converting OCR-confused characters to proper Arabic numerals\n * 2. Filling in empty \"()\" references with appropriate numbers\n * 3. Ensuring footnote references in body text match those in footnotes\n * 4. 
Generating new reference numbers when needed\n *\n * @param lines - Array of text line objects, each with optional isFootnote flag\n * @returns Array of corrected text lines with proper footnote references\n * @example\n * const lines = [\n * { text: 'Main text with ()', isFootnote: false },\n * { text: '() This is a footnote', isFootnote: true }\n * ];\n * const corrected = correctReferences(lines);\n * // Returns lines with \"()\" replaced by proper Arabic numerals like \"(ูก)\"\n */\nexport const correctReferences = <T extends TextLine>(lines: T[]): T[] => {\n const initialReferences = extractReferences(lines);\n\n if (!needsCorrection(lines, initialReferences)) {\n return lines;\n }\n\n // Pass 1: Sanitize lines by correcting only OCR characters inside reference markers.\n const sanitizedLines = lines.map((line) => {\n let updatedText = line.text;\n // This regex finds the full reference, e.g., \"(O)\" or \"(1)\"\n const ocrRegex = /\\([.1OV9]+\\)/g;\n updatedText = updatedText.replace(ocrRegex, (match) => {\n // This replace acts *inside* the found match, e.g., on \"O\" or \"1\"\n return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));\n });\n return { ...line, text: updatedText };\n });\n\n // Pass 2: Analyze the sanitized lines to get a clear and accurate picture of references.\n const cleanReferences = extractReferences(sanitizedLines);\n\n // Step 3: Create queues of \"unmatched\" references for two-way pairing.\n const bodyRefSet = new Set(cleanReferences.bodyReferences);\n const footnoteRefSet = new Set(cleanReferences.footnoteReferences);\n\n const uniqueBodyRefs = [...new Set(cleanReferences.bodyReferences)];\n const uniqueFootnoteRefs = [...new Set(cleanReferences.footnoteReferences)];\n\n // Queue 1: Body references available for footnotes.\n const bodyRefsForFootnotes = uniqueBodyRefs.filter((ref) => !footnoteRefSet.has(ref));\n // Queue 2: Footnote references available for the body.\n const footnoteRefsForBody = 
uniqueFootnoteRefs.filter((ref) => !bodyRefSet.has(ref));\n\n // Step 4: Determine the starting point for any completely new reference numbers.\n const allRefs = [...bodyRefSet, ...footnoteRefSet];\n const maxRefNum = allRefs.length > 0 ? Math.max(0, ...allRefs.map((ref) => arabicToNumber(ref))) : 0;\n const referenceCounter = { count: maxRefNum + 1 };\n\n // Step 5: Map over the sanitized lines, filling in '()' using the queues.\n return sanitizedLines.map((line) => {\n if (!line.text.includes(INVALID_FOOTNOTE)) {\n return line;\n }\n let updatedText = line.text;\n\n updatedText = updatedText.replace(/\\(\\)/g, () => {\n if (line.isFootnote) {\n const availableRef = bodyRefsForFootnotes.shift();\n if (availableRef) return availableRef;\n } else {\n // It's body text\n const availableRef = footnoteRefsForBody.shift();\n if (availableRef) return availableRef;\n }\n\n // If no available partner reference exists, generate a new one.\n const newRef = `(${numberToArabic(referenceCounter.count)})`;\n referenceCounter.count++;\n return newRef;\n });\n\n return { ...line, text: updatedText };\n });\n};\n","import { PATTERNS } from './textUtils';\n\n/**\n * Character statistics for analyzing text content and patterns\n */\ntype CharacterStats = {\n /** Number of Arabic script characters in the text */\n arabicCount: number;\n /** Map of character frequencies for repetition analysis */\n charFreq: Map<string, number>;\n /** Number of digit characters (0-9) in the text */\n digitCount: number;\n /** Number of Latin alphabet characters (a-z, A-Z) in the text */\n latinCount: number;\n /** Number of punctuation characters in the text */\n punctuationCount: number;\n /** Number of whitespace characters in the text */\n spaceCount: number;\n /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */\n symbolCount: number;\n};\n\n/**\n * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.\n * This function performs 
comprehensive analysis to identify patterns commonly associated\n * with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.\n *\n * @param text - The input string to analyze for noise patterns\n * @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content\n *\n * @example\n * ```typescript\n * import { isArabicTextNoise } from 'baburchi';\n *\n * console.log(isArabicTextNoise('---')); // true (formatting artifact)\n * console.log(isArabicTextNoise('ุงู„ุณู„ุงู… ุนู„ูŠูƒู…')); // false (valid Arabic)\n * console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)\n * ```\n */\nexport const isArabicTextNoise = (text: string): boolean => {\n // Early return for empty or very short strings\n if (!text || text.trim().length === 0) {\n return true;\n }\n\n const trimmed = text.trim();\n const length = trimmed.length;\n\n // Very short strings are likely noise unless they're meaningful Arabic\n if (length < 2) {\n return true;\n }\n\n // Check for basic noise patterns first\n if (isBasicNoisePattern(trimmed)) {\n return true;\n }\n\n const charStats = analyzeCharacterStats(trimmed);\n\n // Check for excessive repetition\n if (hasExcessiveRepetition(charStats, length)) {\n return true;\n }\n\n // Check if text contains Arabic characters\n const hasArabic = PATTERNS.arabicCharacters.test(trimmed);\n\n // Handle non-Arabic text\n if (!hasArabic && /[a-zA-Z]/.test(trimmed)) {\n return true;\n }\n\n // Arabic-specific validation\n if (hasArabic) {\n return !isValidArabicContent(charStats, length);\n }\n\n // Non-Arabic content validation\n return isNonArabicNoise(charStats, length, trimmed);\n};\n\n/**\n * Analyzes character composition and frequency statistics for the input text.\n * Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)\n * and tracks character frequency for pattern analysis.\n *\n * @param text - The text string to analyze\n * @returns 
CharacterStats object containing detailed character analysis\n *\n * @example\n * ```typescript\n * import { analyzeCharacterStats } from 'baburchi';\n *\n * const stats = analyzeCharacterStats('ู…ุฑุญุจุง 123!');\n * console.log(stats.arabicCount); // 5\n * console.log(stats.digitCount); // 3\n * console.log(stats.symbolCount); // 1\n * ```\n */\nexport function analyzeCharacterStats(text: string): CharacterStats {\n const stats: CharacterStats = {\n arabicCount: 0,\n charFreq: new Map<string, number>(),\n digitCount: 0,\n latinCount: 0,\n punctuationCount: 0,\n spaceCount: 0,\n symbolCount: 0,\n };\n\n const chars = Array.from(text);\n\n for (const char of chars) {\n // Count character frequency for repetition detection\n stats.charFreq.set(char, (stats.charFreq.get(char) || 0) + 1);\n\n if (PATTERNS.arabicCharacters.test(char)) {\n stats.arabicCount++;\n } else if (/\\d/.test(char)) {\n stats.digitCount++;\n } else if (/[a-zA-Z]/.test(char)) {\n stats.latinCount++;\n } else if (/\\s/.test(char)) {\n stats.spaceCount++;\n } else if (/[.,;:()[\\]{}\"\"\"''`]/.test(char)) {\n stats.punctuationCount++;\n } else {\n stats.symbolCount++;\n }\n }\n\n return stats;\n}\n\n/**\n * Detects excessive repetition of specific characters that commonly indicate noise.\n * Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,\n * and underscores that often appear in OCR artifacts or formatting elements.\n *\n * @param charStats - Character statistics from analyzeCharacterStats\n * @param textLength - Total length of the original text\n * @returns true if excessive repetition is detected, false otherwise\n *\n * @example\n * ```typescript\n * import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';\n *\n * const stats = analyzeCharacterStats('!!!!!');\n * console.log(hasExcessiveRepetition(stats, 5)); // true\n *\n * const normalStats = analyzeCharacterStats('hello world');\n * console.log(hasExcessiveRepetition(normalStats, 11)); // 
false\n * ```\n */\nexport function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean {\n let repeatCount = 0;\n const repetitiveChars = ['!', '.', '-', '=', '_'];\n\n for (const [char, count] of charStats.charFreq) {\n if (count >= 5 && repetitiveChars.includes(char)) {\n repeatCount += count;\n }\n }\n\n // High repetition ratio indicates noise\n return repeatCount / textLength > 0.4;\n}\n\n/**\n * Identifies text that matches common noise patterns using regular expressions.\n * Detects patterns like repeated dashes, dot sequences, uppercase-only text,\n * digit-dash combinations, and other formatting artifacts commonly found in OCR output.\n *\n * @param text - The text string to check against noise patterns\n * @returns true if the text matches a basic noise pattern, false otherwise\n *\n * @example\n * ```typescript\n * import { isBasicNoisePattern } from 'baburchi';\n *\n * console.log(isBasicNoisePattern('---')); // true\n * console.log(isBasicNoisePattern('...')); // true\n * console.log(isBasicNoisePattern('ABC')); // true\n * console.log(isBasicNoisePattern('- 77')); // true\n * console.log(isBasicNoisePattern('hello world')); // false\n * ```\n */\nexport function isBasicNoisePattern(text: string): boolean {\n const noisePatterns = [\n /^[-=_โ”โ‰บโ‰ป\\s]*$/, // Only dashes, equals, underscores, special chars, or spaces\n /^[.\\s]*$/, // Only dots and spaces\n /^[!\\s]*$/, // Only exclamation marks and spaces\n /^[A-Z\\s]*$/, // Only uppercase letters and spaces (like \"Ap Ap Ap\")\n /^[-\\d\\s]*$/, // Only dashes, digits and spaces (like \"- 77\", \"- 4\")\n /^\\d+\\s*$/, // Only digits and spaces (like \"1\", \" 1 \")\n /^[A-Z]\\s*$/, // Single uppercase letter with optional spaces\n /^[โ€”\\s]*$/, // Only em-dashes and spaces\n /^[เฅเคฐ\\s-]*$/, // Devanagari characters (likely OCR errors)\n ];\n\n return noisePatterns.some((pattern) => pattern.test(text));\n}\n\n/**\n * Determines if non-Arabic content should be 
classified as noise based on various heuristics.\n * Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition\n * to identify unwanted OCR artifacts or meaningless content.\n *\n * @param charStats - Character statistics from analyzeCharacterStats\n * @param textLength - Total length of the original text\n * @param text - The original text string for additional pattern matching\n * @returns true if the content is likely noise, false if it appears to be valid content\n *\n * @example\n * ```typescript\n * import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';\n *\n * const stats = analyzeCharacterStats('!!!');\n * console.log(isNonArabicNoise(stats, 3, '!!!')); // true\n *\n * const validStats = analyzeCharacterStats('2023');\n * console.log(isNonArabicNoise(validStats, 4, '2023')); // false\n * ```\n */\nexport function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean {\n const contentChars = charStats.arabicCount + charStats.latinCount + charStats.digitCount;\n\n // Text that's mostly symbols or punctuation is likely noise\n if (contentChars === 0) {\n return true;\n }\n\n // Check for specific spacing patterns that indicate noise\n if (isSpacingNoise(charStats, contentChars, textLength)) {\n return true;\n }\n\n // Special handling for Arabic numerals in parentheses (like \"(ูฆู ูกู ).\")\n const hasArabicNumerals = /[ู -ูฉ]/.test(text);\n if (hasArabicNumerals && charStats.digitCount >= 3) {\n return false;\n }\n\n // High symbol-to-content ratio indicates noise, but be more lenient with punctuation\n // Allow more punctuation for valid content like references, citations, etc.\n const adjustedNonContentChars = charStats.symbolCount + Math.max(0, charStats.punctuationCount - 5);\n if (adjustedNonContentChars / Math.max(contentChars, 1) > 2) {\n return true;\n }\n\n // Very short strings with no Arabic are likely noise (except substantial numbers)\n if (textLength <= 5 && 
charStats.arabicCount === 0 && !(/^\\d+$/.test(text) && charStats.digitCount >= 3)) {\n return true;\n }\n\n // Allow pure numbers if they're substantial (like years)\n if (/^\\d{3,4}$/.test(text)) {\n return false;\n }\n\n // Default to not noise for longer content\n return textLength <= 10;\n}\n\n/**\n * Detects problematic spacing patterns that indicate noise or OCR artifacts.\n * Identifies cases where spacing is excessive relative to content, or where\n * single characters are surrounded by spaces in a way that suggests OCR errors.\n *\n * @param charStats - Character statistics from analyzeCharacterStats\n * @param contentChars - Number of meaningful content characters (Arabic + Latin + digits)\n * @param textLength - Total length of the original text\n * @returns true if spacing patterns indicate noise, false otherwise\n *\n * @example\n * ```typescript\n * import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';\n *\n * const stats = analyzeCharacterStats(' a ');\n * const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;\n * console.log(isSpacingNoise(stats, contentChars, 3)); // true\n *\n * const normalStats = analyzeCharacterStats('hello world');\n * const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;\n * console.log(isSpacingNoise(normalStats, normalContent, 11)); // false\n * ```\n */\nexport function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean {\n const { arabicCount, spaceCount } = charStats;\n\n // Too many spaces relative to content\n if (spaceCount > 0 && contentChars === spaceCount + 1 && contentChars <= 5) {\n return true;\n }\n\n // Short text with multiple spaces and no Arabic\n if (textLength <= 10 && spaceCount >= 2 && arabicCount === 0) {\n return true;\n }\n\n // Excessive spacing ratio\n if (spaceCount / textLength > 0.6) {\n return true;\n }\n\n return false;\n}\n\n/**\n * Validates whether Arabic content is 
substantial enough to be considered meaningful.\n * Uses character counts and text length to determine if Arabic text contains\n * sufficient content or if it's likely to be a fragment or OCR artifact.\n *\n * @param charStats - Character statistics from analyzeCharacterStats\n * @param textLength - Total length of the original text\n * @returns true if the Arabic content appears valid, false if it's likely noise\n *\n * @example\n * ```typescript\n * import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';\n *\n * const validStats = analyzeCharacterStats('ุงู„ุณู„ุงู… ุนู„ูŠูƒู…');\n * console.log(isValidArabicContent(validStats, 12)); // true\n *\n * const shortStats = analyzeCharacterStats('ุต');\n * console.log(isValidArabicContent(shortStats, 1)); // false\n *\n * const withDigitsStats = analyzeCharacterStats('ุต 5');\n * console.log(isValidArabicContent(withDigitsStats, 3)); // true\n * ```\n */\nexport function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean {\n // Arabic text with reasonable content length is likely valid\n if (charStats.arabicCount >= 3) {\n return true;\n }\n\n // Short Arabic snippets with numbers might be valid (like dates, references)\n if (charStats.arabicCount >= 1 && charStats.digitCount > 0 && textLength <= 20) {\n return true;\n }\n\n // Allow short Arabic words with punctuation (like \"ู„ู‡.\" - \"for him/it.\")\n if (charStats.arabicCount >= 2 && charStats.punctuationCount <= 2 && textLength <= 10) {\n return true;\n }\n\n // Allow single meaningful Arabic words that are common standalone terms\n // This handles cases like pronouns, prepositions, common short words\n if (charStats.arabicCount >= 1 && textLength <= 5 && charStats.punctuationCount <= 1) {\n return true;\n }\n\n return false;\n}\n","import type { FixTypoOptions } from './types';\n\nimport { alignTokenSequences, areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport {\n handleFootnoteFusion,\n 
handleFootnoteSelection,\n handleStandaloneFootnotes,\n normalizeArabicText,\n tokenizeText,\n} from './textUtils';\n\n/**\n * Selects the best token(s) from an aligned pair during typo correction.\n * Uses various heuristics including normalization, footnote handling, typo symbols,\n * and similarity scores to determine which token(s) to keep.\n *\n * @param originalToken - Token from the original OCR text (may be null)\n * @param altToken - Token from the alternative OCR text (may be null)\n * @param options - Configuration options including typo symbols and similarity threshold\n * @returns Array of selected tokens (usually contains one token, but may contain multiple)\n */\nconst selectBestTokens = (\n originalToken: null | string,\n altToken: null | string,\n { similarityThreshold, typoSymbols }: FixTypoOptions,\n): string[] => {\n // Handle missing tokens\n if (originalToken === null) {\n return [altToken!];\n }\n if (altToken === null) {\n return [originalToken];\n }\n\n // Preserve original if same after normalization (keeps diacritics)\n if (normalizeArabicText(originalToken) === normalizeArabicText(altToken)) {\n return [originalToken];\n }\n\n // Handle embedded footnotes\n const result = handleFootnoteSelection(originalToken, altToken);\n if (result) return result;\n\n // Handle standalone footnotes\n const footnoteResult = handleStandaloneFootnotes(originalToken, altToken);\n if (footnoteResult) return footnoteResult;\n\n // Handle typo symbols - prefer the symbol itself\n if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {\n const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);\n return typoSymbol ? 
[typoSymbol] : [originalToken];\n }\n\n // Choose based on similarity\n const normalizedOriginal = normalizeArabicText(originalToken);\n const normalizedAlt = normalizeArabicText(altToken);\n const similarity = calculateSimilarity(normalizedOriginal, normalizedAlt);\n\n return [similarity > similarityThreshold ? originalToken : altToken];\n};\n\n/**\n * Removes duplicate tokens and handles footnote fusion in post-processing.\n * Identifies and removes tokens that are highly similar while preserving\n * important variations. Also handles special cases like footnote merging.\n *\n * @param tokens - Array of tokens to process\n * @param highSimilarityThreshold - Threshold for detecting duplicates (0.0 to 1.0)\n * @returns Array of tokens with duplicates removed and footnotes fused\n */\nconst removeDuplicateTokens = (tokens: string[], highSimilarityThreshold: number): string[] => {\n if (tokens.length === 0) {\n return tokens;\n }\n\n const result: string[] = [];\n\n for (const currentToken of tokens) {\n if (result.length === 0) {\n result.push(currentToken);\n continue;\n }\n\n const previousToken = result.at(-1)!;\n\n // Handle ordinary echoes (similar tokens)\n if (areSimilarAfterNormalization(previousToken, currentToken, highSimilarityThreshold)) {\n // Keep the shorter version\n if (currentToken.length < previousToken.length) {\n result[result.length - 1] = currentToken;\n }\n continue;\n }\n\n // Handle footnote fusion cases\n if (handleFootnoteFusion(result, previousToken, currentToken)) {\n continue;\n }\n\n result.push(currentToken);\n }\n\n return result;\n};\n\n/**\n * Processes text alignment between original and alternate OCR results to fix typos.\n * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,\n * then selects the best tokens and performs post-processing.\n *\n * @param originalText - Original OCR text that may contain typos\n * @param altText - Reference text from alternate OCR for comparison\n * @param options - 
Configuration options for alignment and selection\n * @returns Corrected text with typos fixed\n */\nexport const processTextAlignment = (originalText: string, altText: string, options: FixTypoOptions): string => {\n const originalTokens = tokenizeText(originalText, options.typoSymbols);\n const altTokens = tokenizeText(altText, options.typoSymbols);\n\n // Align token sequences\n const alignedPairs = alignTokenSequences(\n originalTokens,\n altTokens,\n options.typoSymbols,\n options.similarityThreshold,\n );\n\n // Select best tokens from each aligned pair\n const mergedTokens = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));\n\n // Remove duplicates and handle post-processing\n const finalTokens = removeDuplicateTokens(mergedTokens, options.highSimilarityThreshold);\n\n return finalTokens.join(' ');\n};\n\nexport const fixTypo = (\n original: string,\n correction: string,\n {\n highSimilarityThreshold = 0.8,\n similarityThreshold = 0.6,\n typoSymbols,\n }: Partial<FixTypoOptions> & Pick<FixTypoOptions, 'typoSymbols'>,\n) => {\n return processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });\n};\n\nexport * from './alignment';\nexport * from './balance';\nexport * from './footnotes';\nexport * from './noise';\nexport * from './similarity';\nexport * from 
'./textUtils';\n"],"mappings":"AAGO,IAAMA,EAAW,CAEpB,iBAAkB,sEAGlB,aAAc,sBAGd,6BAA8B,yBAG9B,uBAAwB,iDAGxB,+BAAgC,gCAGhC,qBAAsB,wBAGtB,WAAY,mDAGZ,iBAAkB,0BAGlB,mBAAoB,mCAGpB,sBAAuB,qBAGvB,kCAAmC,iBAGnC,0BAA2B,gBAG3B,QAAS,UAGT,WAAY,KAChB,EAYaC,EAAuBC,GACzBA,EAAK,QAAQF,EAAS,QAAS,EAAE,EAAE,QAAQA,EAAS,WAAY,EAAE,EAAE,KAAK,EAavEG,EAAiBD,GAAyB,CACnD,IAAME,EAAQF,EAAK,MAAMF,EAAS,YAAY,EAC9C,OAAOI,EAAQA,EAAM,CAAC,EAAI,EAC9B,EAaaC,EAAe,CAACH,EAAcI,EAA4B,CAAC,IAAgB,CACpF,IAAIC,EAAgBL,EAGpB,QAAWM,KAAUF,EAAiB,CAClC,IAAMG,EAAc,IAAI,OAAOD,EAAQ,GAAG,EAC1CD,EAAgBA,EAAc,QAAQE,EAAa,IAAID,CAAM,GAAG,CACpE,CAEA,OAAOD,EAAc,KAAK,EAAE,MAAMP,EAAS,UAAU,EAAE,OAAO,OAAO,CACzE,EAeaU,EAAuB,CAACC,EAAkBC,EAAuBC,IAAkC,CAC5G,IAAMC,EAAmBd,EAAS,mBAAmB,KAAKY,CAAa,EACjEG,EAAkBf,EAAS,iBAAiB,KAAKa,CAAY,EAC7DG,EAAmBhB,EAAS,mBAAmB,KAAKa,CAAY,EAChEI,EAAkBjB,EAAS,iBAAiB,KAAKY,CAAa,EAE9DM,EAAaf,EAAcS,CAAa,EACxCO,EAAahB,EAAcU,CAAY,EAG7C,OAAIC,GAAoBC,GAAmBG,IAAeC,GACtDR,EAAOA,EAAO,OAAS,CAAC,EAAIE,EACrB,IAIP,GAAAI,GAAmBD,GAAoBE,IAAeC,EAK9D,EAcaC,EAA0B,CAACC,EAAgBC,IAAoC,CACxF,IAAMC,EAAevB,EAAS,iBAAiB,KAAKqB,CAAM,EACpDG,EAAexB,EAAS,iBAAiB,KAAKsB,CAAM,EAE1D,OAAIC,GAAgB,CAACC,EAAqB,CAACH,CAAM,EAC7CG,GAAgB,CAACD,EAAqB,CAACD,CAAM,EAC7CC,GAAgBC,EACT,CAACH,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,EAcaG,EAA4B,CAACJ,EAAgBC,IAAoC,CAC1F,IAAMI,EAAc1B,EAAS,mBAAmB,KAAKqB,CAAM,EACrDM,EAAc3B,EAAS,mBAAmB,KAAKsB,CAAM,EAE3D,OAAII,GAAe,CAACC,EAAoB,CAACN,EAAQC,CAAM,EACnDK,GAAe,CAACD,EAAoB,CAACJ,EAAQD,CAAM,EACnDK,GAAeC,EACR,CAACN,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,ECpLA,IAAMM,EAAmB,CACrB,YAAa,GACb,iBAAkB,GAClB,cAAe,EACf,WAAY,CAChB,EAeaC,EAA+B,CAACC,EAAeC,IAA0B,CAClF,IAAMC,EAAUF,EAAM,OAChBG,EAAUF,EAAM,OAEtB,GAAIC,IAAY,EACZ,OAAOC,EAGX,GAAIA,IAAY,EACZ,OAAOD,EAIX,GAAM,CAACE,EAASC,CAAM,EAAIH,GAAWC,EAAU,CAACH,EAAOC,CAAK,EAAI,CAACA,EAAOD,CAAK,EACvEM,EAAWF,EAAQ,OACnBG,EAAUF,EAAO,OAEnBG,EAAc,MAAM,KAAK,CAAE,OAAQF,EAAW,CAAE,EAAG,CAACG,EAAGC,IAAUA,CAAK,EAE1E,QAASC,EAAI,EAAGA,GAAKJ,EAASI,IAAK,CAC/B,IAAMC,EAAa,CAACD,CAAC,EAErB,QAASE,EAAI,EAA
GA,GAAKP,EAAUO,IAAK,CAChC,IAAMC,EAAmBT,EAAOM,EAAI,CAAC,IAAMP,EAAQS,EAAI,CAAC,EAAI,EAAI,EAC1DE,EAAU,KAAK,IACjBP,EAAYK,CAAC,EAAI,EACjBD,EAAWC,EAAI,CAAC,EAAI,EACpBL,EAAYK,EAAI,CAAC,EAAIC,CACzB,EACAF,EAAW,KAAKG,CAAO,CAC3B,CAEAP,EAAcI,CAClB,CAEA,OAAOJ,EAAYF,CAAQ,CAC/B,EAcaU,EAAsB,CAAChB,EAAeC,IAA0B,CACzE,IAAMgB,EAAY,KAAK,IAAIjB,EAAM,OAAQC,EAAM,MAAM,GAAK,EACpDiB,EAAWnB,EAA6BC,EAAOC,CAAK,EAC1D,OAAQgB,EAAYC,GAAYD,CACpC,EAcaE,EAA+B,CAACnB,EAAeC,EAAemB,EAAoB,KAAiB,CAC5G,IAAMC,EAAcC,EAAoBtB,CAAK,EACvCuB,EAAcD,EAAoBrB,CAAK,EAC7C,OAAOe,EAAoBK,EAAaE,CAAW,GAAKH,CAC5D,EAgBaI,EAA0B,CACnCC,EACAC,EACAC,EACAC,IACS,CACT,IAAMP,EAAcC,EAAoBG,CAAM,EACxCF,EAAcD,EAAoBI,CAAM,EAG9C,GAAIL,IAAgBE,EAChB,OAAOzB,EAAiB,cAI5B,IAAM+B,EAAeF,EAAY,SAASF,CAAM,GAAKE,EAAY,SAASD,CAAM,EAC1EI,EAAkBd,EAAoBK,EAAaE,CAAW,GAAKK,EAEzE,OAAIC,GAAgBC,EACThC,EAAiB,WAGrBA,EAAiB,gBAC5B,EAoBaiC,EAAqB,CAC9BC,EACAC,EACAC,IACqB,CACrB,IAAMC,EAAgC,CAAC,EACnCxB,EAAIsB,EAAQ,OACZpB,EAAIqB,EAAQ,OAEhB,KAAOvB,EAAI,GAAKE,EAAI,GAGhB,OAFoBmB,EAAOrB,CAAC,EAAEE,CAAC,EAEX,UAAW,CAC3B,IAAK,WACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAGuB,EAAQ,EAAErB,CAAC,CAAC,CAAC,EAC3C,MACJ,IAAK,OACDsB,EAAU,KAAK,CAAC,KAAMD,EAAQ,EAAErB,CAAC,CAAC,CAAC,EACnC,MACJ,IAAK,KACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAG,IAAI,CAAC,EACnC,MACJ,QACI,MAAM,IAAI,MAAM,6BAA6B,CACrD,CAGJ,OAAOwB,EAAU,QAAQ,CAC7B,EAgBaC,EAAsB,CAC/BH,EACAC,EACAP,EACAC,IACqB,CACrB,IAAM1B,EAAU+B,EAAQ,OAClB9B,EAAU+B,EAAQ,OAGlBG,EAAmC,MAAM,KAAK,CAAE,OAAQnC,EAAU,CAAE,EAAG,IACzE,MAAM,KAAK,CAAE,OAAQC,EAAU,CAAE,EAAG,KAAO,CAAE,UAAW,KAAM,MAAO,CAAE,EAAE,CAC7E,EAGA,QAAS,EAAI,EAAG,GAAKD,EAAS,IAC1BmC,EAAc,CAAC,EAAE,CAAC,EAAI,CAAE,UAAW,KAAM,MAAO,EAAIvC,EAAiB,WAAY,EAErF,QAASe,EAAI,EAAGA,GAAKV,EAASU,IAC1BwB,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW,OAAQ,MAAOA,EAAIf,EAAiB,WAAY,EAIvF,QAAS,EAAI,EAAG,GAAKI,EAAS,IAC1B,QAASW,EAAI,EAAGA,GAAKV,EAASU,IAAK,CAC/B,IAAMyB,EAAiBd,EACnBS,EAAQ,EAAI,CAAC,EACbC,EAAQrB,EAAI,CAAC,EACbc,EACAC,CACJ,EAEMW,EAAgBF,EAAc,EAAI,CAAC,EAAExB,EAAI,CAAC,EAAE,MAAQyB,EACpDE,EAAUH,EAAc,EAAI,CAAC,EAAExB,CAAC,
EAAE,MAAQf,EAAiB,YAC3D2C,EAAYJ,EAAc,CAAC,EAAExB,EAAI,CAAC,EAAE,MAAQf,EAAiB,YAE7D4C,EAAY,KAAK,IAAIH,EAAeC,EAASC,CAAS,EACxDE,EAA4C,OAE5CD,IAAcH,EACdI,EAAgB,WACTD,IAAcF,IACrBG,EAAgB,MAGpBN,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW8B,EAAe,MAAOD,CAAU,CACvE,CAIJ,OAAOX,EAAmBM,EAAeJ,EAASC,CAAO,CAC7D,EC1OO,IAAMU,GAAoB,CAACC,EAAuBC,IAA2B,CAChF,IAAMC,EAAyB,CAAC,EAC5BC,EAAe,EAEnB,QAAWC,KAAcJ,EAAa,CAClC,GAAIG,GAAgBF,EAAa,OAC7B,MAGJ,GAAIG,EAAY,CAEZ,GAAM,CAAE,OAAAC,EAAQ,iBAAAC,CAAiB,EAAIC,EAAuBH,EAAYH,EAAcE,CAAY,EAE9FE,GACAH,EAAa,KAAKG,CAAM,EAE5BF,GAAgBG,CACpB,MAEIJ,EAAa,KAAKD,EAAaE,CAAY,CAAC,EAC5CA,GAER,CAGA,OAAIA,EAAeF,EAAa,QAC5BC,EAAa,KAAK,GAAGD,EAAa,MAAME,CAAY,CAAC,EAGlDD,CACX,EAKMM,EAAuB,CAACJ,EAAoBK,EAAeC,IAAkB,CAC/E,IAAMC,EAAgB,GAAGF,CAAK,IAAIC,CAAK,GACjCE,EAAiB,GAAGF,CAAK,IAAID,CAAK,GAElCI,EAAmBC,EAAoBV,CAAU,EACjDW,EAAeC,EAAoBH,EAAkBC,EAAoBH,CAAa,CAAC,EACvFM,EAAgBD,EAAoBH,EAAkBC,EAAoBF,CAAc,CAAC,EAE/F,OAAOG,GAAgBE,EAAgBN,EAAgBC,CAC3D,EAKML,EAAyB,CAACH,EAAoBH,EAAwBE,IAAyB,CACjG,IAAMe,EAAiBjB,EAAaE,CAAY,EAGhD,GAAIgB,EAA6Bf,EAAYc,CAAc,EACvD,MAAO,CAAE,OAAQA,EAAgB,iBAAkB,CAAE,EAIzD,IAAMT,EAAQR,EAAaE,CAAY,EACjCO,EAAQT,EAAaE,EAAe,CAAC,EAG3C,MAAI,CAACM,GAAS,CAACC,EACJD,EAAQ,CAAE,OAAQA,EAAO,iBAAkB,CAAE,EAAI,CAAE,OAAQ,GAAI,iBAAkB,CAAE,EAIvF,CAAE,OADSD,EAAqBJ,EAAYK,EAAOC,CAAK,EACnC,iBAAkB,CAAE,CACpD,EC3CA,IAAMU,EAAqBC,GAA+B,CACtD,IAAMC,EAAyB,CAAC,EAC5BC,EAAa,EACbC,EAAiB,GAErB,QAASC,EAAI,EAAGA,EAAIJ,EAAI,OAAQI,IACxBJ,EAAII,CAAC,IAAM,MACXF,IACAC,EAAiBC,GAIzB,IAAMC,EAAaH,EAAa,IAAM,EAEtC,MAAI,CAACG,GAAcF,IAAmB,IAClCF,EAAO,KAAK,CACR,KAAM,IACN,MAAOE,EACP,OAAQ,YACR,KAAM,OACV,CAAC,EAGE,CAAE,OAAAF,EAAQ,WAAAI,CAAW,CAChC,EAGaC,EAAW,CAAE,OAAK,OAAK,IAAK,IAAK,IAAK,IAAK,IAAK,GAAI,EAGpDC,EAAgB,IAAI,IAAI,CAAC,OAAK,IAAK,IAAK,GAAG,CAAC,EAG5CC,EAAiB,IAAI,IAAI,CAAC,OAAK,IAAK,IAAK,GAAG,CAAC,EAsBpDC,EAAuBT,GAA+B,CACxD,IAAMC,EAAyB,CAAC,EAC1BS,EAAgD,CAAC,EAEvD,QAASN,EAAI,EAAGA,EAAIJ,EAAI,OAAQI,IAAK,CACjC,IAAMO,EAAOX,EAAII,CAAC,EAElB,GAAIG,EAAc,IAAII,CAAI,EACtBD,EAAM,KAAK,CAAE,KAAAC,EAAM,MAAOP,CAAE,CAAC,UACtBI,EAA
e,IAAIG,CAAI,EAAG,CACjC,IAAMC,EAAWF,EAAM,IAAI,EAEtBE,EAOMN,EAASM,EAAS,IAA6B,IAAMD,IAC5DV,EAAO,KAAK,CACR,KAAMW,EAAS,KACf,MAAOA,EAAS,MAChB,OAAQ,aACR,KAAM,SACV,CAAC,EACDX,EAAO,KAAK,CACR,KAAAU,EACA,MAAOP,EACP,OAAQ,aACR,KAAM,SACV,CAAC,GAlBDH,EAAO,KAAK,CACR,KAAAU,EACA,MAAOP,EACP,OAAQ,YACR,KAAM,SACV,CAAC,CAeT,CACJ,CAEA,OAAAM,EAAM,QAAQ,CAAC,CAAE,KAAAC,EAAM,MAAAE,CAAM,IAAM,CAC/BZ,EAAO,KAAK,CACR,KAAAU,EACA,MAAAE,EACA,OAAQ,WACR,KAAM,SACV,CAAC,CACL,CAAC,EAEM,CAAE,OAAAZ,EAAQ,WAAYA,EAAO,SAAW,CAAE,CACrD,EAkBaa,EAAgBd,GAA+B,CACxD,IAAMe,EAAchB,EAAkBC,CAAG,EACnCgB,EAAgBP,EAAoBT,CAAG,EAE7C,MAAO,CACH,OAAQ,CAAC,GAAGe,EAAY,OAAQ,GAAGC,EAAc,MAAM,EAAE,KAAK,CAACC,EAAGC,IAAMD,EAAE,MAAQC,EAAE,KAAK,EACzF,WAAYH,EAAY,YAAcC,EAAc,UACxD,CACJ,EAwCaG,GAAuBC,GAAmC,CACnE,IAAMC,EAAoC,CAAC,EACrCC,EAAQF,EAAK,MAAM;AAAA,CAAI,EACzBG,EAAgB,EAEpB,OAAAD,EAAM,QAAQ,CAACE,EAAMC,IAAc,CAC/B,GAAID,EAAK,OAAS,GAAI,CAClB,IAAME,EAAgBZ,EAAaU,CAAI,EAClCE,EAAc,YACfA,EAAc,OAAO,QAASC,GAAU,CACpCN,EAAgB,KAAK,CACjB,cAAeE,EAAgBI,EAAM,MACrC,KAAMA,EAAM,KACZ,OAAQA,EAAM,OACd,KAAMA,EAAM,IAChB,CAAC,CACL,CAAC,CAET,CAEAJ,GAAiBC,EAAK,QAAUC,EAAYH,EAAM,OAAS,EAAI,EAAI,EACvE,CAAC,EAEMD,CACX,EAiBaO,GAAqB5B,GACvBD,EAAkBC,CAAG,EAAE,WAkBrB6B,GAAuB7B,GACzBS,EAAoBT,CAAG,EAAE,WAkBvBK,GAAcL,GAChBc,EAAad,CAAG,EAAE,WC7R7B,IAAM8B,EAAmB,KAcZC,EAAuBC,GACzBC,EAAS,sBAAsB,KAAKD,CAAI,EAI7CE,EAAkB,IAAI,KAAK,aAAa,OAAO,EAY/CC,EAAkBC,GACbF,EAAgB,OAAOE,CAAG,EAc/BC,EAAeC,IACiC,CAC9C,EAAK,SACL,EAAK,SACL,IAAK,SACL,EAAG,SACH,EAAG,SACH,EAAG,SACH,EAAG,QACP,GACsBA,CAAI,GAAKA,EAc7BC,EAAkBC,GAA8B,CAClD,IAAMC,EAAoC,CACtC,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,GACT,EACMC,EAASF,EAAU,QAAQ,QAAS,EAAE,EACxCG,EAAS,GACb,QAAWL,KAAQI,EACfC,GAAUF,EAAOH,CAAI,EAEzB,IAAMM,EAAS,SAASD,EAAQ,EAAE,EAClC,OAAO,MAAMC,CAAM,EAAI,EAAIA,CAC/B,EAyBMC,EAAqBC,GAAsB,CAC7C,IAAMC,EAAyBD,EAC1B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,oBAAoB,GAAK,CAAC,CAAC,EAE/DgB,EAA8BH,EAC/B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KA
AK,MAAMf,EAAS,yBAAyB,GAAK,CAAC,CAAC,EAEpEiB,EAA8BJ,EAC/B,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,4BAA4B,GAAK,CAAC,CAAC,EAEvEkB,EAAmCL,EACpC,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,iCAAiC,GAAK,CAAC,CAAC,EAE5EmB,EAAuBH,EAA4B,IAAKI,GAC1DA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEMgB,EAA2BH,EAAiC,IAAKE,GACnEA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEA,MAAO,CACH,eAAgB,CAAC,GAAGS,EAAwB,GAAGK,CAAoB,EACnE,mBAAoB,CAAC,GAAGF,EAA6B,GAAGI,CAAwB,EAChF,kBAAmBL,EACnB,uBAAwBE,CAC5B,CACJ,EAgBMI,EAAkB,CAACT,EAAmBU,IAAqD,CAE7F,GAD2BV,EAAM,KAAMW,GAAS1B,EAAoB0B,EAAK,IAAI,CAAC,EACtD,MAAO,GAE/B,IAAMC,EAAU,IAAI,IAAIF,EAAW,cAAc,EAC3CG,EAAc,IAAI,IAAIH,EAAW,kBAAkB,EACzD,GAAIE,EAAQ,OAASC,EAAY,KAAM,MAAO,GAG9C,QAAWN,KAAOK,EACd,GAAI,CAACC,EAAY,IAAIN,CAAG,EACpB,MAAO,GAIf,MAAO,EACX,EAmBaO,GAAyCd,GAAoB,CACtE,IAAMe,EAAoBhB,EAAkBC,CAAK,EAEjD,GAAI,CAACS,EAAgBT,EAAOe,CAAiB,EACzC,OAAOf,EAIX,IAAMgB,EAAiBhB,EAAM,IAAKW,GAAS,CACvC,IAAIM,EAAcN,EAAK,KAEjBO,EAAW,gBACjB,OAAAD,EAAcA,EAAY,QAAQC,EAAWC,GAElCA,EAAM,QAAQ,WAAa3B,GAASD,EAAYC,CAAI,CAAC,CAC/D,EACM,CAAE,GAAGmB,EAAM,KAAMM,CAAY,CACxC,CAAC,EAGKG,EAAkBrB,EAAkBiB,CAAc,EAGlDK,EAAa,IAAI,IAAID,EAAgB,cAAc,EACnDE,EAAiB,IAAI,IAAIF,EAAgB,kBAAkB,EAE3DG,EAAiB,CAAC,GAAG,IAAI,IAAIH,EAAgB,cAAc,CAAC,EAC5DI,EAAqB,CAAC,GAAG,IAAI,IAAIJ,EAAgB,kBAAkB,CAAC,EAGpEK,EAAuBF,EAAe,OAAQhB,GAAQ,CAACe,EAAe,IAAIf,CAAG,CAAC,EAE9EmB,EAAsBF,EAAmB,OAAQjB,GAAQ,CAACc,EAAW,IAAId,CAAG,CAAC,EAG7EoB,EAAU,CAAC,GAAGN,EAAY,GAAGC,CAAc,EAE3CM,EAAmB,CAAE,OADTD,EAAQ,OAAS,EAAI,KAAK,IAAI,EAAG,GAAGA,EAAQ,IAAKpB,GAAQd,EAAec,CAAG,CAAC,CAAC,EAAI,GACrD,CAAE,EAGhD,OAAOS,EAAe,IAAKL,GAAS,CAChC,GAAI,CAACA,EAAK,KAAK,SAAS3B,CAAgB,EACpC,OAAO2B,EAEX,IAAIM,EAAcN,EAAK,KAEvB,OAAAM,EAAcA,EAAY,QAAQ,QAAS,IAAM,CAC7C,GAAIN,EAAK,WAAY,CACjB,IAAMkB,EAAeJ,EAAqB,MAAM,EAChD,GAAII,EAAc,OAAOA,CAC7B,KAAO,CAEH,IAAMA,EAAeH,EAAoB,MAAM,EAC/C,GAAIG,EAAc,OAAOA,CAC7B,CAGA,IAAMC,EAAS,IAAIzC,EAAeuC,EAAiB,KAAK,CAAC,IACzD,OAAAA,EAAiB,QACVE,CACX,CAAC,EAEM,CAAE,GAAGnB,EAAM,KAAMM,CAAY,CACxC,CAAC,CA
CL,EChOO,IAAMc,GAAqBC,GAA0B,CAExD,GAAI,CAACA,GAAQA,EAAK,KAAK,EAAE,SAAW,EAChC,MAAO,GAGX,IAAMC,EAAUD,EAAK,KAAK,EACpBE,EAASD,EAAQ,OAQvB,GALIC,EAAS,GAKTC,EAAoBF,CAAO,EAC3B,MAAO,GAGX,IAAMG,EAAYC,EAAsBJ,CAAO,EAG/C,GAAIK,EAAuBF,EAAWF,CAAM,EACxC,MAAO,GAIX,IAAMK,EAAYC,EAAS,iBAAiB,KAAKP,CAAO,EAGxD,MAAI,CAACM,GAAa,WAAW,KAAKN,CAAO,EAC9B,GAIPM,EACO,CAACE,EAAqBL,EAAWF,CAAM,EAI3CQ,EAAiBN,EAAWF,EAAQD,CAAO,CACtD,EAoBO,SAASI,EAAsBL,EAA8B,CAChE,IAAMW,EAAwB,CAC1B,YAAa,EACb,SAAU,IAAI,IACd,WAAY,EACZ,WAAY,EACZ,iBAAkB,EAClB,WAAY,EACZ,YAAa,CACjB,EAEMC,EAAQ,MAAM,KAAKZ,CAAI,EAE7B,QAAWa,KAAQD,EAEfD,EAAM,SAAS,IAAIE,GAAOF,EAAM,SAAS,IAAIE,CAAI,GAAK,GAAK,CAAC,EAExDL,EAAS,iBAAiB,KAAKK,CAAI,EACnCF,EAAM,cACC,KAAK,KAAKE,CAAI,EACrBF,EAAM,aACC,WAAW,KAAKE,CAAI,EAC3BF,EAAM,aACC,KAAK,KAAKE,CAAI,EACrBF,EAAM,aACC,sBAAsB,KAAKE,CAAI,EACtCF,EAAM,mBAENA,EAAM,cAId,OAAOA,CACX,CAsBO,SAASL,EAAuBF,EAA2BU,EAA6B,CAC3F,IAAIC,EAAc,EACZC,EAAkB,CAAC,IAAK,IAAK,IAAK,IAAK,GAAG,EAEhD,OAAW,CAACH,EAAMI,CAAK,IAAKb,EAAU,SAC9Ba,GAAS,GAAKD,EAAgB,SAASH,CAAI,IAC3CE,GAAeE,GAKvB,OAAOF,EAAcD,EAAa,EACtC,CAqBO,SAASX,EAAoBH,EAAuB,CAavD,MAZsB,CAClB,gBACA,WACA,WACA,aACA,aACA,WACA,aACA,WACA,YACJ,EAEqB,KAAMkB,GAAYA,EAAQ,KAAKlB,CAAI,CAAC,CAC7D,CAuBO,SAASU,EAAiBN,EAA2BU,EAAoBd,EAAuB,CACnG,IAAMmB,EAAef,EAAU,YAAcA,EAAU,WAAaA,EAAU,WAQ9E,OALIe,IAAiB,GAKjBC,EAAehB,EAAWe,EAAcL,CAAU,EAC3C,GAIe,QAAQ,KAAKd,CAAI,GAClBI,EAAU,YAAc,EACtC,IAKqBA,EAAU,YAAc,KAAK,IAAI,EAAGA,EAAU,iBAAmB,CAAC,GACpE,KAAK,IAAIe,EAAc,CAAC,EAAI,GAKtDL,GAAc,GAAKV,EAAU,cAAgB,GAAK,EAAE,QAAQ,KAAKJ,CAAI,GAAKI,EAAU,YAAc,GAC3F,GAIP,YAAY,KAAKJ,CAAI,EACd,GAIJc,GAAc,EACzB,CAyBO,SAASM,EAAehB,EAA2Be,EAAsBL,EAA6B,CACzG,GAAM,CAAE,YAAAO,EAAa,WAAAC,CAAW,EAAIlB,EAapC,OAVIkB,EAAa,GAAKH,IAAiBG,EAAa,GAAKH,GAAgB,GAKrEL,GAAc,IAAMQ,GAAc,GAAKD,IAAgB,GAKvDC,EAAaR,EAAa,EAKlC,CAyBO,SAASL,EAAqBL,EAA2BU,EAA6B,CAkBzF,OAhBIV,EAAU,aAAe,GAKzBA,EAAU,aAAe,GAAKA,EAAU,WAAa,GAAKU,GAAc,IAKxEV,EAAU,aAAe,GAAKA,EAAU,kBAAoB,GAAKU,GAAc,IAM/EV,EAAU,aAAe,GAAKU,GAAc,GAAKV,EAAU,kBAAoB,CAKvF,CC9UA,IAAMmB,GAAmB,CACrBC,EACAC,EACA,CAAE,
oBAAAC,EAAqB,YAAAC,CAAY,IACxB,CAEX,GAAIH,IAAkB,KAClB,MAAO,CAACC,CAAS,EAErB,GAAIA,IAAa,KACb,MAAO,CAACD,CAAa,EAIzB,GAAII,EAAoBJ,CAAa,IAAMI,EAAoBH,CAAQ,EACnE,MAAO,CAACD,CAAa,EAIzB,IAAMK,EAASC,EAAwBN,EAAeC,CAAQ,EAC9D,GAAII,EAAQ,OAAOA,EAGnB,IAAME,EAAiBC,EAA0BR,EAAeC,CAAQ,EACxE,GAAIM,EAAgB,OAAOA,EAG3B,GAAIJ,EAAY,SAASH,CAAa,GAAKG,EAAY,SAASF,CAAQ,EAAG,CACvE,IAAMQ,EAAaN,EAAY,KAAMO,GAAWA,IAAWV,GAAiBU,IAAWT,CAAQ,EAC/F,OAAOQ,EAAa,CAACA,CAAU,EAAI,CAACT,CAAa,CACrD,CAGA,IAAMW,EAAqBP,EAAoBJ,CAAa,EACtDY,EAAgBR,EAAoBH,CAAQ,EAGlD,MAAO,CAFYY,EAAoBF,EAAoBC,CAAa,EAEnDV,EAAsBF,EAAgBC,CAAQ,CACvE,EAWMa,GAAwB,CAACC,EAAkBC,IAA8C,CAC3F,GAAID,EAAO,SAAW,EAClB,OAAOA,EAGX,IAAMV,EAAmB,CAAC,EAE1B,QAAWY,KAAgBF,EAAQ,CAC/B,GAAIV,EAAO,SAAW,EAAG,CACrBA,EAAO,KAAKY,CAAY,EACxB,QACJ,CAEA,IAAMC,EAAgBb,EAAO,GAAG,EAAE,EAGlC,GAAIc,EAA6BD,EAAeD,EAAcD,CAAuB,EAAG,CAEhFC,EAAa,OAASC,EAAc,SACpCb,EAAOA,EAAO,OAAS,CAAC,EAAIY,GAEhC,QACJ,CAGIG,EAAqBf,EAAQa,EAAeD,CAAY,GAI5DZ,EAAO,KAAKY,CAAY,CAC5B,CAEA,OAAOZ,CACX,EAYagB,GAAuB,CAACC,EAAsBC,EAAiBC,IAAoC,CAC5G,IAAMC,EAAiBC,EAAaJ,EAAcE,EAAQ,WAAW,EAC/DG,EAAYD,EAAaH,EAASC,EAAQ,WAAW,EAWrDI,EAReC,EACjBJ,EACAE,EACAH,EAAQ,YACRA,EAAQ,mBACZ,EAGkC,QAAQ,CAAC,CAACM,EAAUC,CAAG,IAAMhC,GAAiB+B,EAAUC,EAAKP,CAAO,CAAC,EAKvG,OAFoBV,GAAsBc,EAAcJ,EAAQ,uBAAuB,EAEpE,KAAK,GAAG,CAC/B,EAEaQ,GAAU,CACnBF,EACAG,EACA,CACI,wBAAAjB,EAA0B,GAC1B,oBAAAd,EAAsB,GACtB,YAAAC,CACJ,IAEOkB,GAAqBS,EAAUG,EAAY,CAAE,wBAAAjB,EAAyB,oBAAAd,EAAqB,YAAAC,CAAY,CAAC","names":["PATTERNS","normalizeArabicText","text","extractDigits","match","tokenizeText","preserveSymbols","processedText","symbol","symbolRegex","handleFootnoteFusion","result","previousToken","currentToken","prevIsStandalone","currHasEmbedded","currIsStandalone","prevHasEmbedded","prevDigits","currDigits","handleFootnoteSelection","tokenA","tokenB","aHasEmbedded","bHasEmbedded","handleStandaloneFootnotes","aIsFootnote","bIsFootnote","ALIGNMENT_SCORES","calculateLevenshteinDistance","textA","textB","lengthA","lengthB","shorter","longer","shortLen","longLen","previousR
ow","_","index","i","currentRow","j","substitutionCost","minCost","calculateSimilarity","maxLength","distance","areSimilarAfterNormalization","threshold","normalizedA","normalizeArabicText","normalizedB","calculateAlignmentScore","tokenA","tokenB","typoSymbols","similarityThreshold","isTypoSymbol","isHighlySimilar","backtrackAlignment","matrix","tokensA","tokensB","alignment","alignTokenSequences","scoringMatrix","alignmentScore","diagonalScore","upScore","leftScore","bestScore","bestDirection","alignTextSegments","targetLines","segmentLines","alignedLines","segmentIndex","targetLine","result","segmentsConsumed","processAlignmentTarget","findBestSegmentMerge","partA","partB","mergedForward","mergedReversed","normalizedTarget","normalizeArabicText","scoreForward","calculateSimilarity","scoreReversed","currentSegment","areSimilarAfterNormalization","checkQuoteBalance","str","errors","quoteCount","lastQuoteIndex","i","isBalanced","BRACKETS","OPEN_BRACKETS","CLOSE_BRACKETS","checkBracketBalance","stack","char","lastOpen","index","checkBalance","quoteResult","bracketResult","a","b","getUnbalancedErrors","text","characterErrors","lines","absoluteIndex","line","lineIndex","balanceResult","error","areQuotesBalanced","areBracketsBalanced","INVALID_FOOTNOTE","hasInvalidFootnotes","text","PATTERNS","arabicFormatter","numberToArabic","num","ocrToArabic","char","arabicToNumber","arabicStr","lookup","digits","numStr","parsed","extractReferences","lines","arabicReferencesInBody","b","ocrConfusedReferencesInBody","arabicReferencesInFootnotes","ocrConfusedReferencesInFootnotes","convertedOcrBodyRefs","ref","convertedOcrFootnoteRefs","needsCorrection","references","line","bodySet","footnoteSet","correctReferences","initialReferences","sanitizedLines","updatedText","ocrRegex","match","cleanReferences","bodyRefSet","footnoteRefSet","uniqueBodyRefs","uniqueFootnoteRefs","bodyRefsForFootnotes","footnoteRefsForBody","allRefs","referenceCounter","availableRef","newRef","isArabicTextNoise",
"text","trimmed","length","isBasicNoisePattern","charStats","analyzeCharacterStats","hasExcessiveRepetition","hasArabic","PATTERNS","isValidArabicContent","isNonArabicNoise","stats","chars","char","textLength","repeatCount","repetitiveChars","count","pattern","contentChars","isSpacingNoise","arabicCount","spaceCount","selectBestTokens","originalToken","altToken","similarityThreshold","typoSymbols","normalizeArabicText","result","handleFootnoteSelection","footnoteResult","handleStandaloneFootnotes","typoSymbol","symbol","normalizedOriginal","normalizedAlt","calculateSimilarity","removeDuplicateTokens","tokens","highSimilarityThreshold","currentToken","previousToken","areSimilarAfterNormalization","handleFootnoteFusion","processTextAlignment","originalText","altText","options","originalTokens","tokenizeText","altTokens","mergedTokens","alignTokenSequences","original","alt","fixTypo","correction"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "baburchi",
3
- "version": "1.2.0",
3
+ "version": "1.4.0",
4
4
  "author": "Ragaeeb Haq",
5
5
  "repository": {
6
6
  "type": "git",
@@ -9,24 +9,24 @@
9
9
  "main": "dist/index.js",
10
10
  "module": "dist/index.ts",
11
11
  "devDependencies": {
12
- "@eslint/js": "^9.30.1",
13
- "@types/bun": "^1.2.17",
14
- "eslint": "^9.30.1",
15
- "eslint-config-prettier": "^10.1.5",
12
+ "@eslint/js": "^9.32.0",
13
+ "@types/bun": "^1.2.19",
14
+ "eslint": "^9.32.0",
15
+ "eslint-config-prettier": "^10.1.8",
16
16
  "eslint-plugin-perfectionist": "^4.15.0",
17
- "eslint-plugin-prettier": "^5.5.1",
17
+ "eslint-plugin-prettier": "^5.5.3",
18
18
  "globals": "^16.3.0",
19
19
  "prettier": "^3.6.2",
20
- "semantic-release": "^24.2.6",
20
+ "semantic-release": "^24.2.7",
21
21
  "tsup": "^8.5.0",
22
- "typescript-eslint": "^8.35.1"
22
+ "typescript-eslint": "^8.39.0"
23
23
  },
24
24
  "bugs": {
25
25
  "url": "https://github.com/ragaeeb/baburchi/issues"
26
26
  },
27
27
  "description": "A lightweight TypeScript library designed to fix typos in OCR post-processing.",
28
28
  "engines": {
29
- "bun": ">=1.2.17",
29
+ "bun": ">=1.2.19",
30
30
  "node": ">=22.0.0"
31
31
  },
32
32
  "exports": {