baburchi 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -61,7 +61,7 @@ type TextLine = {
|
|
|
61
61
|
* const corrected = correctReferences(lines);
|
|
62
62
|
* // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
|
|
63
63
|
*/
|
|
64
|
-
declare const correctReferences: (lines:
|
|
64
|
+
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
|
|
65
65
|
|
|
66
66
|
/**
|
|
67
67
|
* Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
|
package/dist/index.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
var u={arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,diacritics:/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,tatweel:/\u0640/g,whitespace:/\s+/},p=e=>e.replace(u.tatweel,"").replace(u.diacritics,"").trim(),F=e=>{let t=e.match(u.arabicDigits);return t?t[0]:""},
|
|
1
|
+
/**
 * Regex patterns shared by the text, similarity and footnote helpers.
 * The flags are part of the public surface and are kept exactly as published;
 * note that several patterns are global (`g`) and therefore carry `lastIndex` state.
 */
export const PATTERNS = {
  /** One or more Western (0-9) or Arabic-Indic (٠-٩) digits. */
  arabicDigits: /[0-9\u0660-\u0669]+/,
  /** Parenthesised Arabic-Indic reference at the very start of the text. */
  arabicFootnoteReferenceRegex: /^\([\u0660-\u0669]+\)/g,
  /** Runs of Arabic letters and Western/Arabic-Indic digits. */
  arabicLettersAndDigits: /[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,
  /** Runs of whitespace and Arabic punctuation. */
  arabicPunctuationAndWhitespace: /[\s\u060C\u061B\u061F\u06D4]+/,
  /** Parenthesised Arabic-Indic reference anywhere in the text. */
  arabicReferenceRegex: /\([\u0660-\u0669]+\)/g,
  /** Arabic diacritical marks. */
  diacritics: /[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,
  /** Parenthesised digits embedded anywhere in a token. */
  footnoteEmbedded: /\([0-9\u0660-\u0669]+\)/,
  /** Token that is nothing but a footnote marker, optionally followed by "،" or ".". */
  footnoteStandalone: /^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,
  /** Empty "()" or a reference made only of OCR-confusable characters. */
  invalidReferenceRegex: /\(\)|\([.1OV9]+\)/g,
  /** OCR-confused reference at the very start of the text. */
  ocrConfusedFootnoteReferenceRegex: /^\([.1OV9]+\)/g,
  /** OCR-confused reference anywhere in the text. */
  ocrConfusedReferenceRegex: /\([.1OV9]+\)/g,
  /** Tatweel (kashida) character. */
  tatweel: /\u0640/g,
  /** One or more whitespace characters. */
  whitespace: /\s+/,
};

/**
 * Strips tatweel and diacritics from the text and trims surrounding whitespace.
 *
 * @param text - Arabic text to normalize
 * @returns The text without tatweel/diacritics, trimmed
 */
export const normalizeArabicText = (text) =>
  text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();

/**
 * Returns the first run of Western or Arabic-Indic digits found in the text.
 *
 * @param text - Text to search
 * @returns The first digit run, or an empty string when there is none
 */
export const extractDigits = (text) => {
  const match = text.match(PATTERNS.arabicDigits);
  return match ? match[0] : '';
};

/**
 * Splits text into whitespace-separated tokens, making sure every occurrence of a
 * preserved symbol becomes its own token.
 *
 * Symbols are treated as literal strings. The published build compiled each symbol
 * into a RegExp, so symbols such as "(" threw and "." matched every character.
 *
 * @param text - Text to tokenize
 * @param preserveSymbols - Symbols that must be emitted as separate tokens
 * @returns Array of non-empty tokens (empty for blank input)
 */
export const tokenizeText = (text, preserveSymbols = []) => {
  let spaced = text;
  for (const symbol of preserveSymbols) {
    // An empty symbol would split the text into single characters; ignore it.
    if (!symbol) continue;
    // split/join pads every literal occurrence without regex or "$" interpretation.
    spaced = spaced.split(symbol).join(` ${symbol} `);
  }
  return spaced.trim().split(PATTERNS.whitespace).filter(Boolean);
};

/**
 * Fuses a standalone footnote marker with a neighbouring token that embeds the same marker.
 *
 * - standalone followed by embedded with the same digits: the last entry of `result`
 *   is overwritten with the embedded token.
 * - embedded followed by standalone with the same digits: the standalone token is dropped.
 *
 * @param result - Output token list being built (mutated in the first case)
 * @param previousToken - Token most recently added to `result`
 * @param currentToken - Token being considered
 * @returns True when the current token was consumed and must not be appended
 */
export const handleFootnoteFusion = (result, previousToken, currentToken) => {
  const sameDigits = extractDigits(previousToken) === extractDigits(currentToken);
  if (!sameDigits) return false;

  const standaloneThenEmbedded =
    PATTERNS.footnoteStandalone.test(previousToken) && PATTERNS.footnoteEmbedded.test(currentToken);
  if (standaloneThenEmbedded) {
    result[result.length - 1] = currentToken;
    return true;
  }

  return PATTERNS.footnoteEmbedded.test(previousToken) && PATTERNS.footnoteStandalone.test(currentToken);
};

/**
 * Chooses between two aligned tokens when at least one embeds a footnote marker.
 *
 * @param tokenA - First token
 * @param tokenB - Second token
 * @returns The token with the marker (the shorter one, preferring tokenA on ties, when
 *   both have one), or null when neither embeds a marker
 */
export const handleFootnoteSelection = (tokenA, tokenB) => {
  const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);
  const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);

  if (!aHasEmbedded && !bHasEmbedded) return null;
  if (aHasEmbedded && bHasEmbedded) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  return [aHasEmbedded ? tokenA : tokenB];
};

/**
 * Chooses between two aligned tokens when at least one is a standalone footnote marker.
 *
 * @param tokenA - First token
 * @param tokenB - Second token
 * @returns Both tokens (marker first) when only one is a marker, the shorter marker when
 *   both are, or null when neither is
 */
export const handleStandaloneFootnotes = (tokenA, tokenB) => {
  const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);
  const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);

  if (!aIsFootnote && !bIsFootnote) return null;
  if (aIsFootnote && bIsFootnote) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  return aIsFootnote ? [tokenA, tokenB] : [tokenB, tokenA];
};

/** Scores used by the token alignment. */
const ALIGNMENT_SCORES = {
  GAP_PENALTY: -1,
  MISMATCH_PENALTY: -2,
  PERFECT_MATCH: 2,
  SOFT_MATCH: 1,
};

/**
 * Computes the Levenshtein edit distance between two strings, keeping only one
 * previous row sized by the shorter string.
 *
 * @param textA - First string
 * @param textB - Second string
 * @returns Minimum number of single-character insertions, deletions or substitutions
 */
export const calculateLevenshteinDistance = (textA, textB) => {
  if (textA.length === 0) return textB.length;
  if (textB.length === 0) return textA.length;

  // Rows are indexed by the shorter string so the working arrays stay small.
  const [shorter, longer] = textA.length <= textB.length ? [textA, textB] : [textB, textA];

  let previousRow = Array.from({ length: shorter.length + 1 }, (_, index) => index);
  for (let i = 1; i <= longer.length; i++) {
    const currentRow = [i];
    for (let j = 1; j <= shorter.length; j++) {
      const substitutionCost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      currentRow.push(
        Math.min(
          previousRow[j] + 1, // deletion
          currentRow[j - 1] + 1, // insertion
          previousRow[j - 1] + substitutionCost, // substitution
        ),
      );
    }
    previousRow = currentRow;
  }
  return previousRow[shorter.length];
};

/**
 * Converts the edit distance into a ratio relative to the longer string's length.
 *
 * @param textA - First string
 * @param textB - Second string
 * @returns 1 for identical strings (including two empty strings), lower for more edits
 */
export const calculateSimilarity = (textA, textB) => {
  const maxLength = Math.max(textA.length, textB.length) || 1;
  return (maxLength - calculateLevenshteinDistance(textA, textB)) / maxLength;
};

/**
 * Compares two texts after normalizeArabicText has been applied to both.
 *
 * @param textA - First text
 * @param textB - Second text
 * @param threshold - Minimum similarity ratio (inclusive), 0.6 by default
 * @returns True when the normalized similarity is at least `threshold`
 */
export const areSimilarAfterNormalization = (textA, textB, threshold = 0.6) =>
  calculateSimilarity(normalizeArabicText(textA), normalizeArabicText(textB)) >= threshold;

/**
 * Scores a candidate pairing of two tokens for the alignment matrix.
 *
 * @param tokenA - Token from the first sequence
 * @param tokenB - Token from the second sequence
 * @param typoSymbols - Symbols that always count as a soft match
 * @param similarityThreshold - Minimum normalized similarity (inclusive) for a soft match
 * @returns PERFECT_MATCH when equal after normalization, SOFT_MATCH for typo symbols or
 *   similar tokens, MISMATCH_PENALTY otherwise
 */
export const calculateAlignmentScore = (tokenA, tokenB, typoSymbols, similarityThreshold) => {
  const normalizedA = normalizeArabicText(tokenA);
  const normalizedB = normalizeArabicText(tokenB);
  if (normalizedA === normalizedB) return ALIGNMENT_SCORES.PERFECT_MATCH;

  const involvesTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);
  const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;

  // Keep the expression on the same line as `return`: a line break here makes the
  // function return undefined through automatic semicolon insertion.
  return involvesTypoSymbol || isHighlySimilar ? ALIGNMENT_SCORES.SOFT_MATCH : ALIGNMENT_SCORES.MISMATCH_PENALTY;
};

/**
 * Walks the direction markers of a filled alignment matrix from the bottom-right cell
 * back to the origin and returns the aligned pairs in forward order.
 *
 * @param matrix - Matrix of `{ direction, score }` cells
 * @param tokensA - First token sequence (matrix rows)
 * @param tokensB - Second token sequence (matrix columns)
 * @returns Array of `[tokenA | null, tokenB | null]` pairs; null marks a gap
 * @throws Error when a visited cell has no valid direction
 */
export const backtrackAlignment = (matrix, tokensA, tokensB) => {
  const alignment = [];
  let i = tokensA.length;
  let j = tokensB.length;

  while (i > 0 || j > 0) {
    switch (matrix[i][j].direction) {
      case 'diagonal':
        alignment.push([tokensA[--i], tokensB[--j]]);
        break;
      case 'left':
        alignment.push([null, tokensB[--j]]);
        break;
      case 'up':
        alignment.push([tokensA[--i], null]);
        break;
      default:
        throw new Error('Invalid alignment direction');
    }
  }
  return alignment.reverse();
};

/**
 * Globally aligns two token sequences with a dynamic-programming score matrix.
 * On equal scores a diagonal move is preferred, then "up", then "left".
 *
 * @param tokensA - First token sequence
 * @param tokensB - Second token sequence
 * @param typoSymbols - Symbols forwarded to calculateAlignmentScore
 * @param similarityThreshold - Threshold forwarded to calculateAlignmentScore
 * @returns Aligned pairs as produced by backtrackAlignment
 */
export const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold) => {
  const lengthA = tokensA.length;
  const lengthB = tokensB.length;

  const matrix = Array.from({ length: lengthA + 1 }, () =>
    Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),
  );

  // The first column and first row can only be reached through gaps.
  for (let i = 1; i <= lengthA; i++) {
    matrix[i][0] = { direction: 'up', score: i * ALIGNMENT_SCORES.GAP_PENALTY };
  }
  for (let j = 1; j <= lengthB; j++) {
    matrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };
  }

  for (let i = 1; i <= lengthA; i++) {
    for (let j = 1; j <= lengthB; j++) {
      const pairScore = calculateAlignmentScore(tokensA[i - 1], tokensB[j - 1], typoSymbols, similarityThreshold);
      const diagonalScore = matrix[i - 1][j - 1].score + pairScore;
      const upScore = matrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;
      const leftScore = matrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;

      const bestScore = Math.max(diagonalScore, upScore, leftScore);
      let bestDirection = 'left';
      if (bestScore === diagonalScore) {
        bestDirection = 'diagonal';
      } else if (bestScore === upScore) {
        bestDirection = 'up';
      }
      matrix[i][j] = { direction: bestDirection, score: bestScore };
    }
  }

  return backtrackAlignment(matrix, tokensA, tokensB);
};

/** Placeholder that marks a reference whose number is missing. */
const INVALID_FOOTNOTE = '()';

/**
 * Reports whether the text contains an empty "()" or an OCR-confused reference.
 *
 * `search` is used instead of `test` because the shared pattern is global: `test`
 * would resume from the previous call's `lastIndex` and alternate between true and
 * false for the same input.
 *
 * @param text - Text to inspect
 * @returns True when an invalid reference is present
 */
export const hasInvalidFootnotes = (text) => text.search(PATTERNS.invalidReferenceRegex) !== -1;

/**
 * Renders a non-negative integer with Arabic-Indic digits.
 * Digits are mapped directly so the result does not depend on the runtime's locale
 * data and never contains grouping separators.
 *
 * @param num - Number to render
 * @returns The decimal representation using ٠-٩
 */
const numberToArabic = (num) =>
  String(num).replace(/[0-9]/g, (digit) => String.fromCharCode(0x0660 + Number(digit)));

/**
 * Maps a character that OCR tends to produce in place of an Arabic-Indic digit
 * to that digit.
 *
 * @param char - Single character
 * @returns The mapped Arabic-Indic digit, or the character itself when unmapped
 */
const ocrToArabic = (char) => {
  const ocrToArabicMap = {
    1: '\u0661',
    9: '\u0669',
    '.': '\u0660',
    O: '\u0665',
    o: '\u0665',
    V: '\u0667',
    v: '\u0667',
  };
  return ocrToArabicMap[char] || char;
};

/**
 * Parses a reference such as "(١٢٣)" into a number.
 *
 * @param arabicStr - Reference text; parentheses are ignored
 * @returns The parsed value, or 0 when no Arabic-Indic digits can be parsed
 */
const arabicToNumber = (arabicStr) => {
  const western = {
    '\u0660': '0',
    '\u0661': '1',
    '\u0662': '2',
    '\u0663': '3',
    '\u0664': '4',
    '\u0665': '5',
    '\u0666': '6',
    '\u0667': '7',
    '\u0668': '8',
    '\u0669': '9',
  };
  let numStr = '';
  for (const char of arabicStr.replace(/[()]/g, '')) {
    // Unmapped characters are skipped rather than appended as the text "undefined".
    numStr += western[char] ?? '';
  }
  const parsed = parseInt(numStr, 10);
  return isNaN(parsed) ? 0 : parsed;
};

/**
 * Collects the references found in body lines and in footnote lines.
 * OCR-confused references are returned both raw and converted to Arabic-Indic digits.
 *
 * @param lines - Objects with a `text` string and an optional `isFootnote` flag
 * @returns `bodyReferences` / `footnoteReferences` (valid plus converted references) and
 *   `ocrConfusedInBody` / `ocrConfusedInFootnotes` (raw OCR-confused matches)
 */
const extractReferences = (lines) => {
  const collect = (wantFootnotes, pattern) =>
    lines
      .filter((line) => Boolean(line.isFootnote) === wantFootnotes)
      .flatMap((line) => line.text.match(pattern) || []);
  const convert = (refs) => refs.map((ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)));

  const arabicInBody = collect(false, PATTERNS.arabicReferenceRegex);
  const ocrConfusedInBody = collect(false, PATTERNS.ocrConfusedReferenceRegex);
  const arabicInFootnotes = collect(true, PATTERNS.arabicFootnoteReferenceRegex);
  const ocrConfusedInFootnotes = collect(true, PATTERNS.ocrConfusedFootnoteReferenceRegex);

  return {
    bodyReferences: [...arabicInBody, ...convert(ocrConfusedInBody)],
    footnoteReferences: [...arabicInFootnotes, ...convert(ocrConfusedInFootnotes)],
    ocrConfusedInBody,
    ocrConfusedInFootnotes,
  };
};

/**
 * Decides whether the lines need reference correction.
 *
 * @param lines - Lines to inspect
 * @param references - Result of extractReferences for the same lines
 * @returns True when any line has an invalid reference, or when the distinct body
 *   references and distinct footnote references are not the same set
 */
const needsCorrection = (lines, references) => {
  if (lines.some((line) => hasInvalidFootnotes(line.text))) return true;

  const bodySet = new Set(references.bodyReferences);
  const footnoteSet = new Set(references.footnoteReferences);
  if (bodySet.size !== footnoteSet.size) return true;

  return [...bodySet].some((ref) => !footnoteSet.has(ref));
};

/**
 * Repairs footnote references across body and footnote lines.
 *
 * 1. OCR-confused characters inside parenthesised references become Arabic-Indic digits.
 * 2. Each empty "()" in a footnote line takes the next body reference that has no
 *    footnote; each empty "()" in a body line takes the next footnote reference that
 *    has no body counterpart.
 * 3. When no such partner is left, a new number above the highest existing reference
 *    is generated.
 *
 * @param lines - Objects with a `text` string and an optional `isFootnote` flag
 * @returns The same array when nothing needs correcting, otherwise a new array of
 *   shallow-copied lines with corrected `text`
 */
export const correctReferences = (lines) => {
  if (!needsCorrection(lines, extractReferences(lines))) return lines;

  // Pass 1: fix OCR characters, but only inside reference markers.
  const sanitizedLines = lines.map((line) => ({
    ...line,
    text: line.text.replace(/\([.1OV9]+\)/g, (match) => match.replace(/[.1OV9]/g, (char) => ocrToArabic(char))),
  }));

  // Pass 2: work out which references lack a partner on the other side.
  const { bodyReferences, footnoteReferences } = extractReferences(sanitizedLines);
  const bodyRefSet = new Set(bodyReferences);
  const footnoteRefSet = new Set(footnoteReferences);
  const bodyRefsForFootnotes = [...bodyRefSet].filter((ref) => !footnoteRefSet.has(ref));
  const footnoteRefsForBody = [...footnoteRefSet].filter((ref) => !bodyRefSet.has(ref));

  // New numbers continue after the highest reference already in use.
  let nextNumber = Math.max(0, ...[...bodyRefSet, ...footnoteRefSet].map((ref) => arabicToNumber(ref))) + 1;

  // Pass 3: fill every "()" from the matching queue, falling back to a fresh number.
  return sanitizedLines.map((line) => {
    if (!line.text.includes(INVALID_FOOTNOTE)) return line;

    const partners = line.isFootnote ? bodyRefsForFootnotes : footnoteRefsForBody;
    const text = line.text.replace(/\(\)/g, () => partners.shift() ?? `(${numberToArabic(nextNumber++)})`);
    return { ...line, text };
  });
};

/**
 * Picks the token(s) to keep from one aligned pair.
 *
 * @param originalToken - Token from the original text, or null for a gap
 * @param altToken - Token from the alternative text, or null for a gap
 * @param options - Object providing `similarityThreshold` and `typoSymbols`
 * @returns One token in most cases; two when a standalone footnote is paired with text
 */
const selectBestTokens = (originalToken, altToken, { similarityThreshold, typoSymbols }) => {
  if (originalToken === null) return [altToken];
  if (altToken === null) return [originalToken];

  const normalizedOriginal = normalizeArabicText(originalToken);
  const normalizedAlt = normalizeArabicText(altToken);
  // Equal after normalization: keep the original spelling.
  if (normalizedOriginal === normalizedAlt) return [originalToken];

  const footnoteChoice =
    handleFootnoteSelection(originalToken, altToken) || handleStandaloneFootnotes(originalToken, altToken);
  if (footnoteChoice) return footnoteChoice;

  if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {
    const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);
    return typoSymbol ? [typoSymbol] : [originalToken];
  }

  return [calculateSimilarity(normalizedOriginal, normalizedAlt) > similarityThreshold ? originalToken : altToken];
};

/**
 * Collapses adjacent near-duplicate tokens and fuses duplicated footnote markers.
 * Of two adjacent similar tokens the shorter one is kept.
 *
 * @param tokens - Tokens in output order
 * @param highSimilarityThreshold - Similarity at or above which neighbours are duplicates
 * @returns The de-duplicated tokens (the input array itself when it is empty)
 */
const removeDuplicateTokens = (tokens, highSimilarityThreshold) => {
  if (tokens.length === 0) return tokens;

  const result = [];
  for (const token of tokens) {
    if (result.length === 0) {
      result.push(token);
      continue;
    }

    const previous = result[result.length - 1];
    if (areSimilarAfterNormalization(previous, token, highSimilarityThreshold)) {
      if (token.length < previous.length) result[result.length - 1] = token;
      continue;
    }

    if (!handleFootnoteFusion(result, previous, token)) result.push(token);
  }
  return result;
};

/**
 * Tokenizes both texts, aligns them, selects the best token per aligned pair and
 * joins the de-duplicated result with single spaces.
 *
 * @param originalText - Text whose tokens are preferred when candidates are equivalent
 * @param altText - Alternative text used to repair the original
 * @param options - Object providing `typoSymbols`, `similarityThreshold` and
 *   `highSimilarityThreshold`
 * @returns The merged text
 */
export const processTextAlignment = (originalText, altText, options) => {
  const originalTokens = tokenizeText(originalText, options.typoSymbols);
  const altTokens = tokenizeText(altText, options.typoSymbols);

  const merged = alignTokenSequences(
    originalTokens,
    altTokens,
    options.typoSymbols,
    options.similarityThreshold,
  ).flatMap(([originalToken, altToken]) => selectBestTokens(originalToken, altToken, options));

  return removeDuplicateTokens(merged, options.highSimilarityThreshold).join(' ');
};

/**
 * Entry point for typo correction: applies the default thresholds and delegates to
 * processTextAlignment.
 *
 * @param original - Text to repair
 * @param correction - Alternative text to take corrections from
 * @param options - `typoSymbols` plus optional `highSimilarityThreshold` (default 0.8)
 *   and `similarityThreshold` (default 0.6)
 * @returns The merged text
 */
export const fixTypo = (
  original,
  correction,
  { highSimilarityThreshold = 0.8, similarityThreshold = 0.6, typoSymbols },
) => processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });
|
|
2
2
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/textUtils.ts","../src/similarity.ts","../src/footnotes.ts","../src/index.ts"],"sourcesContent":["/**\n * Collection of regex patterns used throughout the library for text processing\n */\nexport const PATTERNS = {\n /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */\n arabicDigits: /[0-9\\u0660-\\u0669]+/,\n\n /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\\([\\u0660-\\u0669]+\\) */\n arabicFootnoteReferenceRegex: /^\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */\n arabicLettersAndDigits: /[0-9\\u0621-\\u063A\\u0641-\\u064A\\u0660-\\u0669]+/g,\n\n /** Matches Arabic punctuation marks and whitespace characters */\n arabicPunctuationAndWhitespace: /[\\s\\u060C\\u061B\\u061F\\u06D4]+/,\n\n /** Matches footnote references with Arabic-Indic digits in parentheses: \\([\\u0660-\\u0669]+\\) */\n arabicReferenceRegex: /\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic diacritical marks (harakat, tanween, etc.) 
*/\n diacritics: /[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]/g,\n\n /** Matches embedded footnotes within text: \\([0-9\\u0660-\\u0669]+\\) */\n footnoteEmbedded: /\\([0-9\\u0660-\\u0669]+\\)/,\n\n /** Matches standalone footnote markers at line start/end: ^\\(?[0-9\\u0660-\\u0669]+\\)?[،.]?$ */\n footnoteStandalone: /^\\(?[0-9\\u0660-\\u0669]+\\)?[،.]?$/,\n\n /** Matches invalid/problematic footnote references: empty \"()\" or OCR-confused endings */\n invalidReferenceRegex: /\\(\\)|\\([.1OV9]+\\)/g, // Combined pattern for detecting any invalid/problematic references\n\n /** Matches OCR-confused footnote references at line start with characters like .1OV9 */\n ocrConfusedFootnoteReferenceRegex: /^\\([.1OV9]+\\)/g,\n\n /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */\n ocrConfusedReferenceRegex: /\\([.1OV9]+\\)/g,\n\n /** Matches Arabic tatweel (kashida) character used for text stretching */\n tatweel: /\\u0640/g,\n\n /** Matches one or more whitespace characters */\n whitespace: /\\s+/,\n};\n\n/**\n * Normalizes Arabic text by removing diacritics, and tatweel marks.\n * This normalization enables better text comparison by focusing on core characters\n * while ignoring decorative elements that don't affect meaning.\n *\n * @param text - Arabic text to normalize\n * @returns Normalized text with diacritics, tatweel, and basic tags removed\n * @example\n * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'\n */\nexport const normalizeArabicText = (text: string): string => {\n return text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();\n};\n\n/**\n * Extracts the first sequence of Arabic or Western digits from text.\n * Used primarily for footnote number comparison to match related footnote elements.\n *\n * @param text - Text containing digits to extract\n * @returns First digit sequence found, or empty string if none found\n * @example\n * 
extractDigits('(٥)أخرجه البخاري') // Returns '٥'\n * extractDigits('See note (123)') // Returns '123'\n */\nexport const extractDigits = (text: string): string => {\n const match = text.match(PATTERNS.arabicDigits);\n return match ? match[0] : '';\n};\n\n/**\n * Tokenizes text into individual words while preserving special symbols.\n * Removes HTML tags, adds spacing around preserved symbols to ensure they\n * are tokenized separately, then splits on whitespace.\n *\n * @param text - Text to tokenize\n * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens\n * @returns Array of tokens, or empty array if input is empty/whitespace\n * @example\n * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']\n */\nexport const tokenizeText = (text: string, preserveSymbols: string[] = []): string[] => {\n let processedText = text;\n\n // Add spaces around each preserve symbol to ensure they're tokenized separately\n for (const symbol of preserveSymbols) {\n const symbolRegex = new RegExp(symbol, 'g');\n processedText = processedText.replace(symbolRegex, ` ${symbol} `);\n }\n\n return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);\n};\n\n/**\n * Handles fusion of standalone and embedded footnotes during token processing.\n * Detects patterns where standalone footnotes should be merged with embedded ones\n * or where trailing standalone footnotes should be skipped.\n *\n * @param result - Current result array being built\n * @param previousToken - The previous token in the sequence\n * @param currentToken - The current token being processed\n * @returns True if the current token was handled (fused or skipped), false otherwise\n * @example\n * // (٥) + (٥)أخرجه → result gets (٥)أخرجه\n * // (٥)أخرجه + (٥) → (٥) is skipped\n */\nexport const handleFootnoteFusion = (result: string[], previousToken: string, currentToken: string): boolean => {\n const prevIsStandalone = 
PATTERNS.footnoteStandalone.test(previousToken);\n const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);\n const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);\n const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);\n\n const prevDigits = extractDigits(previousToken);\n const currDigits = extractDigits(currentToken);\n\n // Replace standalone with fused version: (٥) + (٥)أخرجه → (٥)أخرجه\n if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {\n result[result.length - 1] = currentToken;\n return true;\n }\n\n // Skip trailing standalone: (٥)أخرجه + (٥) → (٥)أخرجه\n if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) {\n return true;\n }\n\n return false;\n};\n\n/**\n * Handles selection logic for tokens with embedded footnotes during alignment.\n * Prefers tokens that contain embedded footnotes over plain text, and among\n * tokens with embedded footnotes, prefers the shorter one.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']\n * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']\n */\nexport const handleFootnoteSelection = (tokenA: string, tokenB: string): null | string[] => {\n const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);\n const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);\n\n if (aHasEmbedded && !bHasEmbedded) return [tokenA];\n if (bHasEmbedded && !aHasEmbedded) return [tokenB];\n if (aHasEmbedded && bHasEmbedded) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n\n/**\n * Handles selection logic for standalone footnote tokens during alignment.\n * Manages cases where one or both tokens are standalone footnotes, preserving\n * both tokens when one is a footnote and the other is regular text.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']\n * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)\n */\nexport const handleStandaloneFootnotes = (tokenA: string, tokenB: string): null | string[] => {\n const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);\n const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);\n\n if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];\n if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];\n if (aIsFootnote && bIsFootnote) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n","import { normalizeArabicText } from './textUtils';\n\n// Alignment scoring constants\nconst ALIGNMENT_SCORES = {\n GAP_PENALTY: -1,\n MISMATCH_PENALTY: -2,\n PERFECT_MATCH: 2,\n SOFT_MATCH: 1,\n};\n\n/**\n * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.\n * The Levenshtein distance is the minimum number of single-character edits (insertions,\n * deletions, or substitutions) required to change one string into another.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Minimum edit distance between the two strings\n * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths\n * @example\n * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3\n * calculateLevenshteinDistance('', 'hello') // Returns 5\n */\nexport const calculateLevenshteinDistance = (textA: string, textB: string): number => {\n const lengthA = textA.length;\n const lengthB = textB.length;\n\n if (lengthA === 0) {\n return lengthB;\n }\n\n if (lengthB === 0) {\n return lengthA;\n }\n\n // Use shorter string for the array to optimize space\n const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];\n const shortLen = shorter.length;\n const longLen = longer.length;\n\n let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);\n\n for (let i = 1; i <= longLen; i++) {\n const currentRow = [i];\n\n for (let j = 1; j <= shortLen; j++) {\n const substitutionCost = longer[i - 1] === shorter[j - 1] ? 
0 : 1;\n const minCost = Math.min(\n previousRow[j] + 1, // deletion\n currentRow[j - 1] + 1, // insertion\n previousRow[j - 1] + substitutionCost, // substitution\n );\n currentRow.push(minCost);\n }\n\n previousRow = currentRow;\n }\n\n return previousRow[shortLen];\n};\n\n/**\n * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.\n * Uses Levenshtein distance normalized by the length of the longer string.\n * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)\n * @example\n * calculateSimilarity('hello', 'hello') // Returns 1.0\n * calculateSimilarity('hello', 'help') // Returns 0.6\n */\nexport const calculateSimilarity = (textA: string, textB: string): number => {\n const maxLength = Math.max(textA.length, textB.length) || 1;\n const distance = calculateLevenshteinDistance(textA, textB);\n return (maxLength - distance) / maxLength;\n};\n\n/**\n * Checks if two texts are similar after Arabic normalization.\n * Normalizes both texts by removing diacritics and decorative elements,\n * then compares their similarity against the provided threshold.\n *\n * @param textA - First text to compare\n * @param textB - Second text to compare\n * @param threshold - Similarity threshold (0.0 to 1.0)\n * @returns True if normalized texts meet the similarity threshold\n * @example\n * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true\n */\nexport const areSimilarAfterNormalization = (textA: string, textB: string, threshold: number = 0.6): boolean => {\n const normalizedA = normalizeArabicText(textA);\n const normalizedB = normalizeArabicText(textB);\n return calculateSimilarity(normalizedA, normalizedB) >= threshold;\n};\n\n/**\n * Calculates alignment score for two tokens in sequence alignment.\n * Uses different scoring 
criteria: perfect match after normalization gets highest score,\n * typo symbols or highly similar tokens get soft match score, mismatches get penalty.\n *\n * @param tokenA - First token to score\n * @param tokenB - Second token to score\n * @param typoSymbols - Array of special symbols that get preferential treatment\n * @param similarityThreshold - Threshold for considering tokens highly similar\n * @returns Alignment score (higher is better match)\n * @example\n * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)\n * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity\n */\nexport const calculateAlignmentScore = (\n tokenA: string,\n tokenB: string,\n typoSymbols: string[],\n similarityThreshold: number,\n): number => {\n const normalizedA = normalizeArabicText(tokenA);\n const normalizedB = normalizeArabicText(tokenB);\n\n // Perfect match after normalization\n if (normalizedA === normalizedB) {\n return ALIGNMENT_SCORES.PERFECT_MATCH;\n }\n\n // Check if either token is a typo symbol or high similarity\n const isTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);\n const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;\n\n if (isTypoSymbol || isHighlySimilar) {\n return ALIGNMENT_SCORES.SOFT_MATCH;\n }\n\n return ALIGNMENT_SCORES.MISMATCH_PENALTY;\n};\n\ntype AlignedTokenPair = [null | string, null | string];\n\ntype AlignmentCell = {\n direction: 'diagonal' | 'left' | 'up' | null;\n score: number;\n};\n\n/**\n * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.\n * Follows the directional indicators in the matrix to build the sequence of aligned\n * token pairs from the Needleman-Wunsch algorithm.\n *\n * @param matrix - Scoring matrix with directional information from alignment\n * @param tokensA - First sequence of tokens\n * @param tokensB - Second sequence of tokens\n * @returns Array of aligned 
token pairs, where null indicates a gap\n * @throws Error if invalid alignment direction is encountered\n */\nexport const backtrackAlignment = (\n matrix: AlignmentCell[][],\n tokensA: string[],\n tokensB: string[],\n): AlignedTokenPair[] => {\n const alignment: AlignedTokenPair[] = [];\n let i = tokensA.length;\n let j = tokensB.length;\n\n while (i > 0 || j > 0) {\n const currentCell = matrix[i][j];\n\n switch (currentCell.direction) {\n case 'diagonal':\n alignment.push([tokensA[--i], tokensB[--j]]);\n break;\n case 'left':\n alignment.push([null, tokensB[--j]]);\n break;\n case 'up':\n alignment.push([tokensA[--i], null]);\n break;\n default:\n throw new Error('Invalid alignment direction');\n }\n }\n\n return alignment.reverse();\n};\n\n/**\n * Performs global sequence alignment using the Needleman-Wunsch algorithm.\n * Aligns two token sequences to find the optimal pairing that maximizes\n * the total alignment score, handling insertions, deletions, and substitutions.\n *\n * @param tokensA - First sequence of tokens to align\n * @param tokensB - Second sequence of tokens to align\n * @param typoSymbols - Special symbols that affect scoring\n * @param similarityThreshold - Threshold for high similarity scoring\n * @returns Array of aligned token pairs, with null indicating gaps\n * @example\n * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)\n * // Returns [['a', 'a'], ['b', 'c']]\n */\nexport const alignTokenSequences = (\n tokensA: string[],\n tokensB: string[],\n typoSymbols: string[],\n similarityThreshold: number,\n): AlignedTokenPair[] => {\n const lengthA = tokensA.length;\n const lengthB = tokensB.length;\n\n // Initialize scoring matrix\n const scoringMatrix: AlignmentCell[][] = Array.from({ length: lengthA + 1 }, () =>\n Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),\n );\n\n // Initialize first row and column\n for (let i = 1; i <= lengthA; i++) {\n scoringMatrix[i][0] = { direction: 'up', score: i * 
ALIGNMENT_SCORES.GAP_PENALTY };\n }\n for (let j = 1; j <= lengthB; j++) {\n scoringMatrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n\n // Fill scoring matrix\n for (let i = 1; i <= lengthA; i++) {\n for (let j = 1; j <= lengthB; j++) {\n const alignmentScore = calculateAlignmentScore(\n tokensA[i - 1],\n tokensB[j - 1],\n typoSymbols,\n similarityThreshold,\n );\n\n const diagonalScore = scoringMatrix[i - 1][j - 1].score + alignmentScore;\n const upScore = scoringMatrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;\n const leftScore = scoringMatrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;\n\n const bestScore = Math.max(diagonalScore, upScore, leftScore);\n let bestDirection: 'diagonal' | 'left' | 'up' = 'left';\n\n if (bestScore === diagonalScore) {\n bestDirection = 'diagonal';\n } else if (bestScore === upScore) {\n bestDirection = 'up';\n }\n\n scoringMatrix[i][j] = { direction: bestDirection, score: bestScore };\n }\n }\n\n // Backtrack to build alignment\n return backtrackAlignment(scoringMatrix, tokensA, tokensB);\n};\n","import { PATTERNS } from './textUtils';\n\nconst INVALID_FOOTNOTE = '()';\n\n/**\n * Checks if the given text contains invalid footnote references.\n * Invalid footnotes include empty parentheses \"()\" or OCR-confused characters\n * like \".1OV9\" that were misrecognized instead of Arabic numerals.\n *\n * @param text - Text to check for invalid footnote patterns\n * @returns True if text contains invalid footnote references, false otherwise\n * @example\n * hasInvalidFootnotes('This text has ()') // Returns true\n * hasInvalidFootnotes('This text has (١)') // Returns false\n * hasInvalidFootnotes('OCR mistake (O)') // Returns true\n */\nexport const hasInvalidFootnotes = (text: string): boolean => {\n return PATTERNS.invalidReferenceRegex.test(text);\n};\n\n// Arabic number formatter instance\nconst arabicFormatter = new Intl.NumberFormat('ar-SA');\n\n/**\n * Converts a number to Arabic-Indic 
numerals using the Intl.NumberFormat API.\n * Uses the 'ar-SA' locale to ensure proper Arabic numeral formatting.\n *\n * @param num - The number to convert to Arabic numerals\n * @returns String representation using Arabic-Indic digits (٠-٩)\n * @example\n * numberToArabic(123) // Returns '١٢٣'\n * numberToArabic(5) // Returns '٥'\n */\nconst numberToArabic = (num: number): string => {\n return arabicFormatter.format(num);\n};\n\n/**\n * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.\n * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits.\n *\n * @param char - Single character that may be an OCR mistake\n * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists\n * @example\n * ocrToArabic('O') // Returns '٥' (O often confused with ٥)\n * ocrToArabic('1') // Returns '١' (1 often confused with ١)\n * ocrToArabic('.') // Returns '٠' (dot often confused with ٠)\n */\nconst ocrToArabic = (char: string): string => {\n const ocrToArabicMap: { [key: string]: string } = {\n '1': '١',\n '9': '٩',\n '.': '٠',\n O: '٥',\n o: '٥',\n V: '٧',\n v: '٧',\n };\n return ocrToArabicMap[char] || char;\n};\n\n/**\n * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number.\n * Removes parentheses and converts each Arabic-Indic digit to its Western equivalent.\n *\n * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(١٢٣)'\n * @returns Parsed number, or 0 if parsing fails\n * @example\n * arabicToNumber('(١٢٣)') // Returns 123\n * arabicToNumber('(٥)') // Returns 5\n * arabicToNumber('invalid') // Returns 0\n */\nconst arabicToNumber = (arabicStr: string): number => {\n const lookup: { [key: string]: string } = {\n '٠': '0',\n '١': '1',\n '٢': '2',\n '٣': '3',\n '٤': '4',\n '٥': '5',\n '٦': '6',\n '٧': '7',\n '٨': '8',\n '٩': '9',\n };\n const digits = arabicStr.replace(/[()]/g, '');\n let numStr = '';\n for (const char 
of digits) {\n numStr += lookup[char];\n }\n const parsed = parseInt(numStr, 10);\n return isNaN(parsed) ? 0 : parsed;\n};\n\ntype TextLine = {\n isFootnote?: boolean;\n text: string;\n};\n\n/**\n * Extracts all footnote references from text lines, categorizing them by type and location.\n * Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes.\n *\n * @param lines - Array of text line objects with optional isFootnote flag\n * @returns Object containing categorized reference arrays:\n * - bodyReferences: All valid references found in body text\n * - footnoteReferences: All valid references found in footnotes\n * - ocrConfusedInBody: OCR-confused references in body text (for tracking)\n * - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)\n * @example\n * const lines = [\n * { text: 'Body with (١) and (O)', isFootnote: false },\n * { text: '(١) Footnote text', isFootnote: true }\n * ];\n * const refs = extractReferences(lines);\n * // refs.bodyReferences contains ['(١)', '(٥)'] - OCR 'O' converted to '٥'\n */\nconst extractReferences = (lines: TextLine[]) => {\n const arabicReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []);\n\n const ocrConfusedReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []);\n\n const arabicReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []);\n\n const ocrConfusedReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []);\n\n const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) =>\n ref.replace(/[.1OV9]/g, 
(char) => ocrToArabic(char)),\n );\n\n return {\n bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs],\n footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs],\n ocrConfusedInBody: ocrConfusedReferencesInBody,\n ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes,\n };\n};\n\n/**\n * Determines if footnote reference correction is needed by checking for:\n * 1. Invalid footnote patterns (empty parentheses, OCR mistakes)\n * 2. Mismatched sets of references between body text and footnotes\n * 3. Different counts of references in body vs footnotes\n *\n * @param lines - Array of text line objects to analyze\n * @param references - Extracted reference data from extractReferences()\n * @returns True if correction is needed, false if references are already correct\n * @example\n * const lines = [{ text: 'Text with ()', isFootnote: false }];\n * const refs = extractReferences(lines);\n * needsCorrection(lines, refs) // Returns true due to invalid \"()\" reference\n */\nconst needsCorrection = (lines: TextLine[], references: ReturnType<typeof extractReferences>) => {\n const mistakenReferences = lines.some((line) => hasInvalidFootnotes(line.text));\n if (mistakenReferences) return true;\n\n const bodySet = new Set(references.bodyReferences);\n const footnoteSet = new Set(references.footnoteReferences);\n if (bodySet.size !== footnoteSet.size) return true;\n\n // Check if the sets contain the same elements\n for (const ref of bodySet) {\n if (!footnoteSet.has(ref)) {\n return true;\n }\n }\n\n return false;\n};\n\n/**\n * Corrects footnote references in an array of text lines by:\n * 1. Converting OCR-confused characters to proper Arabic numerals\n * 2. Filling in empty \"()\" references with appropriate numbers\n * 3. Ensuring footnote references in body text match those in footnotes\n * 4. 
Generating new reference numbers when needed\n *\n * @param lines - Array of text line objects, each with optional isFootnote flag\n * @returns Array of corrected text lines with proper footnote references\n * @example\n * const lines = [\n * { text: 'Main text with ()', isFootnote: false },\n * { text: '() This is a footnote', isFootnote: true }\n * ];\n * const corrected = correctReferences(lines);\n * // Returns lines with \"()\" replaced by proper Arabic numerals like \"(١)\"\n */\nexport const correctReferences = (lines: TextLine[]): TextLine[] => {\n const initialReferences = extractReferences(lines);\n\n if (!needsCorrection(lines, initialReferences)) {\n return lines;\n }\n\n // Pass 1: Sanitize lines by correcting only OCR characters inside reference markers.\n const sanitizedLines = lines.map((line) => {\n let updatedText = line.text;\n // This regex finds the full reference, e.g., \"(O)\" or \"(1)\"\n const ocrRegex = /\\([.1OV9]+\\)/g;\n updatedText = updatedText.replace(ocrRegex, (match) => {\n // This replace acts *inside* the found match, e.g., on \"O\" or \"1\"\n return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));\n });\n return { ...line, text: updatedText };\n });\n\n // Pass 2: Analyze the sanitized lines to get a clear and accurate picture of references.\n const cleanReferences = extractReferences(sanitizedLines);\n\n // Step 3: Create queues of \"unmatched\" references for two-way pairing.\n const bodyRefSet = new Set(cleanReferences.bodyReferences);\n const footnoteRefSet = new Set(cleanReferences.footnoteReferences);\n\n const uniqueBodyRefs = [...new Set(cleanReferences.bodyReferences)];\n const uniqueFootnoteRefs = [...new Set(cleanReferences.footnoteReferences)];\n\n // Queue 1: Body references available for footnotes.\n const bodyRefsForFootnotes = uniqueBodyRefs.filter((ref) => !footnoteRefSet.has(ref));\n // Queue 2: Footnote references available for the body.\n const footnoteRefsForBody = uniqueFootnoteRefs.filter((ref) => 
!bodyRefSet.has(ref));\n\n // Step 4: Determine the starting point for any completely new reference numbers.\n const allRefs = [...bodyRefSet, ...footnoteRefSet];\n const maxRefNum = allRefs.length > 0 ? Math.max(0, ...allRefs.map((ref) => arabicToNumber(ref))) : 0;\n const referenceCounter = { count: maxRefNum + 1 };\n\n // Step 5: Map over the sanitized lines, filling in '()' using the queues.\n return sanitizedLines.map((line) => {\n if (!line.text.includes(INVALID_FOOTNOTE)) {\n return line;\n }\n let updatedText = line.text;\n\n updatedText = updatedText.replace(/\\(\\)/g, () => {\n if (line.isFootnote) {\n const availableRef = bodyRefsForFootnotes.shift();\n if (availableRef) return availableRef;\n } else {\n // It's body text\n const availableRef = footnoteRefsForBody.shift();\n if (availableRef) return availableRef;\n }\n\n // If no available partner reference exists, generate a new one.\n const newRef = `(${numberToArabic(referenceCounter.count)})`;\n referenceCounter.count++;\n return newRef;\n });\n\n return { ...line, text: updatedText };\n });\n};\n","import type { FixTypoOptions } from './types';\n\nimport { alignTokenSequences, areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport {\n handleFootnoteFusion,\n handleFootnoteSelection,\n handleStandaloneFootnotes,\n normalizeArabicText,\n tokenizeText,\n} from './textUtils';\n\n/**\n * Selects the best token(s) from an aligned pair during typo correction.\n * Uses various heuristics including normalization, footnote handling, typo symbols,\n * and similarity scores to determine which token(s) to keep.\n *\n * @param originalToken - Token from the original OCR text (may be null)\n * @param altToken - Token from the alternative OCR text (may be null)\n * @param options - Configuration options including typo symbols and similarity threshold\n * @returns Array of selected tokens (usually contains one token, but may contain multiple)\n */\nconst selectBestTokens = (\n 
originalToken: null | string,\n altToken: null | string,\n { similarityThreshold, typoSymbols }: FixTypoOptions,\n): string[] => {\n // Handle missing tokens\n if (originalToken === null) {\n return [altToken!];\n }\n if (altToken === null) {\n return [originalToken];\n }\n\n // Preserve original if same after normalization (keeps diacritics)\n if (normalizeArabicText(originalToken) === normalizeArabicText(altToken)) {\n return [originalToken];\n }\n\n // Handle embedded footnotes\n const result = handleFootnoteSelection(originalToken, altToken);\n if (result) return result;\n\n // Handle standalone footnotes\n const footnoteResult = handleStandaloneFootnotes(originalToken, altToken);\n if (footnoteResult) return footnoteResult;\n\n // Handle typo symbols - prefer the symbol itself\n if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {\n const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);\n return typoSymbol ? [typoSymbol] : [originalToken];\n }\n\n // Choose based on similarity\n const normalizedOriginal = normalizeArabicText(originalToken);\n const normalizedAlt = normalizeArabicText(altToken);\n const similarity = calculateSimilarity(normalizedOriginal, normalizedAlt);\n\n return [similarity > similarityThreshold ? originalToken : altToken];\n};\n\n/**\n * Removes duplicate tokens and handles footnote fusion in post-processing.\n * Identifies and removes tokens that are highly similar while preserving\n * important variations. 
Also handles special cases like footnote merging.\n *\n * @param tokens - Array of tokens to process\n * @param highSimilarityThreshold - Threshold for detecting duplicates (0.0 to 1.0)\n * @returns Array of tokens with duplicates removed and footnotes fused\n */\nconst removeDuplicateTokens = (tokens: string[], highSimilarityThreshold: number): string[] => {\n if (tokens.length === 0) {\n return tokens;\n }\n\n const result: string[] = [];\n\n for (const currentToken of tokens) {\n if (result.length === 0) {\n result.push(currentToken);\n continue;\n }\n\n const previousToken = result.at(-1)!;\n\n // Handle ordinary echoes (similar tokens)\n if (areSimilarAfterNormalization(previousToken, currentToken, highSimilarityThreshold)) {\n // Keep the shorter version\n if (currentToken.length < previousToken.length) {\n result[result.length - 1] = currentToken;\n }\n continue;\n }\n\n // Handle footnote fusion cases\n if (handleFootnoteFusion(result, previousToken, currentToken)) {\n continue;\n }\n\n result.push(currentToken);\n }\n\n return result;\n};\n\n/**\n * Processes text alignment between original and alternate OCR results to fix typos.\n * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,\n * then selects the best tokens and performs post-processing.\n *\n * @param originalText - Original OCR text that may contain typos\n * @param altText - Reference text from alternate OCR for comparison\n * @param options - Configuration options for alignment and selection\n * @returns Corrected text with typos fixed\n */\nexport const processTextAlignment = (originalText: string, altText: string, options: FixTypoOptions): string => {\n const originalTokens = tokenizeText(originalText, options.typoSymbols);\n const altTokens = tokenizeText(altText, options.typoSymbols);\n\n // Align token sequences\n const alignedPairs = alignTokenSequences(\n originalTokens,\n altTokens,\n options.typoSymbols,\n options.similarityThreshold,\n );\n\n // Select best tokens 
from each aligned pair\n const mergedTokens = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));\n\n // Remove duplicates and handle post-processing\n const finalTokens = removeDuplicateTokens(mergedTokens, options.highSimilarityThreshold);\n\n return finalTokens.join(' ');\n};\n\nexport const fixTypo = (\n original: string,\n correction: string,\n {\n highSimilarityThreshold = 0.8,\n similarityThreshold = 0.6,\n typoSymbols,\n }: Partial<FixTypoOptions> & Pick<FixTypoOptions, 'typoSymbols'>,\n) => {\n return processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });\n};\n\nexport * from './footnotes';\nexport * from './similarity';\nexport * from './textUtils';\n"],"mappings":"AAGO,IAAMA,EAAW,CAEpB,aAAc,sBAGd,6BAA8B,yBAG9B,uBAAwB,iDAGxB,+BAAgC,gCAGhC,qBAAsB,wBAGtB,WAAY,mDAGZ,iBAAkB,0BAGlB,mBAAoB,mCAGpB,sBAAuB,qBAGvB,kCAAmC,iBAGnC,0BAA2B,gBAG3B,QAAS,UAGT,WAAY,KAChB,EAYaC,EAAuBC,GACzBA,EAAK,QAAQF,EAAS,QAAS,EAAE,EAAE,QAAQA,EAAS,WAAY,EAAE,EAAE,KAAK,EAavEG,EAAiBD,GAAyB,CACnD,IAAME,EAAQF,EAAK,MAAMF,EAAS,YAAY,EAC9C,OAAOI,EAAQA,EAAM,CAAC,EAAI,EAC9B,EAaaC,EAAe,CAACH,EAAcI,EAA4B,CAAC,IAAgB,CACpF,IAAIC,EAAgBL,EAGpB,QAAWM,KAAUF,EAAiB,CAClC,IAAMG,EAAc,IAAI,OAAOD,EAAQ,GAAG,EAC1CD,EAAgBA,EAAc,QAAQE,EAAa,IAAID,CAAM,GAAG,CACpE,CAEA,OAAOD,EAAc,KAAK,EAAE,MAAMP,EAAS,UAAU,EAAE,OAAO,OAAO,CACzE,EAeaU,EAAuB,CAACC,EAAkBC,EAAuBC,IAAkC,CAC5G,IAAMC,EAAmBd,EAAS,mBAAmB,KAAKY,CAAa,EACjEG,EAAkBf,EAAS,iBAAiB,KAAKa,CAAY,EAC7DG,EAAmBhB,EAAS,mBAAmB,KAAKa,CAAY,EAChEI,EAAkBjB,EAAS,iBAAiB,KAAKY,CAAa,EAE9DM,EAAaf,EAAcS,CAAa,EACxCO,EAAahB,EAAcU,CAAY,EAG7C,OAAIC,GAAoBC,GAAmBG,IAAeC,GACtDR,EAAOA,EAAO,OAAS,CAAC,EAAIE,EACrB,IAIP,GAAAI,GAAmBD,GAAoBE,IAAeC,EAK9D,EAcaC,EAA0B,CAACC,EAAgBC,IAAoC,CACxF,IAAMC,EAAevB,EAAS,iBAAiB,KAAKqB,CAAM,EACpDG,EAAexB,EAAS,iBAAiB,KAAKsB,CAAM,EAE1D,OAAIC,GAAgB,CAACC,EAAqB,CAACH,CAAM,EAC7CG,GAAgB,CAACD,EAAqB,CAACD,CAAM,EAC7CC,GAAgBC,EACT,CAACH,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,EAcaG,EAA4B,CAA
CJ,EAAgBC,IAAoC,CAC1F,IAAMI,EAAc1B,EAAS,mBAAmB,KAAKqB,CAAM,EACrDM,EAAc3B,EAAS,mBAAmB,KAAKsB,CAAM,EAE3D,OAAII,GAAe,CAACC,EAAoB,CAACN,EAAQC,CAAM,EACnDK,GAAe,CAACD,EAAoB,CAACJ,EAAQD,CAAM,EACnDK,GAAeC,EACR,CAACN,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,ECjLA,IAAMM,EAAmB,CACrB,YAAa,GACb,iBAAkB,GAClB,cAAe,EACf,WAAY,CAChB,EAeaC,EAA+B,CAACC,EAAeC,IAA0B,CAClF,IAAMC,EAAUF,EAAM,OAChBG,EAAUF,EAAM,OAEtB,GAAIC,IAAY,EACZ,OAAOC,EAGX,GAAIA,IAAY,EACZ,OAAOD,EAIX,GAAM,CAACE,EAASC,CAAM,EAAIH,GAAWC,EAAU,CAACH,EAAOC,CAAK,EAAI,CAACA,EAAOD,CAAK,EACvEM,EAAWF,EAAQ,OACnBG,EAAUF,EAAO,OAEnBG,EAAc,MAAM,KAAK,CAAE,OAAQF,EAAW,CAAE,EAAG,CAACG,EAAGC,IAAUA,CAAK,EAE1E,QAASC,EAAI,EAAGA,GAAKJ,EAASI,IAAK,CAC/B,IAAMC,EAAa,CAACD,CAAC,EAErB,QAASE,EAAI,EAAGA,GAAKP,EAAUO,IAAK,CAChC,IAAMC,EAAmBT,EAAOM,EAAI,CAAC,IAAMP,EAAQS,EAAI,CAAC,EAAI,EAAI,EAC1DE,EAAU,KAAK,IACjBP,EAAYK,CAAC,EAAI,EACjBD,EAAWC,EAAI,CAAC,EAAI,EACpBL,EAAYK,EAAI,CAAC,EAAIC,CACzB,EACAF,EAAW,KAAKG,CAAO,CAC3B,CAEAP,EAAcI,CAClB,CAEA,OAAOJ,EAAYF,CAAQ,CAC/B,EAcaU,EAAsB,CAAChB,EAAeC,IAA0B,CACzE,IAAMgB,EAAY,KAAK,IAAIjB,EAAM,OAAQC,EAAM,MAAM,GAAK,EACpDiB,EAAWnB,EAA6BC,EAAOC,CAAK,EAC1D,OAAQgB,EAAYC,GAAYD,CACpC,EAcaE,EAA+B,CAACnB,EAAeC,EAAemB,EAAoB,KAAiB,CAC5G,IAAMC,EAAcC,EAAoBtB,CAAK,EACvCuB,EAAcD,EAAoBrB,CAAK,EAC7C,OAAOe,EAAoBK,EAAaE,CAAW,GAAKH,CAC5D,EAgBaI,EAA0B,CACnCC,EACAC,EACAC,EACAC,IACS,CACT,IAAMP,EAAcC,EAAoBG,CAAM,EACxCF,EAAcD,EAAoBI,CAAM,EAG9C,GAAIL,IAAgBE,EAChB,OAAOzB,EAAiB,cAI5B,IAAM+B,EAAeF,EAAY,SAASF,CAAM,GAAKE,EAAY,SAASD,CAAM,EAC1EI,EAAkBd,EAAoBK,EAAaE,CAAW,GAAKK,EAEzE,OAAIC,GAAgBC,EACThC,EAAiB,WAGrBA,EAAiB,gBAC5B,EAoBaiC,EAAqB,CAC9BC,EACAC,EACAC,IACqB,CACrB,IAAMC,EAAgC,CAAC,EACnCxB,EAAIsB,EAAQ,OACZpB,EAAIqB,EAAQ,OAEhB,KAAOvB,EAAI,GAAKE,EAAI,GAGhB,OAFoBmB,EAAOrB,CAAC,EAAEE,CAAC,EAEX,UAAW,CAC3B,IAAK,WACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAGuB,EAAQ,EAAErB,CAAC,CAAC,CAAC,EAC3C,MACJ,IAAK,OACDsB,EAAU,KAAK,CAAC,KAAMD,EAAQ,EAAErB,CAAC,CAAC,CAAC,EACnC,MACJ,IAAK,KACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAG,IAAI,CAAC,EACnC,MACJ,QACI,MAAM,IAAI,MAAM,
6BAA6B,CACrD,CAGJ,OAAOwB,EAAU,QAAQ,CAC7B,EAgBaC,EAAsB,CAC/BH,EACAC,EACAP,EACAC,IACqB,CACrB,IAAM1B,EAAU+B,EAAQ,OAClB9B,EAAU+B,EAAQ,OAGlBG,EAAmC,MAAM,KAAK,CAAE,OAAQnC,EAAU,CAAE,EAAG,IACzE,MAAM,KAAK,CAAE,OAAQC,EAAU,CAAE,EAAG,KAAO,CAAE,UAAW,KAAM,MAAO,CAAE,EAAE,CAC7E,EAGA,QAASQ,EAAI,EAAGA,GAAKT,EAASS,IAC1B0B,EAAc1B,CAAC,EAAE,CAAC,EAAI,CAAE,UAAW,KAAM,MAAOA,EAAIb,EAAiB,WAAY,EAErF,QAASe,EAAI,EAAGA,GAAKV,EAASU,IAC1BwB,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW,OAAQ,MAAOA,EAAIf,EAAiB,WAAY,EAIvF,QAASa,EAAI,EAAGA,GAAKT,EAASS,IAC1B,QAASE,EAAI,EAAGA,GAAKV,EAASU,IAAK,CAC/B,IAAMyB,EAAiBd,EACnBS,EAAQtB,EAAI,CAAC,EACbuB,EAAQrB,EAAI,CAAC,EACbc,EACAC,CACJ,EAEMW,EAAgBF,EAAc1B,EAAI,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQyB,EACpDE,EAAUH,EAAc1B,EAAI,CAAC,EAAEE,CAAC,EAAE,MAAQf,EAAiB,YAC3D2C,EAAYJ,EAAc1B,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQf,EAAiB,YAE7D4C,EAAY,KAAK,IAAIH,EAAeC,EAASC,CAAS,EACxDE,EAA4C,OAE5CD,IAAcH,EACdI,EAAgB,WACTD,IAAcF,IACrBG,EAAgB,MAGpBN,EAAc1B,CAAC,EAAEE,CAAC,EAAI,CAAE,UAAW8B,EAAe,MAAOD,CAAU,CACvE,CAIJ,OAAOX,EAAmBM,EAAeJ,EAASC,CAAO,CAC7D,ECvPA,IAAMU,EAAmB,KAcZC,EAAuBC,GACzBC,EAAS,sBAAsB,KAAKD,CAAI,EAI7CE,EAAkB,IAAI,KAAK,aAAa,OAAO,EAY/CC,EAAkBC,GACbF,EAAgB,OAAOE,CAAG,EAc/BC,EAAeC,IACiC,CAC9C,EAAK,SACL,EAAK,SACL,IAAK,SACL,EAAG,SACH,EAAG,SACH,EAAG,SACH,EAAG,QACP,GACsBA,CAAI,GAAKA,EAc7BC,EAAkBC,GAA8B,CAClD,IAAMC,EAAoC,CACtC,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,GACT,EACMC,EAASF,EAAU,QAAQ,QAAS,EAAE,EACxCG,EAAS,GACb,QAAWL,KAAQI,EACfC,GAAUF,EAAOH,CAAI,EAEzB,IAAMM,EAAS,SAASD,EAAQ,EAAE,EAClC,OAAO,MAAMC,CAAM,EAAI,EAAIA,CAC/B,EAyBMC,EAAqBC,GAAsB,CAC7C,IAAMC,EAAyBD,EAC1B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,oBAAoB,GAAK,CAAC,CAAC,EAE/DgB,EAA8BH,EAC/B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,yBAAyB,GAAK,CAAC,CAAC,EAEpEiB,EAA8BJ,EAC/B,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,4BAA4B,GAAK,CAAC,CAAC,EAEvEkB,EAAmCL,EACpC,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,i
CAAiC,GAAK,CAAC,CAAC,EAE5EmB,EAAuBH,EAA4B,IAAKI,GAC1DA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEMgB,EAA2BH,EAAiC,IAAKE,GACnEA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEA,MAAO,CACH,eAAgB,CAAC,GAAGS,EAAwB,GAAGK,CAAoB,EACnE,mBAAoB,CAAC,GAAGF,EAA6B,GAAGI,CAAwB,EAChF,kBAAmBL,EACnB,uBAAwBE,CAC5B,CACJ,EAgBMI,EAAkB,CAACT,EAAmBU,IAAqD,CAE7F,GAD2BV,EAAM,KAAMW,GAAS1B,EAAoB0B,EAAK,IAAI,CAAC,EACtD,MAAO,GAE/B,IAAMC,EAAU,IAAI,IAAIF,EAAW,cAAc,EAC3CG,EAAc,IAAI,IAAIH,EAAW,kBAAkB,EACzD,GAAIE,EAAQ,OAASC,EAAY,KAAM,MAAO,GAG9C,QAAWN,KAAOK,EACd,GAAI,CAACC,EAAY,IAAIN,CAAG,EACpB,MAAO,GAIf,MAAO,EACX,EAmBaO,EAAqBd,GAAkC,CAChE,IAAMe,EAAoBhB,EAAkBC,CAAK,EAEjD,GAAI,CAACS,EAAgBT,EAAOe,CAAiB,EACzC,OAAOf,EAIX,IAAMgB,EAAiBhB,EAAM,IAAKW,GAAS,CACvC,IAAIM,EAAcN,EAAK,KAEjBO,EAAW,gBACjB,OAAAD,EAAcA,EAAY,QAAQC,EAAWC,GAElCA,EAAM,QAAQ,WAAa3B,GAASD,EAAYC,CAAI,CAAC,CAC/D,EACM,CAAE,GAAGmB,EAAM,KAAMM,CAAY,CACxC,CAAC,EAGKG,EAAkBrB,EAAkBiB,CAAc,EAGlDK,EAAa,IAAI,IAAID,EAAgB,cAAc,EACnDE,EAAiB,IAAI,IAAIF,EAAgB,kBAAkB,EAE3DG,EAAiB,CAAC,GAAG,IAAI,IAAIH,EAAgB,cAAc,CAAC,EAC5DI,EAAqB,CAAC,GAAG,IAAI,IAAIJ,EAAgB,kBAAkB,CAAC,EAGpEK,EAAuBF,EAAe,OAAQhB,GAAQ,CAACe,EAAe,IAAIf,CAAG,CAAC,EAE9EmB,EAAsBF,EAAmB,OAAQjB,GAAQ,CAACc,EAAW,IAAId,CAAG,CAAC,EAG7EoB,EAAU,CAAC,GAAGN,EAAY,GAAGC,CAAc,EAE3CM,EAAmB,CAAE,OADTD,EAAQ,OAAS,EAAI,KAAK,IAAI,EAAG,GAAGA,EAAQ,IAAKpB,GAAQd,EAAec,CAAG,CAAC,CAAC,EAAI,GACrD,CAAE,EAGhD,OAAOS,EAAe,IAAKL,GAAS,CAChC,GAAI,CAACA,EAAK,KAAK,SAAS3B,CAAgB,EACpC,OAAO2B,EAEX,IAAIM,EAAcN,EAAK,KAEvB,OAAAM,EAAcA,EAAY,QAAQ,QAAS,IAAM,CAC7C,GAAIN,EAAK,WAAY,CACjB,IAAMkB,EAAeJ,EAAqB,MAAM,EAChD,GAAII,EAAc,OAAOA,CAC7B,KAAO,CAEH,IAAMA,EAAeH,EAAoB,MAAM,EAC/C,GAAIG,EAAc,OAAOA,CAC7B,CAGA,IAAMC,EAAS,IAAIzC,EAAeuC,EAAiB,KAAK,CAAC,IACzD,OAAAA,EAAiB,QACVE,CACX,CAAC,EAEM,CAAE,GAAGnB,EAAM,KAAMM,CAAY,CACxC,CAAC,CACL,EClPA,IAAMc,EAAmB,CACrBC,EACAC,EACA,CAAE,oBAAAC,EAAqB,YAAAC,CAAY,IACxB,CAEX,GAAIH,IAAkB,KAClB,MAAO,CAACC,CAAS,EAErB,GAAIA,IAAa,KACb,MAAO,CAACD,CAAa,EAIzB,GAAII,EAAoBJ,CAAa,IAAMI,EAAoBH,CAAQ,EACnE,MAAO,CAACD,CAAa,EAIzB,IA
AMK,EAASC,EAAwBN,EAAeC,CAAQ,EAC9D,GAAII,EAAQ,OAAOA,EAGnB,IAAME,EAAiBC,EAA0BR,EAAeC,CAAQ,EACxE,GAAIM,EAAgB,OAAOA,EAG3B,GAAIJ,EAAY,SAASH,CAAa,GAAKG,EAAY,SAASF,CAAQ,EAAG,CACvE,IAAMQ,EAAaN,EAAY,KAAMO,GAAWA,IAAWV,GAAiBU,IAAWT,CAAQ,EAC/F,OAAOQ,EAAa,CAACA,CAAU,EAAI,CAACT,CAAa,CACrD,CAGA,IAAMW,EAAqBP,EAAoBJ,CAAa,EACtDY,EAAgBR,EAAoBH,CAAQ,EAGlD,MAAO,CAFYY,EAAoBF,EAAoBC,CAAa,EAEnDV,EAAsBF,EAAgBC,CAAQ,CACvE,EAWMa,EAAwB,CAACC,EAAkBC,IAA8C,CAC3F,GAAID,EAAO,SAAW,EAClB,OAAOA,EAGX,IAAMV,EAAmB,CAAC,EAE1B,QAAWY,KAAgBF,EAAQ,CAC/B,GAAIV,EAAO,SAAW,EAAG,CACrBA,EAAO,KAAKY,CAAY,EACxB,QACJ,CAEA,IAAMC,EAAgBb,EAAO,GAAG,EAAE,EAGlC,GAAIc,EAA6BD,EAAeD,EAAcD,CAAuB,EAAG,CAEhFC,EAAa,OAASC,EAAc,SACpCb,EAAOA,EAAO,OAAS,CAAC,EAAIY,GAEhC,QACJ,CAGIG,EAAqBf,EAAQa,EAAeD,CAAY,GAI5DZ,EAAO,KAAKY,CAAY,CAC5B,CAEA,OAAOZ,CACX,EAYagB,EAAuB,CAACC,EAAsBC,EAAiBC,IAAoC,CAC5G,IAAMC,EAAiBC,EAAaJ,EAAcE,EAAQ,WAAW,EAC/DG,EAAYD,EAAaH,EAASC,EAAQ,WAAW,EAWrDI,EAReC,EACjBJ,EACAE,EACAH,EAAQ,YACRA,EAAQ,mBACZ,EAGkC,QAAQ,CAAC,CAACM,EAAUC,CAAG,IAAMhC,EAAiB+B,EAAUC,EAAKP,CAAO,CAAC,EAKvG,OAFoBV,EAAsBc,EAAcJ,EAAQ,uBAAuB,EAEpE,KAAK,GAAG,CAC/B,EAEaQ,EAAU,CACnBF,EACAG,EACA,CACI,wBAAAjB,EAA0B,GAC1B,oBAAAd,EAAsB,GACtB,YAAAC,CACJ,IAEOkB,EAAqBS,EAAUG,EAAY,CAAE,wBAAAjB,EAAyB,oBAAAd,EAAqB,YAAAC,CAAY,CAAC","names":["PATTERNS","normalizeArabicText","text","extractDigits","match","tokenizeText","preserveSymbols","processedText","symbol","symbolRegex","handleFootnoteFusion","result","previousToken","currentToken","prevIsStandalone","currHasEmbedded","currIsStandalone","prevHasEmbedded","prevDigits","currDigits","handleFootnoteSelection","tokenA","tokenB","aHasEmbedded","bHasEmbedded","handleStandaloneFootnotes","aIsFootnote","bIsFootnote","ALIGNMENT_SCORES","calculateLevenshteinDistance","textA","textB","lengthA","lengthB","shorter","longer","shortLen","longLen","previousRow","_","index","i","currentRow","j","substitutionCost","minCost","calculateSimilarity","maxLength","distance","areSimilarAfterNormalization","threshold","normalizedA","normalizeA
rabicText","normalizedB","calculateAlignmentScore","tokenA","tokenB","typoSymbols","similarityThreshold","isTypoSymbol","isHighlySimilar","backtrackAlignment","matrix","tokensA","tokensB","alignment","alignTokenSequences","scoringMatrix","alignmentScore","diagonalScore","upScore","leftScore","bestScore","bestDirection","INVALID_FOOTNOTE","hasInvalidFootnotes","text","PATTERNS","arabicFormatter","numberToArabic","num","ocrToArabic","char","arabicToNumber","arabicStr","lookup","digits","numStr","parsed","extractReferences","lines","arabicReferencesInBody","b","ocrConfusedReferencesInBody","arabicReferencesInFootnotes","ocrConfusedReferencesInFootnotes","convertedOcrBodyRefs","ref","convertedOcrFootnoteRefs","needsCorrection","references","line","bodySet","footnoteSet","correctReferences","initialReferences","sanitizedLines","updatedText","ocrRegex","match","cleanReferences","bodyRefSet","footnoteRefSet","uniqueBodyRefs","uniqueFootnoteRefs","bodyRefsForFootnotes","footnoteRefsForBody","allRefs","referenceCounter","availableRef","newRef","selectBestTokens","originalToken","altToken","similarityThreshold","typoSymbols","normalizeArabicText","result","handleFootnoteSelection","footnoteResult","handleStandaloneFootnotes","typoSymbol","symbol","normalizedOriginal","normalizedAlt","calculateSimilarity","removeDuplicateTokens","tokens","highSimilarityThreshold","currentToken","previousToken","areSimilarAfterNormalization","handleFootnoteFusion","processTextAlignment","originalText","altText","options","originalTokens","tokenizeText","altTokens","mergedTokens","alignTokenSequences","original","alt","fixTypo","correction"]}
|
|
1
|
+
{"version":3,"sources":["../src/textUtils.ts","../src/similarity.ts","../src/footnotes.ts","../src/index.ts"],"sourcesContent":["/**\n * Collection of regex patterns used throughout the library for text processing\n */\nexport const PATTERNS = {\n /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */\n arabicDigits: /[0-9\\u0660-\\u0669]+/,\n\n /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\\([\\u0660-\\u0669]+\\) */\n arabicFootnoteReferenceRegex: /^\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */\n arabicLettersAndDigits: /[0-9\\u0621-\\u063A\\u0641-\\u064A\\u0660-\\u0669]+/g,\n\n /** Matches Arabic punctuation marks and whitespace characters */\n arabicPunctuationAndWhitespace: /[\\s\\u060C\\u061B\\u061F\\u06D4]+/,\n\n /** Matches footnote references with Arabic-Indic digits in parentheses: \\([\\u0660-\\u0669]+\\) */\n arabicReferenceRegex: /\\([\\u0660-\\u0669]+\\)/g,\n\n /** Matches Arabic diacritical marks (harakat, tanween, etc.) 
*/\n diacritics: /[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]/g,\n\n /** Matches embedded footnotes within text: \\([0-9\\u0660-\\u0669]+\\) */\n footnoteEmbedded: /\\([0-9\\u0660-\\u0669]+\\)/,\n\n /** Matches standalone footnote markers at line start/end: ^\\(?[0-9\\u0660-\\u0669]+\\)?[،.]?$ */\n footnoteStandalone: /^\\(?[0-9\\u0660-\\u0669]+\\)?[،.]?$/,\n\n /** Matches invalid/problematic footnote references: empty \"()\" or OCR-confused endings */\n invalidReferenceRegex: /\\(\\)|\\([.1OV9]+\\)/g, // Combined pattern for detecting any invalid/problematic references\n\n /** Matches OCR-confused footnote references at line start with characters like .1OV9 */\n ocrConfusedFootnoteReferenceRegex: /^\\([.1OV9]+\\)/g,\n\n /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */\n ocrConfusedReferenceRegex: /\\([.1OV9]+\\)/g,\n\n /** Matches Arabic tatweel (kashida) character used for text stretching */\n tatweel: /\\u0640/g,\n\n /** Matches one or more whitespace characters */\n whitespace: /\\s+/,\n};\n\n/**\n * Normalizes Arabic text by removing diacritics, and tatweel marks.\n * This normalization enables better text comparison by focusing on core characters\n * while ignoring decorative elements that don't affect meaning.\n *\n * @param text - Arabic text to normalize\n * @returns Normalized text with diacritics, tatweel, and basic tags removed\n * @example\n * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'\n */\nexport const normalizeArabicText = (text: string): string => {\n return text.replace(PATTERNS.tatweel, '').replace(PATTERNS.diacritics, '').trim();\n};\n\n/**\n * Extracts the first sequence of Arabic or Western digits from text.\n * Used primarily for footnote number comparison to match related footnote elements.\n *\n * @param text - Text containing digits to extract\n * @returns First digit sequence found, or empty string if none found\n * @example\n * 
extractDigits('(٥)أخرجه البخاري') // Returns '٥'\n * extractDigits('See note (123)') // Returns '123'\n */\nexport const extractDigits = (text: string): string => {\n const match = text.match(PATTERNS.arabicDigits);\n return match ? match[0] : '';\n};\n\n/**\n * Tokenizes text into individual words while preserving special symbols.\n * Removes HTML tags, adds spacing around preserved symbols to ensure they\n * are tokenized separately, then splits on whitespace.\n *\n * @param text - Text to tokenize\n * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens\n * @returns Array of tokens, or empty array if input is empty/whitespace\n * @example\n * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']\n */\nexport const tokenizeText = (text: string, preserveSymbols: string[] = []): string[] => {\n let processedText = text;\n\n // Add spaces around each preserve symbol to ensure they're tokenized separately\n for (const symbol of preserveSymbols) {\n const symbolRegex = new RegExp(symbol, 'g');\n processedText = processedText.replace(symbolRegex, ` ${symbol} `);\n }\n\n return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);\n};\n\n/**\n * Handles fusion of standalone and embedded footnotes during token processing.\n * Detects patterns where standalone footnotes should be merged with embedded ones\n * or where trailing standalone footnotes should be skipped.\n *\n * @param result - Current result array being built\n * @param previousToken - The previous token in the sequence\n * @param currentToken - The current token being processed\n * @returns True if the current token was handled (fused or skipped), false otherwise\n * @example\n * // (٥) + (٥)أخرجه → result gets (٥)أخرجه\n * // (٥)أخرجه + (٥) → (٥) is skipped\n */\nexport const handleFootnoteFusion = (result: string[], previousToken: string, currentToken: string): boolean => {\n const prevIsStandalone = 
PATTERNS.footnoteStandalone.test(previousToken);\n const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);\n const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);\n const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);\n\n const prevDigits = extractDigits(previousToken);\n const currDigits = extractDigits(currentToken);\n\n // Replace standalone with fused version: (٥) + (٥)أخرجه → (٥)أخرجه\n if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {\n result[result.length - 1] = currentToken;\n return true;\n }\n\n // Skip trailing standalone: (٥)أخرجه + (٥) → (٥)أخرجه\n if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) {\n return true;\n }\n\n return false;\n};\n\n/**\n * Handles selection logic for tokens with embedded footnotes during alignment.\n * Prefers tokens that contain embedded footnotes over plain text, and among\n * tokens with embedded footnotes, prefers the shorter one.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']\n * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']\n */\nexport const handleFootnoteSelection = (tokenA: string, tokenB: string): null | string[] => {\n const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);\n const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);\n\n if (aHasEmbedded && !bHasEmbedded) return [tokenA];\n if (bHasEmbedded && !aHasEmbedded) return [tokenB];\n if (aHasEmbedded && bHasEmbedded) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n\n/**\n * Handles selection logic for standalone footnote tokens during alignment.\n * Manages cases where one or both tokens are standalone footnotes, preserving\n * both tokens when one is a footnote and the other is regular text.\n *\n * @param tokenA - First token to compare\n * @param tokenB - Second token to compare\n * @returns Array containing selected token(s), or null if no special handling needed\n * @example\n * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']\n * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)\n */\nexport const handleStandaloneFootnotes = (tokenA: string, tokenB: string): null | string[] => {\n const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);\n const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);\n\n if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];\n if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];\n if (aIsFootnote && bIsFootnote) {\n return [tokenA.length <= tokenB.length ? 
tokenA : tokenB];\n }\n\n return null;\n};\n","import { normalizeArabicText } from './textUtils';\n\n// Alignment scoring constants\nconst ALIGNMENT_SCORES = {\n GAP_PENALTY: -1,\n MISMATCH_PENALTY: -2,\n PERFECT_MATCH: 2,\n SOFT_MATCH: 1,\n};\n\n/**\n * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.\n * The Levenshtein distance is the minimum number of single-character edits (insertions,\n * deletions, or substitutions) required to change one string into another.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Minimum edit distance between the two strings\n * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths\n * @example\n * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3\n * calculateLevenshteinDistance('', 'hello') // Returns 5\n */\nexport const calculateLevenshteinDistance = (textA: string, textB: string): number => {\n const lengthA = textA.length;\n const lengthB = textB.length;\n\n if (lengthA === 0) {\n return lengthB;\n }\n\n if (lengthB === 0) {\n return lengthA;\n }\n\n // Use shorter string for the array to optimize space\n const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];\n const shortLen = shorter.length;\n const longLen = longer.length;\n\n let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);\n\n for (let i = 1; i <= longLen; i++) {\n const currentRow = [i];\n\n for (let j = 1; j <= shortLen; j++) {\n const substitutionCost = longer[i - 1] === shorter[j - 1] ? 
0 : 1;\n const minCost = Math.min(\n previousRow[j] + 1, // deletion\n currentRow[j - 1] + 1, // insertion\n previousRow[j - 1] + substitutionCost, // substitution\n );\n currentRow.push(minCost);\n }\n\n previousRow = currentRow;\n }\n\n return previousRow[shortLen];\n};\n\n/**\n * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.\n * Uses Levenshtein distance normalized by the length of the longer string.\n * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.\n *\n * @param textA - First string to compare\n * @param textB - Second string to compare\n * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)\n * @example\n * calculateSimilarity('hello', 'hello') // Returns 1.0\n * calculateSimilarity('hello', 'help') // Returns 0.6\n */\nexport const calculateSimilarity = (textA: string, textB: string): number => {\n const maxLength = Math.max(textA.length, textB.length) || 1;\n const distance = calculateLevenshteinDistance(textA, textB);\n return (maxLength - distance) / maxLength;\n};\n\n/**\n * Checks if two texts are similar after Arabic normalization.\n * Normalizes both texts by removing diacritics and decorative elements,\n * then compares their similarity against the provided threshold.\n *\n * @param textA - First text to compare\n * @param textB - Second text to compare\n * @param threshold - Similarity threshold (0.0 to 1.0)\n * @returns True if normalized texts meet the similarity threshold\n * @example\n * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true\n */\nexport const areSimilarAfterNormalization = (textA: string, textB: string, threshold: number = 0.6): boolean => {\n const normalizedA = normalizeArabicText(textA);\n const normalizedB = normalizeArabicText(textB);\n return calculateSimilarity(normalizedA, normalizedB) >= threshold;\n};\n\n/**\n * Calculates alignment score for two tokens in sequence alignment.\n * Uses different scoring 
criteria: perfect match after normalization gets highest score,\n * typo symbols or highly similar tokens get soft match score, mismatches get penalty.\n *\n * @param tokenA - First token to score\n * @param tokenB - Second token to score\n * @param typoSymbols - Array of special symbols that get preferential treatment\n * @param similarityThreshold - Threshold for considering tokens highly similar\n * @returns Alignment score (higher is better match)\n * @example\n * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)\n * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity\n */\nexport const calculateAlignmentScore = (\n tokenA: string,\n tokenB: string,\n typoSymbols: string[],\n similarityThreshold: number,\n): number => {\n const normalizedA = normalizeArabicText(tokenA);\n const normalizedB = normalizeArabicText(tokenB);\n\n // Perfect match after normalization\n if (normalizedA === normalizedB) {\n return ALIGNMENT_SCORES.PERFECT_MATCH;\n }\n\n // Check if either token is a typo symbol or high similarity\n const isTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);\n const isHighlySimilar = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;\n\n if (isTypoSymbol || isHighlySimilar) {\n return ALIGNMENT_SCORES.SOFT_MATCH;\n }\n\n return ALIGNMENT_SCORES.MISMATCH_PENALTY;\n};\n\ntype AlignedTokenPair = [null | string, null | string];\n\ntype AlignmentCell = {\n direction: 'diagonal' | 'left' | 'up' | null;\n score: number;\n};\n\n/**\n * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.\n * Follows the directional indicators in the matrix to build the sequence of aligned\n * token pairs from the Needleman-Wunsch algorithm.\n *\n * @param matrix - Scoring matrix with directional information from alignment\n * @param tokensA - First sequence of tokens\n * @param tokensB - Second sequence of tokens\n * @returns Array of aligned 
token pairs, where null indicates a gap\n * @throws Error if invalid alignment direction is encountered\n */\nexport const backtrackAlignment = (\n matrix: AlignmentCell[][],\n tokensA: string[],\n tokensB: string[],\n): AlignedTokenPair[] => {\n const alignment: AlignedTokenPair[] = [];\n let i = tokensA.length;\n let j = tokensB.length;\n\n while (i > 0 || j > 0) {\n const currentCell = matrix[i][j];\n\n switch (currentCell.direction) {\n case 'diagonal':\n alignment.push([tokensA[--i], tokensB[--j]]);\n break;\n case 'left':\n alignment.push([null, tokensB[--j]]);\n break;\n case 'up':\n alignment.push([tokensA[--i], null]);\n break;\n default:\n throw new Error('Invalid alignment direction');\n }\n }\n\n return alignment.reverse();\n};\n\n/**\n * Performs global sequence alignment using the Needleman-Wunsch algorithm.\n * Aligns two token sequences to find the optimal pairing that maximizes\n * the total alignment score, handling insertions, deletions, and substitutions.\n *\n * @param tokensA - First sequence of tokens to align\n * @param tokensB - Second sequence of tokens to align\n * @param typoSymbols - Special symbols that affect scoring\n * @param similarityThreshold - Threshold for high similarity scoring\n * @returns Array of aligned token pairs, with null indicating gaps\n * @example\n * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)\n * // Returns [['a', 'a'], ['b', 'c']]\n */\nexport const alignTokenSequences = (\n tokensA: string[],\n tokensB: string[],\n typoSymbols: string[],\n similarityThreshold: number,\n): AlignedTokenPair[] => {\n const lengthA = tokensA.length;\n const lengthB = tokensB.length;\n\n // Initialize scoring matrix\n const scoringMatrix: AlignmentCell[][] = Array.from({ length: lengthA + 1 }, () =>\n Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 })),\n );\n\n // Initialize first row and column\n for (let i = 1; i <= lengthA; i++) {\n scoringMatrix[i][0] = { direction: 'up', score: i * 
ALIGNMENT_SCORES.GAP_PENALTY };\n }\n for (let j = 1; j <= lengthB; j++) {\n scoringMatrix[0][j] = { direction: 'left', score: j * ALIGNMENT_SCORES.GAP_PENALTY };\n }\n\n // Fill scoring matrix\n for (let i = 1; i <= lengthA; i++) {\n for (let j = 1; j <= lengthB; j++) {\n const alignmentScore = calculateAlignmentScore(\n tokensA[i - 1],\n tokensB[j - 1],\n typoSymbols,\n similarityThreshold,\n );\n\n const diagonalScore = scoringMatrix[i - 1][j - 1].score + alignmentScore;\n const upScore = scoringMatrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY;\n const leftScore = scoringMatrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;\n\n const bestScore = Math.max(diagonalScore, upScore, leftScore);\n let bestDirection: 'diagonal' | 'left' | 'up' = 'left';\n\n if (bestScore === diagonalScore) {\n bestDirection = 'diagonal';\n } else if (bestScore === upScore) {\n bestDirection = 'up';\n }\n\n scoringMatrix[i][j] = { direction: bestDirection, score: bestScore };\n }\n }\n\n // Backtrack to build alignment\n return backtrackAlignment(scoringMatrix, tokensA, tokensB);\n};\n","import { PATTERNS } from './textUtils';\n\nconst INVALID_FOOTNOTE = '()';\n\n/**\n * Checks if the given text contains invalid footnote references.\n * Invalid footnotes include empty parentheses \"()\" or OCR-confused characters\n * like \".1OV9\" that were misrecognized instead of Arabic numerals.\n *\n * @param text - Text to check for invalid footnote patterns\n * @returns True if text contains invalid footnote references, false otherwise\n * @example\n * hasInvalidFootnotes('This text has ()') // Returns true\n * hasInvalidFootnotes('This text has (١)') // Returns false\n * hasInvalidFootnotes('OCR mistake (O)') // Returns true\n */\nexport const hasInvalidFootnotes = (text: string): boolean => {\n return PATTERNS.invalidReferenceRegex.test(text);\n};\n\n// Arabic number formatter instance\nconst arabicFormatter = new Intl.NumberFormat('ar-SA');\n\n/**\n * Converts a number to Arabic-Indic 
numerals using the Intl.NumberFormat API.\n * Uses the 'ar-SA' locale to ensure proper Arabic numeral formatting.\n *\n * @param num - The number to convert to Arabic numerals\n * @returns String representation using Arabic-Indic digits (٠-٩)\n * @example\n * numberToArabic(123) // Returns '١٢٣'\n * numberToArabic(5) // Returns '٥'\n */\nconst numberToArabic = (num: number): string => {\n return arabicFormatter.format(num);\n};\n\n/**\n * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.\n * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits.\n *\n * @param char - Single character that may be an OCR mistake\n * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists\n * @example\n * ocrToArabic('O') // Returns '٥' (O often confused with ٥)\n * ocrToArabic('1') // Returns '١' (1 often confused with ١)\n * ocrToArabic('.') // Returns '٠' (dot often confused with ٠)\n */\nconst ocrToArabic = (char: string): string => {\n const ocrToArabicMap: { [key: string]: string } = {\n '1': '١',\n '9': '٩',\n '.': '٠',\n O: '٥',\n o: '٥',\n V: '٧',\n v: '٧',\n };\n return ocrToArabicMap[char] || char;\n};\n\n/**\n * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number.\n * Removes parentheses and converts each Arabic-Indic digit to its Western equivalent.\n *\n * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(١٢٣)'\n * @returns Parsed number, or 0 if parsing fails\n * @example\n * arabicToNumber('(١٢٣)') // Returns 123\n * arabicToNumber('(٥)') // Returns 5\n * arabicToNumber('invalid') // Returns 0\n */\nconst arabicToNumber = (arabicStr: string): number => {\n const lookup: { [key: string]: string } = {\n '٠': '0',\n '١': '1',\n '٢': '2',\n '٣': '3',\n '٤': '4',\n '٥': '5',\n '٦': '6',\n '٧': '7',\n '٨': '8',\n '٩': '9',\n };\n const digits = arabicStr.replace(/[()]/g, '');\n let numStr = '';\n for (const char 
of digits) {\n numStr += lookup[char];\n }\n const parsed = parseInt(numStr, 10);\n return isNaN(parsed) ? 0 : parsed;\n};\n\ntype TextLine = {\n isFootnote?: boolean;\n text: string;\n};\n\n/**\n * Extracts all footnote references from text lines, categorizing them by type and location.\n * Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes.\n *\n * @param lines - Array of text line objects with optional isFootnote flag\n * @returns Object containing categorized reference arrays:\n * - bodyReferences: All valid references found in body text\n * - footnoteReferences: All valid references found in footnotes\n * - ocrConfusedInBody: OCR-confused references in body text (for tracking)\n * - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)\n * @example\n * const lines = [\n * { text: 'Body with (١) and (O)', isFootnote: false },\n * { text: '(١) Footnote text', isFootnote: true }\n * ];\n * const refs = extractReferences(lines);\n * // refs.bodyReferences contains ['(١)', '(٥)'] - OCR 'O' converted to '٥'\n */\nconst extractReferences = (lines: TextLine[]) => {\n const arabicReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []);\n\n const ocrConfusedReferencesInBody = lines\n .filter((b) => !b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []);\n\n const arabicReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []);\n\n const ocrConfusedReferencesInFootnotes = lines\n .filter((b) => b.isFootnote)\n .flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []);\n\n const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) =>\n ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)),\n );\n\n const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) =>\n ref.replace(/[.1OV9]/g, 
(char) => ocrToArabic(char)),\n );\n\n return {\n bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs],\n footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs],\n ocrConfusedInBody: ocrConfusedReferencesInBody,\n ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes,\n };\n};\n\n/**\n * Determines if footnote reference correction is needed by checking for:\n * 1. Invalid footnote patterns (empty parentheses, OCR mistakes)\n * 2. Mismatched sets of references between body text and footnotes\n * 3. Different counts of references in body vs footnotes\n *\n * @param lines - Array of text line objects to analyze\n * @param references - Extracted reference data from extractReferences()\n * @returns True if correction is needed, false if references are already correct\n * @example\n * const lines = [{ text: 'Text with ()', isFootnote: false }];\n * const refs = extractReferences(lines);\n * needsCorrection(lines, refs) // Returns true due to invalid \"()\" reference\n */\nconst needsCorrection = (lines: TextLine[], references: ReturnType<typeof extractReferences>) => {\n const mistakenReferences = lines.some((line) => hasInvalidFootnotes(line.text));\n if (mistakenReferences) return true;\n\n const bodySet = new Set(references.bodyReferences);\n const footnoteSet = new Set(references.footnoteReferences);\n if (bodySet.size !== footnoteSet.size) return true;\n\n // Check if the sets contain the same elements\n for (const ref of bodySet) {\n if (!footnoteSet.has(ref)) {\n return true;\n }\n }\n\n return false;\n};\n\n/**\n * Corrects footnote references in an array of text lines by:\n * 1. Converting OCR-confused characters to proper Arabic numerals\n * 2. Filling in empty \"()\" references with appropriate numbers\n * 3. Ensuring footnote references in body text match those in footnotes\n * 4. 
Generating new reference numbers when needed\n *\n * @param lines - Array of text line objects, each with optional isFootnote flag\n * @returns Array of corrected text lines with proper footnote references\n * @example\n * const lines = [\n * { text: 'Main text with ()', isFootnote: false },\n * { text: '() This is a footnote', isFootnote: true }\n * ];\n * const corrected = correctReferences(lines);\n * // Returns lines with \"()\" replaced by proper Arabic numerals like \"(١)\"\n */\nexport const correctReferences = <T extends TextLine>(lines: T[]): T[] => {\n const initialReferences = extractReferences(lines);\n\n if (!needsCorrection(lines, initialReferences)) {\n return lines;\n }\n\n // Pass 1: Sanitize lines by correcting only OCR characters inside reference markers.\n const sanitizedLines = lines.map((line) => {\n let updatedText = line.text;\n // This regex finds the full reference, e.g., \"(O)\" or \"(1)\"\n const ocrRegex = /\\([.1OV9]+\\)/g;\n updatedText = updatedText.replace(ocrRegex, (match) => {\n // This replace acts *inside* the found match, e.g., on \"O\" or \"1\"\n return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));\n });\n return { ...line, text: updatedText };\n });\n\n // Pass 2: Analyze the sanitized lines to get a clear and accurate picture of references.\n const cleanReferences = extractReferences(sanitizedLines);\n\n // Step 3: Create queues of \"unmatched\" references for two-way pairing.\n const bodyRefSet = new Set(cleanReferences.bodyReferences);\n const footnoteRefSet = new Set(cleanReferences.footnoteReferences);\n\n const uniqueBodyRefs = [...new Set(cleanReferences.bodyReferences)];\n const uniqueFootnoteRefs = [...new Set(cleanReferences.footnoteReferences)];\n\n // Queue 1: Body references available for footnotes.\n const bodyRefsForFootnotes = uniqueBodyRefs.filter((ref) => !footnoteRefSet.has(ref));\n // Queue 2: Footnote references available for the body.\n const footnoteRefsForBody = 
uniqueFootnoteRefs.filter((ref) => !bodyRefSet.has(ref));\n\n // Step 4: Determine the starting point for any completely new reference numbers.\n const allRefs = [...bodyRefSet, ...footnoteRefSet];\n const maxRefNum = allRefs.length > 0 ? Math.max(0, ...allRefs.map((ref) => arabicToNumber(ref))) : 0;\n const referenceCounter = { count: maxRefNum + 1 };\n\n // Step 5: Map over the sanitized lines, filling in '()' using the queues.\n return sanitizedLines.map((line) => {\n if (!line.text.includes(INVALID_FOOTNOTE)) {\n return line;\n }\n let updatedText = line.text;\n\n updatedText = updatedText.replace(/\\(\\)/g, () => {\n if (line.isFootnote) {\n const availableRef = bodyRefsForFootnotes.shift();\n if (availableRef) return availableRef;\n } else {\n // It's body text\n const availableRef = footnoteRefsForBody.shift();\n if (availableRef) return availableRef;\n }\n\n // If no available partner reference exists, generate a new one.\n const newRef = `(${numberToArabic(referenceCounter.count)})`;\n referenceCounter.count++;\n return newRef;\n });\n\n return { ...line, text: updatedText };\n });\n};\n","import type { FixTypoOptions } from './types';\n\nimport { alignTokenSequences, areSimilarAfterNormalization, calculateSimilarity } from './similarity';\nimport {\n handleFootnoteFusion,\n handleFootnoteSelection,\n handleStandaloneFootnotes,\n normalizeArabicText,\n tokenizeText,\n} from './textUtils';\n\n/**\n * Selects the best token(s) from an aligned pair during typo correction.\n * Uses various heuristics including normalization, footnote handling, typo symbols,\n * and similarity scores to determine which token(s) to keep.\n *\n * @param originalToken - Token from the original OCR text (may be null)\n * @param altToken - Token from the alternative OCR text (may be null)\n * @param options - Configuration options including typo symbols and similarity threshold\n * @returns Array of selected tokens (usually contains one token, but may contain multiple)\n */\nconst 
selectBestTokens = (\n originalToken: null | string,\n altToken: null | string,\n { similarityThreshold, typoSymbols }: FixTypoOptions,\n): string[] => {\n // Handle missing tokens\n if (originalToken === null) {\n return [altToken!];\n }\n if (altToken === null) {\n return [originalToken];\n }\n\n // Preserve original if same after normalization (keeps diacritics)\n if (normalizeArabicText(originalToken) === normalizeArabicText(altToken)) {\n return [originalToken];\n }\n\n // Handle embedded footnotes\n const result = handleFootnoteSelection(originalToken, altToken);\n if (result) return result;\n\n // Handle standalone footnotes\n const footnoteResult = handleStandaloneFootnotes(originalToken, altToken);\n if (footnoteResult) return footnoteResult;\n\n // Handle typo symbols - prefer the symbol itself\n if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {\n const typoSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);\n return typoSymbol ? [typoSymbol] : [originalToken];\n }\n\n // Choose based on similarity\n const normalizedOriginal = normalizeArabicText(originalToken);\n const normalizedAlt = normalizeArabicText(altToken);\n const similarity = calculateSimilarity(normalizedOriginal, normalizedAlt);\n\n return [similarity > similarityThreshold ? originalToken : altToken];\n};\n\n/**\n * Removes duplicate tokens and handles footnote fusion in post-processing.\n * Identifies and removes tokens that are highly similar while preserving\n * important variations. 
Also handles special cases like footnote merging.\n *\n * @param tokens - Array of tokens to process\n * @param highSimilarityThreshold - Threshold for detecting duplicates (0.0 to 1.0)\n * @returns Array of tokens with duplicates removed and footnotes fused\n */\nconst removeDuplicateTokens = (tokens: string[], highSimilarityThreshold: number): string[] => {\n if (tokens.length === 0) {\n return tokens;\n }\n\n const result: string[] = [];\n\n for (const currentToken of tokens) {\n if (result.length === 0) {\n result.push(currentToken);\n continue;\n }\n\n const previousToken = result.at(-1)!;\n\n // Handle ordinary echoes (similar tokens)\n if (areSimilarAfterNormalization(previousToken, currentToken, highSimilarityThreshold)) {\n // Keep the shorter version\n if (currentToken.length < previousToken.length) {\n result[result.length - 1] = currentToken;\n }\n continue;\n }\n\n // Handle footnote fusion cases\n if (handleFootnoteFusion(result, previousToken, currentToken)) {\n continue;\n }\n\n result.push(currentToken);\n }\n\n return result;\n};\n\n/**\n * Processes text alignment between original and alternate OCR results to fix typos.\n * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,\n * then selects the best tokens and performs post-processing.\n *\n * @param originalText - Original OCR text that may contain typos\n * @param altText - Reference text from alternate OCR for comparison\n * @param options - Configuration options for alignment and selection\n * @returns Corrected text with typos fixed\n */\nexport const processTextAlignment = (originalText: string, altText: string, options: FixTypoOptions): string => {\n const originalTokens = tokenizeText(originalText, options.typoSymbols);\n const altTokens = tokenizeText(altText, options.typoSymbols);\n\n // Align token sequences\n const alignedPairs = alignTokenSequences(\n originalTokens,\n altTokens,\n options.typoSymbols,\n options.similarityThreshold,\n );\n\n // Select best tokens 
from each aligned pair\n const mergedTokens = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));\n\n // Remove duplicates and handle post-processing\n const finalTokens = removeDuplicateTokens(mergedTokens, options.highSimilarityThreshold);\n\n return finalTokens.join(' ');\n};\n\nexport const fixTypo = (\n original: string,\n correction: string,\n {\n highSimilarityThreshold = 0.8,\n similarityThreshold = 0.6,\n typoSymbols,\n }: Partial<FixTypoOptions> & Pick<FixTypoOptions, 'typoSymbols'>,\n) => {\n return processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });\n};\n\nexport * from './footnotes';\nexport * from './similarity';\nexport * from './textUtils';\n"],"mappings":"AAGO,IAAMA,EAAW,CAEpB,aAAc,sBAGd,6BAA8B,yBAG9B,uBAAwB,iDAGxB,+BAAgC,gCAGhC,qBAAsB,wBAGtB,WAAY,mDAGZ,iBAAkB,0BAGlB,mBAAoB,mCAGpB,sBAAuB,qBAGvB,kCAAmC,iBAGnC,0BAA2B,gBAG3B,QAAS,UAGT,WAAY,KAChB,EAYaC,EAAuBC,GACzBA,EAAK,QAAQF,EAAS,QAAS,EAAE,EAAE,QAAQA,EAAS,WAAY,EAAE,EAAE,KAAK,EAavEG,EAAiBD,GAAyB,CACnD,IAAME,EAAQF,EAAK,MAAMF,EAAS,YAAY,EAC9C,OAAOI,EAAQA,EAAM,CAAC,EAAI,EAC9B,EAaaC,EAAe,CAACH,EAAcI,EAA4B,CAAC,IAAgB,CACpF,IAAIC,EAAgBL,EAGpB,QAAWM,KAAUF,EAAiB,CAClC,IAAMG,EAAc,IAAI,OAAOD,EAAQ,GAAG,EAC1CD,EAAgBA,EAAc,QAAQE,EAAa,IAAID,CAAM,GAAG,CACpE,CAEA,OAAOD,EAAc,KAAK,EAAE,MAAMP,EAAS,UAAU,EAAE,OAAO,OAAO,CACzE,EAeaU,EAAuB,CAACC,EAAkBC,EAAuBC,IAAkC,CAC5G,IAAMC,EAAmBd,EAAS,mBAAmB,KAAKY,CAAa,EACjEG,EAAkBf,EAAS,iBAAiB,KAAKa,CAAY,EAC7DG,EAAmBhB,EAAS,mBAAmB,KAAKa,CAAY,EAChEI,EAAkBjB,EAAS,iBAAiB,KAAKY,CAAa,EAE9DM,EAAaf,EAAcS,CAAa,EACxCO,EAAahB,EAAcU,CAAY,EAG7C,OAAIC,GAAoBC,GAAmBG,IAAeC,GACtDR,EAAOA,EAAO,OAAS,CAAC,EAAIE,EACrB,IAIP,GAAAI,GAAmBD,GAAoBE,IAAeC,EAK9D,EAcaC,EAA0B,CAACC,EAAgBC,IAAoC,CACxF,IAAMC,EAAevB,EAAS,iBAAiB,KAAKqB,CAAM,EACpDG,EAAexB,EAAS,iBAAiB,KAAKsB,CAAM,EAE1D,OAAIC,GAAgB,CAACC,EAAqB,CAACH,CAAM,EAC7CG,GAAgB,CAACD,EAAqB,CAACD,CAAM,EAC7CC,GAAgBC,EACT,CAACH,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,EAcaG,EAA4B,CAA
CJ,EAAgBC,IAAoC,CAC1F,IAAMI,EAAc1B,EAAS,mBAAmB,KAAKqB,CAAM,EACrDM,EAAc3B,EAAS,mBAAmB,KAAKsB,CAAM,EAE3D,OAAII,GAAe,CAACC,EAAoB,CAACN,EAAQC,CAAM,EACnDK,GAAe,CAACD,EAAoB,CAACJ,EAAQD,CAAM,EACnDK,GAAeC,EACR,CAACN,EAAO,QAAUC,EAAO,OAASD,EAASC,CAAM,EAGrD,IACX,ECjLA,IAAMM,EAAmB,CACrB,YAAa,GACb,iBAAkB,GAClB,cAAe,EACf,WAAY,CAChB,EAeaC,EAA+B,CAACC,EAAeC,IAA0B,CAClF,IAAMC,EAAUF,EAAM,OAChBG,EAAUF,EAAM,OAEtB,GAAIC,IAAY,EACZ,OAAOC,EAGX,GAAIA,IAAY,EACZ,OAAOD,EAIX,GAAM,CAACE,EAASC,CAAM,EAAIH,GAAWC,EAAU,CAACH,EAAOC,CAAK,EAAI,CAACA,EAAOD,CAAK,EACvEM,EAAWF,EAAQ,OACnBG,EAAUF,EAAO,OAEnBG,EAAc,MAAM,KAAK,CAAE,OAAQF,EAAW,CAAE,EAAG,CAACG,EAAGC,IAAUA,CAAK,EAE1E,QAASC,EAAI,EAAGA,GAAKJ,EAASI,IAAK,CAC/B,IAAMC,EAAa,CAACD,CAAC,EAErB,QAASE,EAAI,EAAGA,GAAKP,EAAUO,IAAK,CAChC,IAAMC,EAAmBT,EAAOM,EAAI,CAAC,IAAMP,EAAQS,EAAI,CAAC,EAAI,EAAI,EAC1DE,EAAU,KAAK,IACjBP,EAAYK,CAAC,EAAI,EACjBD,EAAWC,EAAI,CAAC,EAAI,EACpBL,EAAYK,EAAI,CAAC,EAAIC,CACzB,EACAF,EAAW,KAAKG,CAAO,CAC3B,CAEAP,EAAcI,CAClB,CAEA,OAAOJ,EAAYF,CAAQ,CAC/B,EAcaU,EAAsB,CAAChB,EAAeC,IAA0B,CACzE,IAAMgB,EAAY,KAAK,IAAIjB,EAAM,OAAQC,EAAM,MAAM,GAAK,EACpDiB,EAAWnB,EAA6BC,EAAOC,CAAK,EAC1D,OAAQgB,EAAYC,GAAYD,CACpC,EAcaE,EAA+B,CAACnB,EAAeC,EAAemB,EAAoB,KAAiB,CAC5G,IAAMC,EAAcC,EAAoBtB,CAAK,EACvCuB,EAAcD,EAAoBrB,CAAK,EAC7C,OAAOe,EAAoBK,EAAaE,CAAW,GAAKH,CAC5D,EAgBaI,EAA0B,CACnCC,EACAC,EACAC,EACAC,IACS,CACT,IAAMP,EAAcC,EAAoBG,CAAM,EACxCF,EAAcD,EAAoBI,CAAM,EAG9C,GAAIL,IAAgBE,EAChB,OAAOzB,EAAiB,cAI5B,IAAM+B,EAAeF,EAAY,SAASF,CAAM,GAAKE,EAAY,SAASD,CAAM,EAC1EI,EAAkBd,EAAoBK,EAAaE,CAAW,GAAKK,EAEzE,OAAIC,GAAgBC,EACThC,EAAiB,WAGrBA,EAAiB,gBAC5B,EAoBaiC,EAAqB,CAC9BC,EACAC,EACAC,IACqB,CACrB,IAAMC,EAAgC,CAAC,EACnCxB,EAAIsB,EAAQ,OACZpB,EAAIqB,EAAQ,OAEhB,KAAOvB,EAAI,GAAKE,EAAI,GAGhB,OAFoBmB,EAAOrB,CAAC,EAAEE,CAAC,EAEX,UAAW,CAC3B,IAAK,WACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAGuB,EAAQ,EAAErB,CAAC,CAAC,CAAC,EAC3C,MACJ,IAAK,OACDsB,EAAU,KAAK,CAAC,KAAMD,EAAQ,EAAErB,CAAC,CAAC,CAAC,EACnC,MACJ,IAAK,KACDsB,EAAU,KAAK,CAACF,EAAQ,EAAEtB,CAAC,EAAG,IAAI,CAAC,EACnC,MACJ,QACI,MAAM,IAAI,MAAM,
6BAA6B,CACrD,CAGJ,OAAOwB,EAAU,QAAQ,CAC7B,EAgBaC,EAAsB,CAC/BH,EACAC,EACAP,EACAC,IACqB,CACrB,IAAM1B,EAAU+B,EAAQ,OAClB9B,EAAU+B,EAAQ,OAGlBG,EAAmC,MAAM,KAAK,CAAE,OAAQnC,EAAU,CAAE,EAAG,IACzE,MAAM,KAAK,CAAE,OAAQC,EAAU,CAAE,EAAG,KAAO,CAAE,UAAW,KAAM,MAAO,CAAE,EAAE,CAC7E,EAGA,QAASQ,EAAI,EAAGA,GAAKT,EAASS,IAC1B0B,EAAc1B,CAAC,EAAE,CAAC,EAAI,CAAE,UAAW,KAAM,MAAOA,EAAIb,EAAiB,WAAY,EAErF,QAASe,EAAI,EAAGA,GAAKV,EAASU,IAC1BwB,EAAc,CAAC,EAAExB,CAAC,EAAI,CAAE,UAAW,OAAQ,MAAOA,EAAIf,EAAiB,WAAY,EAIvF,QAASa,EAAI,EAAGA,GAAKT,EAASS,IAC1B,QAASE,EAAI,EAAGA,GAAKV,EAASU,IAAK,CAC/B,IAAMyB,EAAiBd,EACnBS,EAAQtB,EAAI,CAAC,EACbuB,EAAQrB,EAAI,CAAC,EACbc,EACAC,CACJ,EAEMW,EAAgBF,EAAc1B,EAAI,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQyB,EACpDE,EAAUH,EAAc1B,EAAI,CAAC,EAAEE,CAAC,EAAE,MAAQf,EAAiB,YAC3D2C,EAAYJ,EAAc1B,CAAC,EAAEE,EAAI,CAAC,EAAE,MAAQf,EAAiB,YAE7D4C,EAAY,KAAK,IAAIH,EAAeC,EAASC,CAAS,EACxDE,EAA4C,OAE5CD,IAAcH,EACdI,EAAgB,WACTD,IAAcF,IACrBG,EAAgB,MAGpBN,EAAc1B,CAAC,EAAEE,CAAC,EAAI,CAAE,UAAW8B,EAAe,MAAOD,CAAU,CACvE,CAIJ,OAAOX,EAAmBM,EAAeJ,EAASC,CAAO,CAC7D,ECvPA,IAAMU,EAAmB,KAcZC,EAAuBC,GACzBC,EAAS,sBAAsB,KAAKD,CAAI,EAI7CE,EAAkB,IAAI,KAAK,aAAa,OAAO,EAY/CC,EAAkBC,GACbF,EAAgB,OAAOE,CAAG,EAc/BC,EAAeC,IACiC,CAC9C,EAAK,SACL,EAAK,SACL,IAAK,SACL,EAAG,SACH,EAAG,SACH,EAAG,SACH,EAAG,QACP,GACsBA,CAAI,GAAKA,EAc7BC,EAAkBC,GAA8B,CAClD,IAAMC,EAAoC,CACtC,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,IACL,SAAK,GACT,EACMC,EAASF,EAAU,QAAQ,QAAS,EAAE,EACxCG,EAAS,GACb,QAAWL,KAAQI,EACfC,GAAUF,EAAOH,CAAI,EAEzB,IAAMM,EAAS,SAASD,EAAQ,EAAE,EAClC,OAAO,MAAMC,CAAM,EAAI,EAAIA,CAC/B,EAyBMC,EAAqBC,GAAsB,CAC7C,IAAMC,EAAyBD,EAC1B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,oBAAoB,GAAK,CAAC,CAAC,EAE/DgB,EAA8BH,EAC/B,OAAQE,GAAM,CAACA,EAAE,UAAU,EAC3B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,yBAAyB,GAAK,CAAC,CAAC,EAEpEiB,EAA8BJ,EAC/B,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,4BAA4B,GAAK,CAAC,CAAC,EAEvEkB,EAAmCL,EACpC,OAAQE,GAAMA,EAAE,UAAU,EAC1B,QAASA,GAAMA,EAAE,KAAK,MAAMf,EAAS,i
CAAiC,GAAK,CAAC,CAAC,EAE5EmB,EAAuBH,EAA4B,IAAKI,GAC1DA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEMgB,EAA2BH,EAAiC,IAAKE,GACnEA,EAAI,QAAQ,WAAaf,GAASD,EAAYC,CAAI,CAAC,CACvD,EAEA,MAAO,CACH,eAAgB,CAAC,GAAGS,EAAwB,GAAGK,CAAoB,EACnE,mBAAoB,CAAC,GAAGF,EAA6B,GAAGI,CAAwB,EAChF,kBAAmBL,EACnB,uBAAwBE,CAC5B,CACJ,EAgBMI,EAAkB,CAACT,EAAmBU,IAAqD,CAE7F,GAD2BV,EAAM,KAAMW,GAAS1B,EAAoB0B,EAAK,IAAI,CAAC,EACtD,MAAO,GAE/B,IAAMC,EAAU,IAAI,IAAIF,EAAW,cAAc,EAC3CG,EAAc,IAAI,IAAIH,EAAW,kBAAkB,EACzD,GAAIE,EAAQ,OAASC,EAAY,KAAM,MAAO,GAG9C,QAAWN,KAAOK,EACd,GAAI,CAACC,EAAY,IAAIN,CAAG,EACpB,MAAO,GAIf,MAAO,EACX,EAmBaO,EAAyCd,GAAoB,CACtE,IAAMe,EAAoBhB,EAAkBC,CAAK,EAEjD,GAAI,CAACS,EAAgBT,EAAOe,CAAiB,EACzC,OAAOf,EAIX,IAAMgB,EAAiBhB,EAAM,IAAKW,GAAS,CACvC,IAAIM,EAAcN,EAAK,KAEjBO,EAAW,gBACjB,OAAAD,EAAcA,EAAY,QAAQC,EAAWC,GAElCA,EAAM,QAAQ,WAAa3B,GAASD,EAAYC,CAAI,CAAC,CAC/D,EACM,CAAE,GAAGmB,EAAM,KAAMM,CAAY,CACxC,CAAC,EAGKG,EAAkBrB,EAAkBiB,CAAc,EAGlDK,EAAa,IAAI,IAAID,EAAgB,cAAc,EACnDE,EAAiB,IAAI,IAAIF,EAAgB,kBAAkB,EAE3DG,EAAiB,CAAC,GAAG,IAAI,IAAIH,EAAgB,cAAc,CAAC,EAC5DI,EAAqB,CAAC,GAAG,IAAI,IAAIJ,EAAgB,kBAAkB,CAAC,EAGpEK,EAAuBF,EAAe,OAAQhB,GAAQ,CAACe,EAAe,IAAIf,CAAG,CAAC,EAE9EmB,EAAsBF,EAAmB,OAAQjB,GAAQ,CAACc,EAAW,IAAId,CAAG,CAAC,EAG7EoB,EAAU,CAAC,GAAGN,EAAY,GAAGC,CAAc,EAE3CM,EAAmB,CAAE,OADTD,EAAQ,OAAS,EAAI,KAAK,IAAI,EAAG,GAAGA,EAAQ,IAAKpB,GAAQd,EAAec,CAAG,CAAC,CAAC,EAAI,GACrD,CAAE,EAGhD,OAAOS,EAAe,IAAKL,GAAS,CAChC,GAAI,CAACA,EAAK,KAAK,SAAS3B,CAAgB,EACpC,OAAO2B,EAEX,IAAIM,EAAcN,EAAK,KAEvB,OAAAM,EAAcA,EAAY,QAAQ,QAAS,IAAM,CAC7C,GAAIN,EAAK,WAAY,CACjB,IAAMkB,EAAeJ,EAAqB,MAAM,EAChD,GAAII,EAAc,OAAOA,CAC7B,KAAO,CAEH,IAAMA,EAAeH,EAAoB,MAAM,EAC/C,GAAIG,EAAc,OAAOA,CAC7B,CAGA,IAAMC,EAAS,IAAIzC,EAAeuC,EAAiB,KAAK,CAAC,IACzD,OAAAA,EAAiB,QACVE,CACX,CAAC,EAEM,CAAE,GAAGnB,EAAM,KAAMM,CAAY,CACxC,CAAC,CACL,EClPA,IAAMc,EAAmB,CACrBC,EACAC,EACA,CAAE,oBAAAC,EAAqB,YAAAC,CAAY,IACxB,CAEX,GAAIH,IAAkB,KAClB,MAAO,CAACC,CAAS,EAErB,GAAIA,IAAa,KACb,MAAO,CAACD,CAAa,EAIzB,GAAII,EAAoBJ,CAAa,IAAMI,EAAoBH,CAAQ,EACnE,MAAO,CAACD,CAAa,EAIzB,IA
AMK,EAASC,EAAwBN,EAAeC,CAAQ,EAC9D,GAAII,EAAQ,OAAOA,EAGnB,IAAME,EAAiBC,EAA0BR,EAAeC,CAAQ,EACxE,GAAIM,EAAgB,OAAOA,EAG3B,GAAIJ,EAAY,SAASH,CAAa,GAAKG,EAAY,SAASF,CAAQ,EAAG,CACvE,IAAMQ,EAAaN,EAAY,KAAMO,GAAWA,IAAWV,GAAiBU,IAAWT,CAAQ,EAC/F,OAAOQ,EAAa,CAACA,CAAU,EAAI,CAACT,CAAa,CACrD,CAGA,IAAMW,EAAqBP,EAAoBJ,CAAa,EACtDY,EAAgBR,EAAoBH,CAAQ,EAGlD,MAAO,CAFYY,EAAoBF,EAAoBC,CAAa,EAEnDV,EAAsBF,EAAgBC,CAAQ,CACvE,EAWMa,EAAwB,CAACC,EAAkBC,IAA8C,CAC3F,GAAID,EAAO,SAAW,EAClB,OAAOA,EAGX,IAAMV,EAAmB,CAAC,EAE1B,QAAWY,KAAgBF,EAAQ,CAC/B,GAAIV,EAAO,SAAW,EAAG,CACrBA,EAAO,KAAKY,CAAY,EACxB,QACJ,CAEA,IAAMC,EAAgBb,EAAO,GAAG,EAAE,EAGlC,GAAIc,EAA6BD,EAAeD,EAAcD,CAAuB,EAAG,CAEhFC,EAAa,OAASC,EAAc,SACpCb,EAAOA,EAAO,OAAS,CAAC,EAAIY,GAEhC,QACJ,CAGIG,EAAqBf,EAAQa,EAAeD,CAAY,GAI5DZ,EAAO,KAAKY,CAAY,CAC5B,CAEA,OAAOZ,CACX,EAYagB,EAAuB,CAACC,EAAsBC,EAAiBC,IAAoC,CAC5G,IAAMC,EAAiBC,EAAaJ,EAAcE,EAAQ,WAAW,EAC/DG,EAAYD,EAAaH,EAASC,EAAQ,WAAW,EAWrDI,EAReC,EACjBJ,EACAE,EACAH,EAAQ,YACRA,EAAQ,mBACZ,EAGkC,QAAQ,CAAC,CAACM,EAAUC,CAAG,IAAMhC,EAAiB+B,EAAUC,EAAKP,CAAO,CAAC,EAKvG,OAFoBV,EAAsBc,EAAcJ,EAAQ,uBAAuB,EAEpE,KAAK,GAAG,CAC/B,EAEaQ,EAAU,CACnBF,EACAG,EACA,CACI,wBAAAjB,EAA0B,GAC1B,oBAAAd,EAAsB,GACtB,YAAAC,CACJ,IAEOkB,EAAqBS,EAAUG,EAAY,CAAE,wBAAAjB,EAAyB,oBAAAd,EAAqB,YAAAC,CAAY,CAAC","names":["PATTERNS","normalizeArabicText","text","extractDigits","match","tokenizeText","preserveSymbols","processedText","symbol","symbolRegex","handleFootnoteFusion","result","previousToken","currentToken","prevIsStandalone","currHasEmbedded","currIsStandalone","prevHasEmbedded","prevDigits","currDigits","handleFootnoteSelection","tokenA","tokenB","aHasEmbedded","bHasEmbedded","handleStandaloneFootnotes","aIsFootnote","bIsFootnote","ALIGNMENT_SCORES","calculateLevenshteinDistance","textA","textB","lengthA","lengthB","shorter","longer","shortLen","longLen","previousRow","_","index","i","currentRow","j","substitutionCost","minCost","calculateSimilarity","maxLength","distance","areSimilarAfterNormalization","threshold","normalizedA","normalizeA
rabicText","normalizedB","calculateAlignmentScore","tokenA","tokenB","typoSymbols","similarityThreshold","isTypoSymbol","isHighlySimilar","backtrackAlignment","matrix","tokensA","tokensB","alignment","alignTokenSequences","scoringMatrix","alignmentScore","diagonalScore","upScore","leftScore","bestScore","bestDirection","INVALID_FOOTNOTE","hasInvalidFootnotes","text","PATTERNS","arabicFormatter","numberToArabic","num","ocrToArabic","char","arabicToNumber","arabicStr","lookup","digits","numStr","parsed","extractReferences","lines","arabicReferencesInBody","b","ocrConfusedReferencesInBody","arabicReferencesInFootnotes","ocrConfusedReferencesInFootnotes","convertedOcrBodyRefs","ref","convertedOcrFootnoteRefs","needsCorrection","references","line","bodySet","footnoteSet","correctReferences","initialReferences","sanitizedLines","updatedText","ocrRegex","match","cleanReferences","bodyRefSet","footnoteRefSet","uniqueBodyRefs","uniqueFootnoteRefs","bodyRefsForFootnotes","footnoteRefsForBody","allRefs","referenceCounter","availableRef","newRef","selectBestTokens","originalToken","altToken","similarityThreshold","typoSymbols","normalizeArabicText","result","handleFootnoteSelection","footnoteResult","handleStandaloneFootnotes","typoSymbol","symbol","normalizedOriginal","normalizedAlt","calculateSimilarity","removeDuplicateTokens","tokens","highSimilarityThreshold","currentToken","previousToken","areSimilarAfterNormalization","handleFootnoteFusion","processTextAlignment","originalText","altText","options","originalTokens","tokenizeText","altTokens","mergedTokens","alignTokenSequences","original","alt","fixTypo","correction"]}
|