baburchi 1.7.0 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,2650 @@
1
- var zt="\u0627\u0647\u0640",h={arabicCharacters:/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,arabicDigits:/[0-9\u0660-\u0669]+/,arabicFootnoteReferenceRegex:/^\([\u0660-\u0669]+\)/g,arabicLettersAndDigits:/[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,arabicPunctuationAndWhitespace:/[\s\u060C\u061B\u061F\u06D4]+/,arabicReferenceRegex:/\([\u0660-\u0669]+\)/g,footnoteEmbedded:/\([0-9\u0660-\u0669]+\)/,footnoteStandalone:/^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,invalidReferenceRegex:/\(\)|\([.1OV9]+\)/g,ocrConfusedFootnoteReferenceRegex:/^\([.1OV9]+\)/g,ocrConfusedReferenceRegex:/\([.1OV9]+\)/g,whitespace:/\s+/},N=e=>{let t=e.match(h.arabicDigits);return t?t[0]:""},w=(e,t=[])=>{let n=e;for(let r of t){let o=new RegExp(r,"g");n=n.replace(o,` ${r} `)}return n.trim().split(h.whitespace).filter(Boolean)},k=(e,t,n)=>{let r=h.footnoteStandalone.test(t),o=h.footnoteEmbedded.test(n),s=h.footnoteStandalone.test(n),a=h.footnoteEmbedded.test(t),i=N(t),c=N(n);return r&&o&&i===c?(e[e.length-1]=n,!0):!!(a&&s&&i===c)},W=(e,t)=>{let n=h.footnoteEmbedded.test(e),r=h.footnoteEmbedded.test(t);return n&&!r?[e]:r&&!n?[t]:n&&r?[e.length<=t.length?e:t]:null},$=(e,t)=>{let n=h.footnoteStandalone.test(e),r=h.footnoteStandalone.test(t);return n&&!r?[e,t]:r&&!n?[t,e]:n&&r?[e.length<=t.length?e:t]:null},D=e=>e.replace(/\s*\(\u00AC[\u0660-\u0669]+\)\s*/g," ").replace(/ +/g," ").trim(),j=e=>e.replace(/\s*\([٠-٩]{1}(\s+[\u0600-\u06FF])?\)\s*/g," ").replace(/ +/g," ").trim(),wt=e=>e.replace(/([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu,"$1 \u0647\u0640"),vt=e=>e.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu,"$1\u0627\u0647\u0640");var 
se=/\s+/g,Z=/\u0640/g,ae=/[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,ie=/[أإآٱ]/g,ce=/\u0649/g,le=/\u0629/g,ue=/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g,fe=/[A-Za-z]+[0-9]*|[0-9]+|[¬§`=]|[/]{2,}|[&]|[ﷺ]/g,me=/[^\u0621-\u063A\u0641-\u064A\u0671\u067E\u0686\u06A4-\u06AF\u06CC\u06D2\u06D3]/g,ge=/[^\u0621-\u063A\u0641-\u064A\u0671\u067E\u0686\u06A4-\u06AF\u06CC\u06D2\u06D3\s]/g,pe=e=>e===32,de=e=>e>=48&&e<=57||e>=1632&&e<=1641,he=e=>e.replace(Z,(t,n,r)=>{let o=n-1;for(;o>=0&&pe(r.charCodeAt(o));)o--;if(o>=0){let s=r.charCodeAt(o);if(de(s)||s===1607)return"\u0640"}return""}),be=e=>e.replace(/([0-9\u0660-\u0669][0-9\u0660-\u0669/\-\s]*?)\s*ه(?:ـ)?(?=(?:\s|$|[^\p{L}\p{N}]))/gu,"$1"),Ae=(e,t)=>t&&e.normalize?e.normalize("NFC"):e,xe=(e,t,n)=>t?e.replace(ue,n?" ":""):e,ye=(e,t,n)=>(t&&(e=e.replace(ae,"")),n==="safe"?he(e):n==="all"?e.replace(Z,""):e),Se=(e,t,n,r)=>(t&&(e=e.replace(ie,"\u0627")),n&&(e=e.replace(ce,"\u064A")),r&&(e=e.replace(le,"\u0647")),e),Me=(e,t)=>t?e.replace(fe," "):e,Ce=(e,t,n)=>t?e.replace(ge," "):n?e.replace(me,""):e,Te=(e,t,n)=>(t&&(e=e.replace(se," ")),n&&(e=e.trim()),e),b=(e,t)=>t===void 0?e:!!t,Re=(e,t)=>t===void 
0?e:t===!0?"safe":t===!1?!1:t,G={aggressive:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!0,nfc:!0,normalizeAlif:!0,removeHijriMarker:!0,replaceAlifMaqsurah:!0,replaceTaMarbutahWithHa:!0,stripDiacritics:!0,stripFootnotes:!0,stripLatinAndSymbols:!0,stripTatweel:"all",stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1},light:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!0,normalizeAlif:!1,removeHijriMarker:!1,replaceAlifMaqsurah:!1,replaceTaMarbutahWithHa:!1,stripDiacritics:!1,stripFootnotes:!1,stripLatinAndSymbols:!1,stripTatweel:!1,stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1},search:{collapseWhitespace:!0,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!0,normalizeAlif:!0,removeHijriMarker:!0,replaceAlifMaqsurah:!0,replaceTaMarbutahWithHa:!1,stripDiacritics:!0,stripFootnotes:!0,stripLatinAndSymbols:!1,stripTatweel:"all",stripZeroWidth:!0,trim:!0,zeroWidthToSpace:!1}},Fe={collapseWhitespace:!1,keepOnlyArabicLetters:!1,lettersAndSpacesOnly:!1,nfc:!1,normalizeAlif:!1,removeHijriMarker:!1,replaceAlifMaqsurah:!1,replaceTaMarbutahWithHa:!1,stripDiacritics:!1,stripFootnotes:!1,stripLatinAndSymbols:!1,stripTatweel:!1,stripZeroWidth:!1,trim:!1,zeroWidthToSpace:!1},d=(e,t="search")=>{if(!e)return"";let n,r=null;if(typeof t=="string")n=G[t];else{let T=t.base??"light";n=T==="none"?Fe:G[T],r=t}let 
o=b(n.nfc,r?.nfc),s=b(n.stripZeroWidth,r?.stripZeroWidth),a=b(n.zeroWidthToSpace,r?.zeroWidthToSpace),i=b(n.stripDiacritics,r?.stripDiacritics),c=b(n.stripFootnotes,r?.stripFootnotes),u=b(n.normalizeAlif,r?.normalizeAlif),l=b(n.replaceAlifMaqsurah,r?.replaceAlifMaqsurah),f=b(n.replaceTaMarbutahWithHa,r?.replaceTaMarbutahWithHa),m=b(n.stripLatinAndSymbols,r?.stripLatinAndSymbols),g=b(n.lettersAndSpacesOnly,r?.lettersAndSpacesOnly),A=b(n.keepOnlyArabicLetters,r?.keepOnlyArabicLetters),M=b(n.collapseWhitespace,r?.collapseWhitespace),x=b(n.trim,r?.trim),C=b(n.removeHijriMarker,r?.removeHijriMarker),z=Re(n.stripTatweel,r?.stripTatweel),p=e;return p=Ae(p,o),p=xe(p,s,a),C&&(p=be(p)),p=ye(p,i,z),p=Se(p,u,l,f),c&&(p=D(p),p=j(p)),g||(p=Me(p,m)),p=Ce(p,g,A),p=Te(p,M,x),p};var V=(e,t)=>{let n=e.length,r=t.length;if(n===0)return r;if(r===0)return n;let[o,s]=n<=r?[e,t]:[t,e],a=o.length,i=s.length,c=Array.from({length:a+1},(u,l)=>l);for(let u=1;u<=i;u++){let l=[u];for(let f=1;f<=a;f++){let m=s[u-1]===o[f-1]?0:1,g=Math.min(c[f]+1,l[f-1]+1,c[f-1]+m);l.push(g)}c=l}return c[a]},Pe=(e,t,n)=>Math.abs(e.length-t.length)>n?n+1:e.length===0?t.length<=n?t.length:n+1:t.length===0?e.length<=n?e.length:n+1:null,Ee=e=>{let t=new Int16Array(e+1),n=new Int16Array(e+1);for(let r=0;r<=e;r++)t[r]=r;return[t,n]},ze=(e,t,n)=>({from:Math.max(1,e-t),to:Math.min(n,e+t)}),we=(e,t,n,r,o,s)=>{let a=e[n-1]===t[r-1]?0:1,i=o[r]+1,c=s[r-1]+1,u=o[r-1]+a;return Math.min(i,c,u)},ve=(e,t,n,r,o,s)=>{let a=t.length,i=r+1,{from:c,to:u}=ze(n,r,a);s[0]=n;let l=n;for(let f=1;f<c;f++)s[f]=i;for(let f=u+1;f<=a;f++)s[f]=i;for(let f=c;f<=u;f++){let m=we(e,t,n,f,o,s);s[f]=m,m<l&&(l=m)}return l},v=(e,t,n)=>{let r=n+1,o=Pe(e,t,n);if(o!==null)return o;if(e.length>t.length)return v(t,e,n);let[s,a]=Ee(t.length);for(let i=1;i<=e.length;i++){if(ve(e,t,i,n,s,a)>n)return r;let u=s;s=a,a=u}return s[t.length]<=n?s[t.length]:r};var y={GAP_PENALTY:-1,MISMATCH_PENALTY:-2,PERFECT_MATCH:2,SOFT_MATCH:1},S=(e,t)=>{let 
n=Math.max(e.length,t.length)||1,r=V(e,t);return(n-r)/n},R=(e,t,n=.6)=>{let r=d(e),o=d(t);return S(r,o)>=n},Lt=(e,t,n,r)=>{let o=d(e),s=d(t);if(o===s)return y.PERFECT_MATCH;let a=n.includes(e)||n.includes(t),i=S(o,s)>=r;return a||i?y.SOFT_MATCH:y.MISMATCH_PENALTY},He=(e,t,n)=>{let r=[],o=t.length,s=n.length;for(;o>0||s>0;)switch(e[o][s].direction){case"diagonal":r.push([t[--o],n[--s]]);break;case"left":r.push([null,n[--s]]);break;case"up":r.push([t[--o],null]);break;default:throw new Error("Invalid alignment direction")}return r.reverse()},Ie=(e,t)=>{let n=Array.from({length:e+1},()=>Array.from({length:t+1},()=>({direction:null,score:0})));for(let r=1;r<=e;r++)n[r][0]={direction:"up",score:r*y.GAP_PENALTY};for(let r=1;r<=t;r++)n[0][r]={direction:"left",score:r*y.GAP_PENALTY};return n},Oe=(e,t,n)=>{let r=Math.max(e,t,n);return r===e?{direction:"diagonal",score:r}:r===t?{direction:"up",score:r}:{direction:"left",score:r}},Y=(e,t,n,r)=>{let o=e.length,s=t.length,a=Ie(o,s),i=new Set(n),c=e.map(l=>d(l)),u=t.map(l=>d(l));for(let l=1;l<=o;l++)for(let f=1;f<=s;f++){let m=c[l-1],g=u[f-1],A;if(m===g)A=y.PERFECT_MATCH;else{let T=i.has(e[l-1])||i.has(t[f-1]),oe=S(m,g)>=r;A=T||oe?y.SOFT_MATCH:y.MISMATCH_PENALTY}let M=a[l-1][f-1].score+A,x=a[l-1][f].score+y.GAP_PENALTY,C=a[l][f-1].score+y.GAP_PENALTY,{direction:z,score:p}=Oe(M,x,C);a[l][f]={direction:z,score:p}}return He(a,e,t)};var $t=(e,t)=>{let n=[],r=0;for(let o of e){if(r>=t.length)break;if(o){let{result:s,segmentsConsumed:a}=qe(o,t,r);s&&n.push(s),r+=a}else n.push(t[r]),r++}return r<t.length&&n.push(...t.slice(r)),n},Be=(e,t,n)=>{let r=`${t} ${n}`,o=`${n} ${t}`,s=d(e),a=S(s,d(r)),i=S(s,d(o));return a>=i?r:o},qe=(e,t,n)=>{let r=t[n];if(R(e,r))return{result:r,segmentsConsumed:1};let o=t[n],s=t[n+1];return!o||!s?o?{result:o,segmentsConsumed:1}:{result:"",segmentsConsumed:0}:{result:Be(e,o,s),segmentsConsumed:2}};var U=e=>{let t=[],n=0,r=-1;for(let s=0;s<e.length;s++)e[s]==='"'&&(n++,r=s);let 
o=n%2===0;return!o&&r!==-1&&t.push({char:'"',index:r,reason:"unmatched",type:"quote"}),{errors:t,isBalanced:o}},_e={"\xAB":"\xBB","(":")","[":"]","{":"}"},Le=new Set(["\xAB","(","[","{"]),Ne=new Set(["\xBB",")","]","}"]),X=e=>{let t=[],n=[];for(let r=0;r<e.length;r++){let o=e[r];if(Le.has(o))n.push({char:o,index:r});else if(Ne.has(o)){let s=n.pop();s?_e[s.char]!==o&&(t.push({char:s.char,index:s.index,reason:"mismatched",type:"bracket"}),t.push({char:o,index:r,reason:"mismatched",type:"bracket"})):t.push({char:o,index:r,reason:"unmatched",type:"bracket"})}}return n.forEach(({char:r,index:o})=>{t.push({char:r,index:o,reason:"unclosed",type:"bracket"})}),{errors:t,isBalanced:t.length===0}},Q=e=>{let t=U(e),n=X(e);return{errors:[...t.errors,...n.errors].sort((r,o)=>r.index-o.index),isBalanced:t.isBalanced&&n.isBalanced}},jt=e=>{let t=[],n=e.split(`
2
- `),r=0;return n.forEach((o,s)=>{if(o.length>10){let a=Q(o);a.isBalanced||a.errors.forEach(i=>{t.push({absoluteIndex:r+i.index,char:i.char,reason:i.reason,type:i.type})})}r+=o.length+(s<n.length-1?1:0)}),t},Gt=e=>U(e).isBalanced,Zt=e=>X(e).isBalanced,Vt=e=>Q(e).isBalanced;var ke="()",We=e=>h.invalidReferenceRegex.test(e),$e=new Intl.NumberFormat("ar-SA"),De=e=>$e.format(e),H=e=>({1:"\u0661",9:"\u0669",".":"\u0660",O:"\u0665",o:"\u0665",V:"\u0667",v:"\u0667"})[e]||e,je=e=>{let t={"\u0660":"0","\u0661":"1","\u0662":"2","\u0663":"3","\u0664":"4","\u0665":"5","\u0666":"6","\u0667":"7","\u0668":"8","\u0669":"9"},n=e.replace(/[()]/g,""),r="";for(let s of n)r+=t[s];let o=parseInt(r,10);return isNaN(o)?0:o},K=e=>{let t=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(h.arabicReferenceRegex)||[]),n=e.filter(i=>!i.isFootnote).flatMap(i=>i.text.match(h.ocrConfusedReferenceRegex)||[]),r=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(h.arabicFootnoteReferenceRegex)||[]),o=e.filter(i=>i.isFootnote).flatMap(i=>i.text.match(h.ocrConfusedFootnoteReferenceRegex)||[]),s=n.map(i=>i.replace(/[.1OV9]/g,c=>H(c))),a=o.map(i=>i.replace(/[.1OV9]/g,c=>H(c)));return{bodyReferences:[...t,...s],footnoteReferences:[...r,...a],ocrConfusedInBody:n,ocrConfusedInFootnotes:o}},Ge=(e,t)=>{if(e.some(s=>We(s.text)))return!0;let r=new Set(t.bodyReferences),o=new Set(t.footnoteReferences);if(r.size!==o.size)return!0;for(let s of r)if(!o.has(s))return!0;return!1},Xt=e=>{let t=K(e);if(!Ge(e,t))return e;let n=e.map(g=>{let A=g.text,M=/\([.1OV9]+\)/g;return A=A.replace(M,x=>x.replace(/[.1OV9]/g,C=>H(C))),{...g,text:A}}),r=K(n),o=new Set(r.bodyReferences),s=new Set(r.footnoteReferences),a=[...new Set(r.bodyReferences)],i=[...new Set(r.footnoteReferences)],c=a.filter(g=>!s.has(g)),u=i.filter(g=>!o.has(g)),l=[...o,...s],m={count:(l.length>0?Math.max(0,...l.map(g=>je(g))):0)+1};return n.map(g=>{if(!g.text.includes(ke))return g;let A=g.text;return A=A.replace(/\(\)/g,()=>{if(g.isFootnote){let 
x=c.shift();if(x)return x}else{let x=u.shift();if(x)return x}let M=`(${De(m.count)})`;return m.count++,M}),{...g,text:A}})};var F=class{next=new Map;link=0;out=[]},I=class{nodes=[new F];add(t,n){let r=0;for(let o=0;o<t.length;o++){let s=t[o],a=this.nodes[r].next.get(s);a===void 0&&(a=this.nodes.length,this.nodes[r].next.set(s,a),this.nodes.push(new F)),r=a}this.nodes[r].out.push(n)}build(){let t=[];for(let[,n]of this.nodes[0].next)this.nodes[n].link=0,t.push(n);for(let n=0;n<t.length;n++){let r=t[n];for(let[o,s]of this.nodes[r].next){t.push(s);let a=this.nodes[r].link;for(;a!==0&&!this.nodes[a].next.has(o);)a=this.nodes[a].link;let i=this.nodes[a].next.get(o);this.nodes[s].link=i===void 0?0:i;let c=this.nodes[this.nodes[s].link].out;c.length&&this.nodes[s].out.push(...c)}}}find(t,n){let r=0;for(let o=0;o<t.length;o++){let s=t[o];for(;r!==0&&!this.nodes[r].next.has(s);)r=this.nodes[r].link;let a=this.nodes[r].next.get(s);if(r=a===void 0?0:a,this.nodes[r].out.length)for(let i of this.nodes[r].out)n(i,o+1)}}},P=e=>{let t=new I;for(let n=0;n<e.length;n++){let r=e[n];r.length>0&&t.add(r,n)}return t.build(),t};var O={enableFuzzy:!0,gramsPerExcerpt:5,log:()=>{},maxCandidatesPerExcerpt:40,maxEditAbs:3,maxEditRel:.1,q:4,seamLen:512};var J=200,Ze=80;function B(e){let t=[],n=[],r=[],o=0;for(let s=0;s<e.length;s++){let a=e[s];n.push(o),r.push(a.length),t.push(a),o+=a.length,s+1<e.length&&(t.push(" "),o+=1)}return{book:t.join(""),lens:r,starts:n}}function q(e,t){let n=0,r=t.length-1,o=0;for(;n<=r;){let s=n+r>>1;t[s]<=e?(o=s,n=s+1):r=s-1}return o}function ee(e,t,n,r,o){let s=P(n),a=new Int32Array(o).fill(-1),i=new Uint8Array(o);return s.find(e,(c,u)=>{let l=n[c],f=u-l.length,m=q(f,t);for(let g of r[c])i[g]||(a[g]=m,i[g]=1)}),{result:a,seenExact:i}}function _(e){let t=new Map,n=[],r=[];for(let o=0;o<e.length;o++){let s=e[o],a=t.get(s);a===void 0?(a=r.length,t.set(s,a),r.push(s),n.push([o])):n[a].push(o)}return{keyToPatId:t,patIdToOrigIdxs:n,patterns:r}}var L=(e,t,n,r,o)=>{let 
s=e.length,a=Math.min(o,Math.max(6,Math.ceil(s*.12))),i=Math.floor(a/2),c=t.start-i,u=t.seam?r[t.page]?.text:n[t.page];if(!u)return null;let l=Ve(t,n,r,c,s,a),f=Ke(l,t,u,c,s,a),m=Je(t,u,c,s,a,o);return et(f,e,m)},Ve=(e,t,n,r,o,s)=>(a=0,i=0)=>e.seam?Ye(n,e.page,r,o,s):Ue(t,e.page,r,o,s,a,i),Ye=(e,t,n,r,o)=>{let s=e[t]?.text;if(!s)return null;let a=Math.max(0,n),i=r+o,c=Math.min(s.length,a+i);return c>a?s.slice(a,c):null},Ue=(e,t,n,r,o,s,a)=>{let i=e[t];if(!i)return null;let c=r+o,u=n,l="";if(u<0){let m=Math.max(0,-u-a);m>0&&(l+=Xe(e,t,m)),u=0}let f=Math.min(i.length-s,Math.max(0,u)+c-l.length);return f>u&&(l+=i.slice(Math.max(0,u),f)),l+=Qe(e,t,c-l.length),l.length?l:null},Xe=(e,t,n)=>{let r=n,o=t-1,s=[];for(;r>0&&o>=0;){let a=e[o];if(!a)break;let i=Math.min(r,a.length),c=a.slice(a.length-i);s.unshift(c),r-=c.length,o--}return s.length?`${s.join(" ")} `:""},Qe=(e,t,n)=>{let r="",o=t+1;for(;n>0&&o<e.length;){let s=e[o];if(!s)break;let a=s.slice(0,n);if(!a.length)break;r+=` ${a}`,n-=a.length,o++}return r},Ke=(e,t,n,r,o,s)=>{let a=[],i=o+s,c=!t.seam&&r+i>n.length,u=!t.seam&&r<0,l=e(0,0);if(l&&a.push(l),c){let f=Math.min(J,Math.max(0,n.length-Math.max(0,r)));if(f>0){let m=e(f,0);m&&a.push(m)}}if(u){let f=e(0,Math.min(J,-r));f&&a.push(f)}return a},Je=(e,t,n,r,o,s)=>{let a=r+o,i=!e.seam&&n+a>t.length,c=!e.seam&&n<0,u=Math.min(2,Math.max(1,Math.ceil(r*.005)));return i||c||e.seam?s+Math.min(Ze,Math.ceil(r*.08)):s+u},et=(e,t,n)=>{let r=null;for(let o of e){let s=v(t,o,n);s<=n&&(r==null||s<r)&&(r=s)}return r==null?null:{acceptance:n,dist:r}};var E=class{q;map=new Map;gramFreq=new Map;constructor(t){this.q=t}addText(t,n,r){let o=this.q,s=n.length;if(!(s<o))for(let a=0;a+o<=s;a++){let i=n.slice(a,a+o),c=this.map.get(i);c||(c=[],this.map.set(i,c)),c.push({page:t,pos:a,seam:r}),this.gramFreq.set(i,(this.gramFreq.get(i)??0)+1)}}pickRare(t,n){n=Math.max(1,Math.floor(n));let r=[],o=new Set,s=this.q;for(let i=0;i+s<=t.length;i++){let c=t.slice(i,i+s);if(o.has(c))continue;o.add(c);let 
u=this.gramFreq.get(c)??2147483647;r.push({freq:u,gram:c,offset:i})}r.sort((i,c)=>i.freq-c.freq);let a=[];for(let i of r)if(this.map.has(i.gram)&&(a.push({gram:i.gram,offset:i.offset}),a.length>=n))return a;if(a.length<n){let i=new Set(a.map(c=>c.gram));for(let c=r.length-1;c>=0&&a.length<n;c--){let u=r[c];this.map.has(u.gram)&&!i.has(u.gram)&&(a.push({gram:u.gram,offset:u.offset}),i.add(u.gram))}}return a}getPostings(t){return this.map.get(t)}};function te(e,t){let n=[];for(let r=0;r+1<e.length;r++){let o=e[r].slice(-t),s=e[r+1].slice(0,t),a=`${o} ${s}`;n.push({startPage:r,text:a})}return n}function ne(e,t,n){let r=new E(n);for(let o=0;o<e.length;o++)r.addText(o,e[o],!1);for(let o=0;o<t.length;o++)r.addText(o,t[o].text,!0);return r}function re(e,t,n){let r=t.pickRare(e,n.gramsPerExcerpt);if(r.length===0)return[];let o=[],s=new Set,a=e.length;e:for(let{gram:i,offset:c}of r){let u=t.getPostings(i);if(u)for(let l of u){let f=l.pos-c;if(f<-Math.floor(a*.25))continue;let m=Math.max(0,f),g=`${l.page}:${m}:${l.seam?1:0}`;if(!s.has(g)&&(o.push({page:l.page,seam:l.seam,start:m}),s.add(g),o.length>=n.maxCandidatesPerExcerpt))break e}}return o}function tt(e,t,n,r,o){if(e.length===0)return null;let s=nt(e,o);o.log("maxDist",s);let a=new Set,i=null;for(let c of t){if(rt(c,a))continue;let u=ot(c,e,n,r,s,o);if(u&&(i=at(i,u,c),o.log("findBest best",i),u.dist===0))break}return i}function nt(e,t){return Math.max(t.maxEditAbs,Math.ceil(t.maxEditRel*e.length))}function rt(e,t){let n=`${e.page}:${e.start}:${e.seam?1:0}`;return t.has(n)?!0:(t.add(n),!1)}function ot(e,t,n,r,o,s){let a=L(t,e,n,r,o),i=a?.dist??null,c=a?.acceptance??o;return s.log("dist",i),st(i,c)?{acceptance:c,dist:i}:null}function st(e,t){return e!==null&&e<=t}function at(e,t,n){let r={dist:t.dist,page:n.page};return e?it(t.dist,n.page,e.dist,e.page)?r:e:r}function it(e,t,n,r){return e<n||e===n&&t<r}function ct(e,t,n,r,o){if(!o.enableFuzzy)return;let s=te(t,o.seamLen),a=ne(t,s,o.q);for(let 
i=0;i<e.length;i++){if(n[i])continue;let c=e[i];if(o.log("excerpt",c),!c||c.length<o.q)continue;let u=re(c,a,o);if(o.log("candidates",u),u.length===0)continue;let l=tt(c,u,t,s,o);o.log("best",l),l&&(r[i]=l.page,n[i]=1)}}function un(e,t,n={}){let r={...O,...n},o=e.map(m=>d(m,"aggressive")),s=t.map(m=>d(m,"aggressive"));n.log&&(n.log("pages",e),n.log("excerpts",t),n.log("pagesN",o),n.log("excerptsN",s));let{patIdToOrigIdxs:a,patterns:i}=_(s),{book:c,starts:u}=B(o),{result:l,seenExact:f}=ee(c,u,i,a,t.length);return n.log&&(n.log("findExactMatches result",l),n.log("seenExact",f)),f.every(m=>m===1)||ct(s,o,f,l,r),n.log&&n.log("performFuzzyMatching result",l),Array.from(l)}function lt(e,t,n,r,o){P(n).find(e,(a,i)=>{let c=n[a],u=i-c.length,l=q(u,t);for(let f of r[a]){let m=o[f],g=m.get(l);(!g||!g.exact)&&m.set(l,{exact:!0,score:1,seam:!1})}})}function ut(e,t,n,r,o,s,a){let i=`${e.page}:${e.start}:${e.seam?1:0}`;if(a.has(i))return;a.add(i);let c=L(t,e,n,r,o);if(!c)return;let{dist:u,acceptance:l}=c;if(u>l)return;let f=1-u/l,m=s.get(e.page);(!m||!m.exact&&f>m.score)&&s.set(e.page,{exact:!1,score:f,seam:e.seam})}function ft(e,t,n,r,o,s,a){if(Array.from(s[e].values()).some(m=>m.exact)||!t||t.length<a.q)return;let c=re(t,o,a);if(c.length===0)return;let u=Math.max(a.maxEditAbs,Math.ceil(a.maxEditRel*t.length)),l=new Set,f=s[e];for(let m of c)ut(m,t,n,r,u,f,l)}function mt(e,t,n,r){let o=te(t,r.seamLen),s=ne(t,o,r.q);for(let a=0;a<e.length;a++)ft(a,e[a],t,o,s,n,r)}var gt=e=>e.size===0?[]:(pt(e),bt(e),xt(e)),pt=e=>{let t=Array.from(e.keys()).sort((n,r)=>n-r);for(let n of t){let r=e.get(n),o=e.get(n+1);if(dt(r,o)){let s=ht(n,r,o);e.delete(s)}}},dt=(e,t)=>!!(e?.seam&&t?.seam),ht=(e,t,n)=>n.score>t.score?e:(n.score<t.score,e+1),bt=e=>{let t=Array.from(e.entries()).filter(([,n])=>n.seam).map(([n])=>n);for(let n of t){let r=e.get(n),o=e.get(n+1);At(r,o)&&e.delete(n)}},At=(e,t)=>t?t.exact||!t.seam&&t.score>=e.score:!1,xt=e=>{let t=[],n=[];for(let r of 
e.entries())r[1].exact?t.push(r):n.push(r);return t.sort((r,o)=>r[0]-o[0]),n.sort((r,o)=>o[1].score-r[1].score||r[0]-o[0]),[...t,...n].map(r=>r[0])};function fn(e,t,n={}){let r={...O,...n},o=e.map(f=>d(f,"aggressive")),s=t.map(f=>d(f,"aggressive"));n.log&&(n.log("pages",e),n.log("excerpts",t),n.log("pagesN",o),n.log("excerptsN",s));let{patIdToOrigIdxs:a,patterns:i}=_(s),{book:c,starts:u}=B(o),l=Array.from({length:t.length},()=>new Map);return lt(c,u,i,a,l),r.enableFuzzy&&mt(s,o,l,r),l.map(f=>gt(f))}var pn=e=>{if(!e||e.trim().length===0)return!0;let t=e.trim(),n=t.length;if(n<2||Mt(t))return!0;let r=yt(t);if(St(r,n))return!0;let o=h.arabicCharacters.test(t);return!o&&/[a-zA-Z]/.test(t)?!0:o?!Rt(r,n):Ct(r,n,t)};function yt(e){let t={arabicCount:0,charFreq:new Map,digitCount:0,latinCount:0,punctuationCount:0,spaceCount:0,symbolCount:0},n=Array.from(e);for(let r of n)t.charFreq.set(r,(t.charFreq.get(r)||0)+1),h.arabicCharacters.test(r)?t.arabicCount++:/\d/.test(r)?t.digitCount++:/[a-zA-Z]/.test(r)?t.latinCount++:/\s/.test(r)?t.spaceCount++:/[.,;:()[\]{}"""''`]/.test(r)?t.punctuationCount++:t.symbolCount++;return t}function St(e,t){let n=0,r=["!",".","-","=","_"];for(let[o,s]of e.charFreq)s>=5&&r.includes(o)&&(n+=s);return n/t>.4}function Mt(e){return[/^[-=_━≺≻\s]*$/,/^[.\s]*$/,/^[!\s]*$/,/^[A-Z\s]*$/,/^[-\d\s]*$/,/^\d+\s*$/,/^[A-Z]\s*$/,/^[—\s]*$/,/^[्र\s-]*$/].some(n=>n.test(e))}function Ct(e,t,n){let r=e.arabicCount+e.latinCount+e.digitCount;return r===0||Tt(e,r,t)?!0:/[٠-٩]/.test(n)&&e.digitCount>=3?!1:(e.symbolCount+Math.max(0,e.punctuationCount-5))/Math.max(r,1)>2||t<=5&&e.arabicCount===0&&!(/^\d+$/.test(n)&&e.digitCount>=3)?!0:/^\d{3,4}$/.test(n)?!1:t<=10}function Tt(e,t,n){let{arabicCount:r,spaceCount:o}=e;return o>0&&t===o+1&&t<=5||n<=10&&o>=2&&r===0||o/n>.6}function Rt(e,t){return e.arabicCount>=3||e.arabicCount>=1&&e.digitCount>0&&t<=20||e.arabicCount>=2&&e.punctuationCount<=2&&t<=10||e.arabicCount>=1&&t<=5&&e.punctuationCount<=1}var 
Ft=(e,t,{similarityThreshold:n,typoSymbols:r})=>{if(e===null)return[t];if(t===null)return[e];if(d(e)===d(t))return[e];let o=W(e,t);if(o)return o;let s=$(e,t);if(s)return s;if(r.includes(e)||r.includes(t)){let u=r.find(l=>l===e||l===t);return u?[u]:[e]}let a=d(e),i=d(t);return[S(a,i)>n?e:t]},Pt=(e,t)=>{if(e.length===0)return e;let n=[];for(let r of e){if(n.length===0){n.push(r);continue}let o=n.at(-1);if(R(o,r,t)){r.length<o.length&&(n[n.length-1]=r);continue}k(n,o,r)||n.push(r)}return n},Et=(e,t,n)=>{let r=w(e,n.typoSymbols),o=w(t,n.typoSymbols),a=Y(r,o,n.typoSymbols,n.similarityThreshold).flatMap(([c,u])=>Ft(c,u,n));return Pt(a,n.highSimilarityThreshold).join(" ")},xn=(e,t,{highSimilarityThreshold:n=.8,similarityThreshold:r=.6,typoSymbols:o})=>Et(e,t,{highSimilarityThreshold:n,similarityThreshold:r,typoSymbols:o});export{_e as BRACKETS,Ne as CLOSE_BRACKETS,zt as INTAHA_ACTUAL,Le as OPEN_BRACKETS,h as PATTERNS,$t as alignTextSegments,Y as alignTokenSequences,yt as analyzeCharacterStats,Zt as areBracketsBalanced,Gt as areQuotesBalanced,R as areSimilarAfterNormalization,He as backtrackAlignment,v as boundedLevenshtein,Lt as calculateAlignmentScore,V as calculateLevenshteinDistance,S as calculateSimilarity,Q as checkBalance,Xt as correctReferences,N as extractDigits,un as findMatches,fn as findMatchesAll,xn as fixTypo,jt as getUnbalancedErrors,k as handleFootnoteFusion,W as handleFootnoteSelection,$ as handleStandaloneFootnotes,St as hasExcessiveRepetition,We as hasInvalidFootnotes,pn as isArabicTextNoise,Vt as isBalanced,Mt as isBasicNoisePattern,Ct as isNonArabicNoise,Tt as isSpacingNoise,Rt as isValidArabicContent,Et as processTextAlignment,D as removeFootnoteReferencesSimple,j as removeSingleDigitFootnoteReferences,d as sanitizeArabic,wt as standardizeHijriSymbol,vt as standardizeIntahaSymbol,w as tokenizeText};
1
+ //#region src/utils/sanitize.ts
2
+ const PRESETS = {
3
+ aggressive: {
4
+ collapseWhitespace: true,
5
+ keepOnlyArabicLetters: false,
6
+ lettersAndSpacesOnly: true,
7
+ nfc: true,
8
+ normalizeAlif: true,
9
+ removeHijriMarker: true,
10
+ replaceAlifMaqsurah: true,
11
+ replaceTaMarbutahWithHa: true,
12
+ stripDiacritics: true,
13
+ stripFootnotes: true,
14
+ stripLatinAndSymbols: true,
15
+ stripTatweel: "all",
16
+ stripZeroWidth: true,
17
+ trim: true,
18
+ zeroWidthToSpace: false
19
+ },
20
+ light: {
21
+ collapseWhitespace: true,
22
+ keepOnlyArabicLetters: false,
23
+ lettersAndSpacesOnly: false,
24
+ nfc: true,
25
+ normalizeAlif: false,
26
+ removeHijriMarker: false,
27
+ replaceAlifMaqsurah: false,
28
+ replaceTaMarbutahWithHa: false,
29
+ stripDiacritics: false,
30
+ stripFootnotes: false,
31
+ stripLatinAndSymbols: false,
32
+ stripTatweel: false,
33
+ stripZeroWidth: true,
34
+ trim: true,
35
+ zeroWidthToSpace: false
36
+ },
37
+ search: {
38
+ collapseWhitespace: true,
39
+ keepOnlyArabicLetters: false,
40
+ lettersAndSpacesOnly: false,
41
+ nfc: true,
42
+ normalizeAlif: true,
43
+ removeHijriMarker: true,
44
+ replaceAlifMaqsurah: true,
45
+ replaceTaMarbutahWithHa: false,
46
+ stripDiacritics: true,
47
+ stripFootnotes: true,
48
+ stripLatinAndSymbols: false,
49
+ stripTatweel: "all",
50
+ stripZeroWidth: true,
51
+ trim: true,
52
+ zeroWidthToSpace: false
53
+ }
54
+ };
55
+ const PRESET_NONE = {
56
+ collapseWhitespace: false,
57
+ keepOnlyArabicLetters: false,
58
+ lettersAndSpacesOnly: false,
59
+ nfc: false,
60
+ normalizeAlif: false,
61
+ removeHijriMarker: false,
62
+ replaceAlifMaqsurah: false,
63
+ replaceTaMarbutahWithHa: false,
64
+ stripDiacritics: false,
65
+ stripFootnotes: false,
66
+ stripLatinAndSymbols: false,
67
+ stripTatweel: false,
68
+ stripZeroWidth: false,
69
+ trim: false,
70
+ zeroWidthToSpace: false
71
+ };
72
+ const CHAR_SPACE = 32;
73
+ const CHAR_TATWEEL = 1600;
74
+ const CHAR_HA = 1607;
75
+ const CHAR_YA = 1610;
76
+ const CHAR_WAW = 1608;
77
+ const CHAR_ALIF = 1575;
78
+ const CHAR_ALIF_MADDA = 1570;
79
+ const CHAR_ALIF_HAMZA_ABOVE = 1571;
80
+ const CHAR_WAW_HAMZA_ABOVE = 1572;
81
+ const CHAR_ALIF_HAMZA_BELOW = 1573;
82
+ const CHAR_YEH_HAMZA_ABOVE = 1574;
83
+ const CHAR_ALIF_WASLA = 1649;
84
+ const CHAR_ALIF_MAQSURAH = 1609;
85
+ const CHAR_TA_MARBUTAH = 1577;
86
+ const CHAR_MADDA_ABOVE = 1619;
87
+ const CHAR_HAMZA_ABOVE_MARK = 1620;
88
+ const CHAR_HAMZA_BELOW_MARK = 1621;
89
+ let sharedBuffer = new Uint16Array(2048);
90
+ const decoder = new TextDecoder("utf-16le");
91
+ const isDiacritic = (code) => {
92
+ return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code === 1648 || code >= 1750 && code <= 1773;
93
+ };
94
+ const isZeroWidth = (code) => {
95
+ return code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
96
+ };
97
+ const isLatinOrDigit = (code) => {
98
+ return code >= 65 && code <= 90 || code >= 97 && code <= 122 || code >= 48 && code <= 57;
99
+ };
100
+ const isSymbol = (code) => {
101
+ return code === 172 || code === 167 || code === 96 || code === 61 || code === 38 || code === 65018;
102
+ };
103
+ const isArabicLetter = (code) => {
104
+ return code >= 1569 && code <= 1594 || code >= 1601 && code <= 1610 || code === 1649 || code === 1662 || code === 1670 || code >= 1700 && code <= 1711 || code === 1740 || code === 1746 || code === 1747;
105
+ };
106
+ /**
107
+ * Checks whether a code point represents a Western or Arabic-Indic digit.
108
+ *
109
+ * @param code - The numeric code point to evaluate.
110
+ * @returns True when the code point is a digit in either numeral system.
111
+ */
112
+ const isDigit = (code) => code >= 48 && code <= 57 || code >= 1632 && code <= 1641;
113
+ /**
114
+ * Resolves a boolean by taking an optional override over a preset value.
115
+ *
116
+ * @param presetValue - The value defined by the preset.
117
+ * @param override - Optional override provided by the caller.
118
+ * @returns The resolved boolean value.
119
+ */
120
+ const resolveBoolean = (presetValue, override) => override === void 0 ? presetValue : !!override;
121
+ /**
122
+ * Resolves the tatweel mode by taking an optional override over a preset mode.
123
+ * An override of `true` maps to `'safe'` for convenience.
124
+ *
125
+ * @param presetValue - The mode specified by the preset.
126
+ * @param override - Optional override provided by the caller.
127
+ * @returns The resolved tatweel mode.
128
+ */
129
+ const resolveTatweelMode = (presetValue, override) => {
130
+ if (override === void 0) return presetValue;
131
+ if (override === true) return "safe";
132
+ if (override === false) return false;
133
+ return override;
134
+ };
135
+ /**
136
+ * Internal sanitization logic that applies all transformations to a single string.
137
+ * Uses single-pass character transformation for maximum performance when possible.
138
+ * This function assumes all options have been pre-resolved for maximum performance.
139
+ */
140
+ const applySanitization = (input, options) => {
141
+ if (!input) return "";
142
+ const { nfc, stripZW, zwAsSpace, removeHijri, removeDia, tatweelMode, normAlif, maqToYa, taToHa, removeFootnotes, lettersSpacesOnly, stripNoise, lettersOnly, collapseWS, doTrim } = options;
143
+ /**
144
+ * NFC Normalization (Fast Path)
145
+ *
146
+ * `String.prototype.normalize('NFC')` is extremely expensive under high throughput.
147
+ * For Arabic OCR text, the main canonical compositions we care about are:
148
+ * - ا + ◌ٓ (U+0653) → آ
149
+ * - ا + ◌ٔ (U+0654) → أ
150
+ * - ا + ◌ٕ (U+0655) → إ
151
+ * - و + ◌ٔ (U+0654) → ؤ
152
+ * - ي + ◌ٔ (U+0654) → ئ
153
+ *
154
+ * We implement these compositions inline during the main loop, avoiding full NFC
155
+ * normalization in the common case while preserving behavior needed by our sanitizer.
156
+ */
157
+ const text = input;
158
+ const len = text.length;
159
+ if (len > sharedBuffer.length) sharedBuffer = new Uint16Array(len + 1024);
160
+ const buffer = sharedBuffer;
161
+ let bufIdx = 0;
162
+ let lastWasSpace = false;
163
+ let start = 0;
164
+ if (doTrim) while (start < len && text.charCodeAt(start) <= 32) start++;
165
+ for (let i = start; i < len; i++) {
166
+ const code = text.charCodeAt(i);
167
+ if (code <= 32) {
168
+ if (lettersOnly) continue;
169
+ if (collapseWS) {
170
+ if (!lastWasSpace && bufIdx > 0) {
171
+ buffer[bufIdx++] = CHAR_SPACE;
172
+ lastWasSpace = true;
173
+ }
174
+ } else {
175
+ buffer[bufIdx++] = CHAR_SPACE;
176
+ lastWasSpace = false;
177
+ }
178
+ continue;
179
+ }
180
+ if (nfc) {
181
+ if (code === CHAR_MADDA_ABOVE || code === CHAR_HAMZA_ABOVE_MARK || code === CHAR_HAMZA_BELOW_MARK) {
182
+ const prevIdx = bufIdx - 1;
183
+ if (prevIdx >= 0) {
184
+ const prev = buffer[prevIdx];
185
+ let composed = 0;
186
+ if (prev === CHAR_ALIF) if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
187
+ else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
188
+ else composed = CHAR_ALIF_HAMZA_BELOW;
189
+ else if (code === CHAR_HAMZA_ABOVE_MARK) {
190
+ if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
191
+ else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
192
+ }
193
+ if (composed !== 0) {
194
+ buffer[prevIdx] = composed;
195
+ continue;
196
+ }
197
+ }
198
+ }
199
+ }
200
+ if (stripZW && isZeroWidth(code)) {
201
+ if (zwAsSpace) if (collapseWS) {
202
+ if (!lastWasSpace && bufIdx > 0) {
203
+ buffer[bufIdx++] = CHAR_SPACE;
204
+ lastWasSpace = true;
205
+ }
206
+ } else {
207
+ buffer[bufIdx++] = CHAR_SPACE;
208
+ lastWasSpace = false;
209
+ }
210
+ continue;
211
+ }
212
+ if (removeHijri && code === CHAR_HA) {
213
+ let nextIdx = i + 1;
214
+ if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL) nextIdx++;
215
+ let isBoundary = false;
216
+ if (nextIdx >= len) isBoundary = true;
217
+ else {
218
+ const nextCode = text.charCodeAt(nextIdx);
219
+ if (nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45) isBoundary = true;
220
+ }
221
+ if (isBoundary) {
222
+ let backIdx = i - 1;
223
+ while (backIdx >= 0) {
224
+ const c = text.charCodeAt(backIdx);
225
+ if (c <= 32 || isZeroWidth(c)) backIdx--;
226
+ else break;
227
+ }
228
+ if (backIdx >= 0 && isDigit(text.charCodeAt(backIdx))) {
229
+ if (nextIdx > i + 1) i++;
230
+ continue;
231
+ }
232
+ }
233
+ }
234
+ if (removeDia && isDiacritic(code)) continue;
235
+ if (code === CHAR_TATWEEL) {
236
+ if (tatweelMode === "all") continue;
237
+ if (tatweelMode === "safe") {
238
+ let backIdx = bufIdx - 1;
239
+ while (backIdx >= 0 && buffer[backIdx] === CHAR_SPACE) backIdx--;
240
+ if (backIdx >= 0) {
241
+ const prev = buffer[backIdx];
242
+ if (isDigit(prev) || prev === CHAR_HA) {} else continue;
243
+ } else continue;
244
+ }
245
+ }
246
+ if (stripNoise && !lettersSpacesOnly && !lettersOnly) {
247
+ if (isLatinOrDigit(code) || isSymbol(code)) {
248
+ if (collapseWS) {
249
+ if (!lastWasSpace && bufIdx > 0) {
250
+ buffer[bufIdx++] = CHAR_SPACE;
251
+ lastWasSpace = true;
252
+ }
253
+ } else {
254
+ buffer[bufIdx++] = CHAR_SPACE;
255
+ lastWasSpace = false;
256
+ }
257
+ continue;
258
+ }
259
+ if (code === 47 && i + 1 < len && text.charCodeAt(i + 1) === 47) {
260
+ while (i + 1 < len && text.charCodeAt(i + 1) === 47) i++;
261
+ if (collapseWS) {
262
+ if (!lastWasSpace && bufIdx > 0) {
263
+ buffer[bufIdx++] = CHAR_SPACE;
264
+ lastWasSpace = true;
265
+ }
266
+ } else {
267
+ buffer[bufIdx++] = CHAR_SPACE;
268
+ lastWasSpace = false;
269
+ }
270
+ continue;
271
+ }
272
+ }
273
+ if (removeFootnotes && !lettersSpacesOnly && !lettersOnly && code === 40) {
274
+ let nextIdx = i + 1;
275
+ if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
276
+ if (nextIdx < len) {
277
+ const c1 = text.charCodeAt(nextIdx);
278
+ if (c1 === 172) {
279
+ nextIdx++;
280
+ let hasDigits = false;
281
+ while (nextIdx < len) {
282
+ const c = text.charCodeAt(nextIdx);
283
+ if (c >= 1632 && c <= 1641) {
284
+ hasDigits = true;
285
+ nextIdx++;
286
+ } else break;
287
+ }
288
+ if (hasDigits && nextIdx < len) {
289
+ if (text.charCodeAt(nextIdx) === 41) {
290
+ i = nextIdx;
291
+ if (collapseWS) {
292
+ if (!lastWasSpace && bufIdx > 0) {
293
+ buffer[bufIdx++] = CHAR_SPACE;
294
+ lastWasSpace = true;
295
+ }
296
+ } else {
297
+ buffer[bufIdx++] = CHAR_SPACE;
298
+ lastWasSpace = false;
299
+ }
300
+ continue;
301
+ }
302
+ if (text.charCodeAt(nextIdx) === CHAR_SPACE) {
303
+ nextIdx++;
304
+ if (nextIdx < len && text.charCodeAt(nextIdx) === 41) {
305
+ i = nextIdx;
306
+ if (collapseWS) {
307
+ if (!lastWasSpace && bufIdx > 0) {
308
+ buffer[bufIdx++] = CHAR_SPACE;
309
+ lastWasSpace = true;
310
+ }
311
+ } else {
312
+ buffer[bufIdx++] = CHAR_SPACE;
313
+ lastWasSpace = false;
314
+ }
315
+ continue;
316
+ }
317
+ }
318
+ }
319
+ } else if (c1 >= 1632 && c1 <= 1641) {
320
+ let tempIdx = nextIdx + 1;
321
+ let matched = false;
322
+ if (tempIdx < len) {
323
+ const c2 = text.charCodeAt(tempIdx);
324
+ if (c2 === 41) {
325
+ matched = true;
326
+ tempIdx++;
327
+ } else if (c2 === CHAR_SPACE) {
328
+ tempIdx++;
329
+ if (tempIdx < len) {
330
+ const c3 = text.charCodeAt(tempIdx);
331
+ if (c3 >= 1536 && c3 <= 1791) {
332
+ tempIdx++;
333
+ if (tempIdx < len && text.charCodeAt(tempIdx) === 41) {
334
+ matched = true;
335
+ tempIdx++;
336
+ }
337
+ }
338
+ }
339
+ }
340
+ }
341
+ if (matched) {
342
+ i = tempIdx - 1;
343
+ if (collapseWS) {
344
+ if (!lastWasSpace && bufIdx > 0) {
345
+ buffer[bufIdx++] = CHAR_SPACE;
346
+ lastWasSpace = true;
347
+ }
348
+ } else {
349
+ buffer[bufIdx++] = CHAR_SPACE;
350
+ lastWasSpace = false;
351
+ }
352
+ continue;
353
+ }
354
+ }
355
+ }
356
+ }
357
+ if (lettersSpacesOnly || lettersOnly) {
358
+ if (!isArabicLetter(code)) {
359
+ if (lettersOnly) continue;
360
+ if (collapseWS) {
361
+ if (!lastWasSpace && bufIdx > 0) {
362
+ buffer[bufIdx++] = CHAR_SPACE;
363
+ lastWasSpace = true;
364
+ }
365
+ } else {
366
+ buffer[bufIdx++] = CHAR_SPACE;
367
+ lastWasSpace = false;
368
+ }
369
+ continue;
370
+ }
371
+ let outCode$1 = code;
372
+ if (normAlif) {
373
+ if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode$1 = CHAR_ALIF;
374
+ }
375
+ if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode$1 = CHAR_YA;
376
+ if (taToHa && code === CHAR_TA_MARBUTAH) outCode$1 = CHAR_HA;
377
+ buffer[bufIdx++] = outCode$1;
378
+ lastWasSpace = false;
379
+ continue;
380
+ }
381
+ let outCode = code;
382
+ if (normAlif) {
383
+ if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode = CHAR_ALIF;
384
+ }
385
+ if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode = CHAR_YA;
386
+ if (taToHa && code === CHAR_TA_MARBUTAH) outCode = CHAR_HA;
387
+ buffer[bufIdx++] = outCode;
388
+ lastWasSpace = false;
389
+ }
390
+ if (doTrim && lastWasSpace && bufIdx > 0) bufIdx--;
391
+ if (bufIdx === 0) return "";
392
+ const resultView = buffer.subarray(0, bufIdx);
393
+ return decoder.decode(resultView);
394
+ };
395
/**
 * Resolves a preset name or a custom options object into the full set of
 * flags consumed by the sanitizer core. Resolving once up front lets batch
 * callers reuse the flags across many inputs.
 *
 * @param optionsOrPreset - Preset name, or an options object with optional `base`.
 * @returns Object of resolved boolean flags plus the tatweel mode.
 */
const resolveOptions = (optionsOrPreset) => {
  const isPresetName = typeof optionsOrPreset === "string";
  const opts = isPresetName ? null : optionsOrPreset;
  let preset;
  if (isPresetName) {
    preset = PRESETS[optionsOrPreset];
  } else {
    // Custom options layer on top of a base preset ("light" by default).
    const base = optionsOrPreset.base ?? "light";
    preset = base === "none" ? PRESET_NONE : PRESETS[base];
  }
  // Per-option resolution: explicit user value wins over the preset default.
  const pick = (key) => resolveBoolean(preset[key], opts?.[key]);
  return {
    collapseWS: pick("collapseWhitespace"),
    doTrim: pick("trim"),
    lettersOnly: pick("keepOnlyArabicLetters"),
    lettersSpacesOnly: pick("lettersAndSpacesOnly"),
    maqToYa: pick("replaceAlifMaqsurah"),
    nfc: pick("nfc"),
    normAlif: pick("normalizeAlif"),
    removeDia: pick("stripDiacritics"),
    removeFootnotes: pick("stripFootnotes"),
    removeHijri: pick("removeHijriMarker"),
    stripNoise: pick("stripLatinAndSymbols"),
    stripZW: pick("stripZeroWidth"),
    taToHa: pick("replaceTaMarbutahWithHa"),
    tatweelMode: resolveTatweelMode(preset.stripTatweel, opts?.stripTatweel),
    zwAsSpace: pick("zeroWidthToSpace")
  };
};
426
/**
 * Builds a reusable sanitizer closure whose options are resolved exactly once.
 * Ideal for hot paths that sanitize many strings with identical settings.
 *
 * @example
 * ```ts
 * const sanitize = createArabicSanitizer('search');
 * const results = texts.map(sanitize);
 * ```
 */
const createArabicSanitizer = (optionsOrPreset = "search") => {
  const flags = resolveOptions(optionsOrPreset);
  return (input) => applySanitization(input, flags);
};
441
/**
 * Sanitizes Arabic text using a preset name or custom options.
 * Accepts either a single string or an array of strings; for arrays the
 * options are resolved once and reused across every element.
 *
 * @param input - String (or array of strings) to sanitize.
 * @param optionsOrPreset - Preset name or options object (default "search").
 * @returns Sanitized string, or array of sanitized strings for array input.
 */
function sanitizeArabic(input, optionsOrPreset = "search") {
  if (Array.isArray(input)) {
    if (input.length === 0) return [];
    const flags = resolveOptions(optionsOrPreset);
    return input.map((text) => applySanitization(text, flags));
  }
  // Falsy scalar input (empty string, null, undefined) short-circuits.
  if (!input) return "";
  return applySanitization(input, resolveOptions(optionsOrPreset));
}
452
+
453
+ //#endregion
454
+ //#region src/utils/levenshthein.ts
455
/**
 * Calculates the Levenshtein distance between two strings using
 * space-optimized dynamic programming (two rolling rows). The distance is
 * the minimum number of single-character insertions, deletions, or
 * substitutions needed to turn one string into the other.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Minimum edit distance between the two strings
 * @complexity Time: O(m*n), Space: O(min(m,n))
 * @example
 * calculateLevenshteinDistance('kitten', 'sitting') // 3
 * calculateLevenshteinDistance('', 'hello') // 5
 */
const calculateLevenshteinDistance = (textA, textB) => {
  if (textA.length === 0) return textB.length;
  if (textB.length === 0) return textA.length;
  // Roll along the shorter string so each row is as small as possible.
  const shorter = textA.length <= textB.length ? textA : textB;
  const longer = textA.length <= textB.length ? textB : textA;
  const width = shorter.length;
  let previousRow = [];
  for (let j = 0; j <= width; j++) previousRow.push(j);
  for (let i = 1; i <= longer.length; i++) {
    const currentRow = [i];
    for (let j = 1; j <= width; j++) {
      const substitutionCost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      const deletion = previousRow[j] + 1;
      const insertion = currentRow[j - 1] + 1;
      const substitution = previousRow[j - 1] + substitutionCost;
      currentRow.push(Math.min(deletion, insertion, substitution));
    }
    previousRow = currentRow;
  }
  return previousRow[width];
};
488
/**
 * Early exit check for bounded Levenshtein distance.
 * Returns maxDist + 1 when the length difference alone exceeds the bound,
 * the exact distance for trivial empty-string cases, or null to continue
 * with the full banded computation.
 */
const shouldEarlyExit = (a, b, maxDist) => {
  if (Math.abs(a.length - b.length) > maxDist) return maxDist + 1;
  if (a.length === 0) return b.length <= maxDist ? b.length : maxDist + 1;
  if (b.length === 0) return a.length <= maxDist ? a.length : maxDist + 1;
  return null;
};
/**
 * Initializes the two rolling rows for bounded Levenshtein calculation.
 * Uses Int32Array (was Int16Array): the first row is seeded with 0..m, so
 * Int16Array silently overflowed for strings longer than 32767 characters
 * and produced wrong distances. Int32Array removes that limit.
 */
const initializeBoundedArrays = (m) => {
  const prev = new Int32Array(m + 1);
  const curr = new Int32Array(m + 1);
  for (let j = 0; j <= m; j++) prev[j] = j;
  return [prev, curr];
};
/**
 * Calculates the [from, to] column window for row i — only cells within
 * maxDist of the diagonal can hold a distance <= maxDist.
 */
const getRowBounds = (i, maxDist, m) => ({
  from: Math.max(1, i - maxDist),
  to: Math.min(m, i + maxDist)
});
/**
 * Processes a single cell in the bounded Levenshtein matrix:
 * minimum of deletion, insertion, and substitution costs.
 */
const processBoundedCell = (a, b, i, j, prev, curr) => {
  const cost = a[i - 1] === b[j - 1] ? 0 : 1;
  const del = prev[j] + 1;
  const ins = curr[j - 1] + 1;
  const sub = prev[j - 1] + cost;
  return Math.min(del, ins, sub);
};
/**
 * Fills row i of the banded DP matrix and returns the row minimum, which
 * the caller uses for early termination. Cells outside the band are set to
 * maxDist + 1 so they can never win a min() in the next row.
 */
const processBoundedRow = (a, b, i, maxDist, prev, curr) => {
  const m = b.length;
  const big = maxDist + 1;
  const { from, to } = getRowBounds(i, maxDist, m);
  curr[0] = i;
  let rowMin = i;
  for (let j = 1; j < from; j++) curr[j] = big;
  for (let j = to + 1; j <= m; j++) curr[j] = big;
  for (let j = from; j <= to; j++) {
    const val = processBoundedCell(a, b, i, j, prev, curr);
    curr[j] = val;
    if (val < rowMin) rowMin = val;
  }
  return rowMin;
};
/**
 * Calculates Levenshtein distance capped at maxDist with early termination.
 * Returns the exact distance when it is <= maxDist, otherwise maxDist + 1.
 * Only a band of width 2*maxDist+1 is computed per row, and computation
 * stops as soon as an entire row exceeds the bound.
 */
const boundedLevenshtein = (a, b, maxDist) => {
  const big = maxDist + 1;
  const earlyResult = shouldEarlyExit(a, b, maxDist);
  if (earlyResult !== null) return earlyResult;
  // Iterate over the shorter string so the rows cover the longer one.
  if (a.length > b.length) return boundedLevenshtein(b, a, maxDist);
  let [prev, curr] = initializeBoundedArrays(b.length);
  for (let i = 1; i <= a.length; i++) {
    if (processBoundedRow(a, b, i, maxDist, prev, curr) > maxDist) return big;
    const tmp = prev;
    prev = curr;
    curr = tmp;
  }
  return prev[b.length] <= maxDist ? prev[b.length] : big;
};
559
+
560
+ //#endregion
561
+ //#region src/utils/similarity.ts
562
// Scoring constants for Needleman-Wunsch token alignment (higher = better).
const ALIGNMENT_SCORES = {
  GAP_PENALTY: -1, // cost of aligning a token against a gap
  MISMATCH_PENALTY: -2, // cost of pairing two dissimilar tokens
  PERFECT_MATCH: 2, // reward when normalized tokens are identical
  SOFT_MATCH: 1 // reward for typo symbols or highly similar tokens
};
568
/**
 * Similarity ratio between two strings in [0.0, 1.0]: Levenshtein distance
 * normalized by the length of the longer string. 1.0 means identical,
 * 0.0 means completely different. Two empty strings are identical (1.0).
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Similarity ratio from 0.0 to 1.0
 * @example
 * calculateSimilarity('hello', 'hello') // 1.0
 * calculateSimilarity('hello', 'help') // 0.6
 */
const calculateSimilarity = (textA, textB) => {
  const longest = Math.max(textA.length, textB.length);
  if (longest === 0) return 1;
  const distance = calculateLevenshteinDistance(textA, textB);
  return (longest - distance) / longest;
};
584
/**
 * Checks whether two texts are similar after Arabic sanitization.
 * Both inputs are normalized with the default sanitizer settings before
 * their similarity is compared against the threshold.
 *
 * @param textA - First text to compare
 * @param textB - Second text to compare
 * @param threshold - Minimum similarity ratio required (default 0.6)
 * @returns True when the normalized texts meet the threshold
 * @example
 * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // true
 */
const areSimilarAfterNormalization = (textA, textB, threshold = 0.6) => {
  const normalizedA = sanitizeArabic(textA);
  const normalizedB = sanitizeArabic(textB);
  return calculateSimilarity(normalizedA, normalizedB) >= threshold;
};
599
/**
 * Scores one token pair for sequence alignment. A perfect match after
 * normalization earns the top score; a typo symbol on either side, or a
 * similarity at/above the threshold, earns the soft-match score; anything
 * else is penalized as a mismatch.
 *
 * @param tokenA - First token to score
 * @param tokenB - Second token to score
 * @param typoSymbols - Symbols that get preferential (soft-match) treatment
 * @param similarityThreshold - Similarity needed for a soft match
 * @returns Alignment score (higher is a better match)
 */
const calculateAlignmentScore = (tokenA, tokenB, typoSymbols, similarityThreshold) => {
  const normalizedA = sanitizeArabic(tokenA);
  const normalizedB = sanitizeArabic(tokenB);
  if (normalizedA === normalizedB) return ALIGNMENT_SCORES.PERFECT_MATCH;
  if (typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB)) return ALIGNMENT_SCORES.SOFT_MATCH;
  if (calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold) return ALIGNMENT_SCORES.SOFT_MATCH;
  return ALIGNMENT_SCORES.MISMATCH_PENALTY;
};
621
/**
 * Reconstructs the optimal alignment by walking the scoring matrix backwards
 * from the bottom-right corner, following the direction recorded in each cell
 * (Needleman-Wunsch traceback).
 *
 * @param matrix - Scoring matrix with per-cell directions
 * @param tokensA - First sequence of tokens
 * @param tokensB - Second sequence of tokens
 * @returns Aligned token pairs in order; null marks a gap on that side
 * @throws Error if a cell carries an invalid direction
 */
const backtrackAlignment = (matrix, tokensA, tokensB) => {
  const alignment = [];
  let i = tokensA.length;
  let j = tokensB.length;
  while (i > 0 || j > 0) {
    const { direction } = matrix[i][j];
    if (direction === "diagonal") {
      i--;
      j--;
      alignment.push([tokensA[i], tokensB[j]]);
    } else if (direction === "left") {
      j--;
      alignment.push([null, tokensB[j]]);
    } else if (direction === "up") {
      i--;
      alignment.push([tokensA[i], null]);
    } else {
      throw new Error("Invalid alignment direction");
    }
  }
  // Pairs were collected end-to-start; restore forward order.
  return alignment.reverse();
};
650
/**
 * Builds the (lengthA+1) x (lengthB+1) scoring matrix and seeds the first
 * row and column with cumulative gap penalties, so an alignment may start
 * with any number of leading gaps.
 *
 * @param lengthA - Length of the first token sequence
 * @param lengthB - Length of the second token sequence
 * @returns Matrix of { direction, score } cells ready for filling
 */
const initializeScoringMatrix = (lengthA, lengthB) => {
  const matrix = [];
  for (let i = 0; i <= lengthA; i++) {
    const row = [];
    for (let j = 0; j <= lengthB; j++) row.push({ direction: null, score: 0 });
    matrix.push(row);
  }
  for (let i = 1; i <= lengthA; i++) {
    matrix[i][0] = { direction: "up", score: i * ALIGNMENT_SCORES.GAP_PENALTY };
  }
  for (let j = 1; j <= lengthB; j++) {
    matrix[0][j] = { direction: "left", score: j * ALIGNMENT_SCORES.GAP_PENALTY };
  }
  return matrix;
};
672
/**
 * Picks the best direction and score for one matrix cell. Ties are broken
 * in the fixed order diagonal > up > left, matching the fill order of the
 * original Needleman-Wunsch implementation.
 *
 * @param diagonalScore - Score for aligning the two tokens
 * @param upScore - Score for a gap in the second sequence
 * @param leftScore - Score for a gap in the first sequence
 * @returns The winning { direction, score } pair
 */
const getBestAlignment = (diagonalScore, upScore, leftScore) => {
  const score = Math.max(diagonalScore, upScore, leftScore);
  let direction = "left";
  if (score === diagonalScore) direction = "diagonal";
  else if (score === upScore) direction = "up";
  return { direction, score };
};
695
/**
 * Global sequence alignment via the Needleman-Wunsch algorithm.
 * Finds the pairing of the two token sequences that maximizes the total
 * alignment score, handling insertions, deletions, and substitutions.
 * Tokens are sanitized once up front so the inner loop avoids repeated
 * normalization work.
 *
 * @param tokensA - First sequence of tokens to align
 * @param tokensB - Second sequence of tokens to align
 * @param typoSymbols - Symbols that receive soft-match scoring
 * @param similarityThreshold - Similarity needed for a soft match
 * @returns Aligned token pairs, with null indicating gaps
 * @example
 * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
 * // [['a', 'a'], ['b', 'c']]
 */
const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold) => {
  const lengthA = tokensA.length;
  const lengthB = tokensB.length;
  const matrix = initializeScoringMatrix(lengthA, lengthB);
  const typoSymbolsSet = new Set(typoSymbols);
  const normalizedA = tokensA.map((t) => sanitizeArabic(t));
  const normalizedB = tokensB.map((t) => sanitizeArabic(t));
  for (let i = 1; i <= lengthA; i++) {
    for (let j = 1; j <= lengthB; j++) {
      const aNorm = normalizedA[i - 1];
      const bNorm = normalizedB[j - 1];
      let pairScore;
      if (aNorm === bNorm) {
        pairScore = ALIGNMENT_SCORES.PERFECT_MATCH;
      } else if (typoSymbolsSet.has(tokensA[i - 1]) || typoSymbolsSet.has(tokensB[j - 1])) {
        pairScore = ALIGNMENT_SCORES.SOFT_MATCH;
      } else if (calculateSimilarity(aNorm, bNorm) >= similarityThreshold) {
        pairScore = ALIGNMENT_SCORES.SOFT_MATCH;
      } else {
        pairScore = ALIGNMENT_SCORES.MISMATCH_PENALTY;
      }
      matrix[i][j] = getBestAlignment(
        matrix[i - 1][j - 1].score + pairScore,
        matrix[i - 1][j].score + ALIGNMENT_SCORES.GAP_PENALTY,
        matrix[i][j - 1].score + ALIGNMENT_SCORES.GAP_PENALTY
      );
    }
  }
  return backtrackAlignment(matrix, tokensA, tokensB);
};
734
+
735
+ //#endregion
736
+ //#region src/alignment.ts
737
/**
 * Aligns split text segments to target lines, merging segments back together
 * in the order that best matches each target. Falsy target entries consume
 * the next segment unchanged; any segments left over after all targets are
 * processed are appended verbatim.
 *
 * @param targetLines - Array where each element is either a string to align against, or falsy to pass a segment through
 * @param segmentLines - Text segments that may be split versions of target lines
 * @returns Array of aligned text lines
 */
const alignTextSegments = (targetLines, segmentLines) => {
  const alignedLines = [];
  let segmentIndex = 0;
  for (const targetLine of targetLines) {
    if (segmentIndex >= segmentLines.length) break;
    if (!targetLine) {
      // No target to match against: pass the segment through untouched.
      alignedLines.push(segmentLines[segmentIndex]);
      segmentIndex += 1;
      continue;
    }
    const { result, segmentsConsumed } = processAlignmentTarget(targetLine, segmentLines, segmentIndex);
    if (result) alignedLines.push(result);
    segmentIndex += segmentsConsumed;
  }
  const leftover = segmentLines.slice(segmentIndex);
  if (leftover.length > 0) alignedLines.push(...leftover);
  return alignedLines;
};
766
/**
 * Merges two candidate segments in both possible orders and keeps whichever
 * order is more similar to the target line after Arabic normalization.
 * On a tie, the forward (A then B) order wins.
 *
 * @param targetLine - The line being reconstructed
 * @param partA - First candidate segment
 * @param partB - Second candidate segment
 * @returns The better-matching merged string
 */
const findBestSegmentMerge = (targetLine, partA, partB) => {
  const normalizedTarget = sanitizeArabic(targetLine);
  const forward = `${partA} ${partB}`;
  const reversed = `${partB} ${partA}`;
  const forwardScore = calculateSimilarity(normalizedTarget, sanitizeArabic(forward));
  const reversedScore = calculateSimilarity(normalizedTarget, sanitizeArabic(reversed));
  return forwardScore >= reversedScore ? forward : reversed;
};
780
/**
 * Processes one target line: if the current segment already matches it,
 * consume that segment alone; otherwise try merging the next two segments.
 * Degenerate cases (missing segments) fall back to whatever is available.
 *
 * @param targetLine - The line to align to
 * @param segmentLines - All available text segments
 * @param segmentIndex - Current position within segmentLines
 * @returns { result, segmentsConsumed } - aligned text and how many segments were used
 */
const processAlignmentTarget = (targetLine, segmentLines, segmentIndex) => {
  const currentSegment = segmentLines[segmentIndex];
  if (areSimilarAfterNormalization(targetLine, currentSegment)) {
    return { result: currentSegment, segmentsConsumed: 1 };
  }
  const partA = currentSegment;
  const partB = segmentLines[segmentIndex + 1];
  if (!partA) return { result: "", segmentsConsumed: 0 };
  if (!partB) return { result: partA, segmentsConsumed: 1 };
  return {
    result: findBestSegmentMerge(targetLine, partA, partB),
    segmentsConsumed: 2
  };
};
808
+
809
+ //#endregion
810
+ //#region src/balance.ts
811
/**
 * Checks whether all double quotes in a string are balanced.
 * Quotes are balanced when their total count is even; with an odd count,
 * the last quote encountered is reported as unmatched.
 *
 * @param str - The string to check for quote balance
 * @returns { errors, isBalanced } - any unmatched-quote errors and the overall status
 *
 * @example
 * ```typescript
 * checkQuoteBalance('Hello "world"') // { errors: [], isBalanced: true }
 * checkQuoteBalance('Hello "world') // { errors: [{ char: '"', index: 6, reason: 'unmatched', type: 'quote' }], isBalanced: false }
 * ```
 */
const checkQuoteBalance = (str) => {
  const errors = [];
  let quoteCount = 0;
  let lastQuoteIndex = -1;
  for (let i = 0; i < str.length; i++) {
    if (str[i] !== "\"") continue;
    quoteCount += 1;
    lastQuoteIndex = i;
  }
  const balanced = quoteCount % 2 === 0;
  if (!balanced && lastQuoteIndex >= 0) {
    errors.push({
      char: "\"",
      index: lastQuoteIndex,
      reason: "unmatched",
      type: "quote"
    });
  }
  return { errors, isBalanced: balanced };
};
847
/** Mapping of opening brackets to their corresponding closing brackets */
const BRACKETS = {
  "«": "»",
  "(": ")",
  "[": "]",
  "{": "}"
};
/** Set of all opening bracket characters (the keys of BRACKETS) */
const OPEN_BRACKETS = new Set(Object.keys(BRACKETS));
/** Set of all closing bracket characters (the values of BRACKETS) */
const CLOSE_BRACKETS = new Set(Object.values(BRACKETS));
868
/**
 * Checks whether all brackets in a string are properly balanced and nested.
 *
 * A string is balanced when every opening bracket has a matching closing
 * bracket, pairs do not cross, and each closer matches the most recent
 * unmatched opener. Supported pairs: (), [], {}, «».
 *
 * Error reasons: "unmatched" (closer with no opener), "mismatched" (closer
 * paired with the wrong opener — both characters are reported), and
 * "unclosed" (opener left on the stack at end of input).
 *
 * @param str - The string to check for bracket balance
 * @returns { errors, isBalanced } - all bracket errors and the overall status
 *
 * @example
 * ```typescript
 * checkBracketBalance('(hello [world])') // { errors: [], isBalanced: true }
 * checkBracketBalance('(hello [world)') // { errors: [{ char: '[', index: 7, reason: 'unclosed', type: 'bracket' }], isBalanced: false }
 * ```
 */
const checkBracketBalance = (str) => {
  const errors = [];
  const stack = [];
  for (let i = 0; i < str.length; i++) {
    const char = str[i];
    if (OPEN_BRACKETS.has(char)) {
      stack.push({ char, index: i });
      continue;
    }
    if (!CLOSE_BRACKETS.has(char)) continue;
    const lastOpen = stack.pop();
    if (!lastOpen) {
      errors.push({ char, index: i, reason: "unmatched", type: "bracket" });
    } else if (BRACKETS[lastOpen.char] !== char) {
      // Report both halves of the crossing pair.
      errors.push({
        char: lastOpen.char,
        index: lastOpen.index,
        reason: "mismatched",
        type: "bracket"
      });
      errors.push({ char, index: i, reason: "mismatched", type: "bracket" });
    }
  }
  // Anything still on the stack was opened but never closed.
  for (const { char, index } of stack) {
    errors.push({ char, index, reason: "unclosed", type: "bracket" });
  }
  return { errors, isBalanced: errors.length === 0 };
};
934
/**
 * Combined quote and bracket balance check. Merges the errors from both
 * checkers, sorted by character position, and reports balanced only when
 * both individual checks pass.
 *
 * @param str - The string to check for overall balance
 * @returns { errors, isBalanced } - all errors sorted by index and the combined status
 *
 * @example
 * ```typescript
 * checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
 * checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
 * ```
 */
const checkBalance = (str) => {
  const quoteResult = checkQuoteBalance(str);
  const bracketResult = checkBracketBalance(str);
  const errors = quoteResult.errors.concat(bracketResult.errors);
  errors.sort((a, b) => a.index - b.index);
  return {
    errors,
    isBalanced: quoteResult.isBalanced && bracketResult.isBalanced
  };
};
958
/**
 * Collects character-level balance errors for multi-line text with absolute
 * positions. Each line is checked independently, but only lines longer than
 * 10 characters are examined. The absolute index accounts for the newline
 * between lines, so positions map directly onto the original string — useful
 * for editors and highlighters.
 *
 * @param text - The multi-line text to analyze
 * @returns Array of { absoluteIndex, char, reason, type } errors
 *
 * @example
 * ```typescript
 * const errors = getUnbalancedErrors('Line 1 with "quote\nLine 2 with (bracket');
 * // absoluteIndex points at the exact offending characters
 * ```
 */
const getUnbalancedErrors = (text) => {
  const characterErrors = [];
  const lines = text.split("\n");
  let absoluteIndex = 0;
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    const line = lines[lineIndex];
    // Very short lines are skipped: they are too small to check meaningfully.
    if (line.length > 10) {
      const balanceResult = checkBalance(line);
      if (!balanceResult.isBalanced) {
        for (const { index, char, reason, type } of balanceResult.errors) {
          characterErrors.push({
            absoluteIndex: absoluteIndex + index,
            char,
            reason,
            type
          });
        }
      }
    }
    // +1 for the "\n" separator on every line except the last.
    absoluteIndex += line.length + (lineIndex < lines.length - 1 ? 1 : 0);
  }
  return characterErrors;
};
998
/**
 * Convenience wrapper: true when all double quotes in the string are balanced.
 *
 * @example
 * ```typescript
 * areQuotesBalanced('Hello "world"') // true
 * areQuotesBalanced('Hello "world') // false
 * ```
 */
const areQuotesBalanced = (str) => checkQuoteBalance(str).isBalanced;
/**
 * Convenience wrapper: true when all brackets ((), [], {}, «») are balanced.
 *
 * @example
 * ```typescript
 * areBracketsBalanced('(hello [world])') // true
 * areBracketsBalanced('(hello [world') // false
 * ```
 */
const areBracketsBalanced = (str) => checkBracketBalance(str).isBalanced;
/**
 * Convenience wrapper: true when both quotes and brackets are balanced.
 *
 * @example
 * ```typescript
 * isBalanced('Hello "world" and (test)') // true
 * isBalanced('Hello "world and (test') // false
 * ```
 */
const isBalanced = (str) => checkBalance(str).isBalanced;
1052
+
1053
+ //#endregion
1054
+ //#region src/utils/textUtils.ts
1055
// Canonical "intahā" end-of-quotation marker (alif + ha + tatweel).
const INTAHA_ACTUAL = "اهـ";
/**
 * Collection of regex patterns used throughout the library for text processing
 */
// NOTE(review): the *Regex entries carry the /g flag; .test/.exec on them is
// stateful via lastIndex — callers appear to use them with replace/match.
const PATTERNS = {
  // Any character in the main Arabic Unicode blocks, supplements, and presentation forms.
  arabicCharacters: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,
  // One run of Western (0-9) or Arabic-Indic (٠-٩) digits.
  arabicDigits: /[0-9\u0660-\u0669]+/,
  // Footnote reference "(٠-٩...)" at the very start of a line.
  arabicFootnoteReferenceRegex: /^\([\u0660-\u0669]+\)/g,
  // Runs of Arabic letters mixed with Western or Arabic-Indic digits.
  arabicLettersAndDigits: /[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,
  // Whitespace or Arabic comma/semicolon/question mark/full stop.
  arabicPunctuationAndWhitespace: /[\s\u060C\u061B\u061F\u06D4]+/,
  // Parenthesized Arabic-Indic digits anywhere in the text.
  arabicReferenceRegex: /\([\u0660-\u0669]+\)/g,
  // Parenthesized digits (either script) embedded in a token.
  footnoteEmbedded: /\([0-9\u0660-\u0669]+\)/,
  // A whole token that is just digits, optionally parenthesized, optionally
  // followed by an Arabic comma or a period.
  footnoteStandalone: /^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,
  // Empty parens or OCR-confused digit look-alikes (. 1 O V 9) in parens.
  invalidReferenceRegex: /\(\)|\([.1OV9]+\)/g,
  // OCR-confused footnote reference at the start of a line.
  ocrConfusedFootnoteReferenceRegex: /^\([.1OV9]+\)/g,
  // OCR-confused reference anywhere in the text.
  ocrConfusedReferenceRegex: /\([.1OV9]+\)/g,
  // Generic whitespace run, used for tokenization.
  whitespace: /\s+/
};
1073
/**
 * Extracts the first run of Arabic-Indic or Western digits from text.
 * Used primarily for footnote number comparison to match related footnote
 * elements.
 *
 * @param text - Text containing digits to extract
 * @returns First digit sequence found, or empty string if none found
 * @example
 * extractDigits('(٥)أخرجه البخاري') // Returns '٥'
 * extractDigits('See note (123)') // Returns '123'
 */
const extractDigits = (text) => text.match(PATTERNS.arabicDigits)?.[0] ?? "";
1087
/**
 * Tokenizes text into individual words while preserving special symbols.
 * Pads each preserved symbol with spaces so it becomes its own token, then
 * splits on runs of whitespace.
 *
 * Symbols are matched as literal strings (not regex patterns), so symbols
 * containing regex metacharacters such as '(' or '.' are handled safely.
 * (The previous implementation built a RegExp from the raw symbol text,
 * which mis-tokenized or threw on such symbols.)
 *
 * @param text - Text to tokenize
 * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
 * @returns Array of tokens, or empty array if input is empty/whitespace
 * @example
 * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
 */
const tokenizeText = (text, preserveSymbols = []) => {
  let processedText = text;
  for (const symbol of preserveSymbols) {
    // replaceAll treats the symbol as a literal string — no regex escaping needed.
    processedText = processedText.replaceAll(symbol, ` ${symbol} `);
  }
  // Equivalent to PATTERNS.whitespace; inlined so the split is self-contained.
  return processedText.trim().split(/\s+/).filter(Boolean);
};
1106
/**
 * Handles fusion of standalone and embedded footnotes during token processing.
 * Detects patterns where a standalone footnote should be merged into an
 * embedded one, or where a trailing standalone footnote should be skipped.
 *
 * @param result - Current result array being built
 * @param previousToken - The previous token in the sequence
 * @param currentToken - The current token being processed
 * @returns True if the current token was handled (fused or skipped), false otherwise
 * @example
 * // (٥) + (٥)أخرجه → result gets (٥)أخرجه
 * // (٥)أخرجه + (٥) → (٥) is skipped
 */
const handleFootnoteFusion = (result, previousToken, currentToken) => {
  // Both fusion cases require matching footnote numbers; bail out early otherwise.
  if (extractDigits(previousToken) !== extractDigits(currentToken)) return false;
  const prevStandalone = PATTERNS.footnoteStandalone.test(previousToken);
  const currEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);
  if (prevStandalone && currEmbedded) {
    // Replace the bare reference with the richer embedded token.
    result[result.length - 1] = currentToken;
    return true;
  }
  // Embedded followed by a duplicate standalone reference: drop the duplicate.
  return PATTERNS.footnoteEmbedded.test(previousToken) && PATTERNS.footnoteStandalone.test(currentToken);
};
1133
/**
 * Handles selection logic for tokens with embedded footnotes during alignment.
 * Prefers a token containing an embedded footnote over plain text; when both
 * carry one, prefers the shorter token.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
 * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
 */
const handleFootnoteSelection = (tokenA, tokenB) => {
  const aEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);
  const bEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);
  if (aEmbedded && bEmbedded) {
    return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  }
  if (aEmbedded) return [tokenA];
  if (bEmbedded) return [tokenB];
  return null;
};
1153
/**
 * Handles selection logic for standalone footnote tokens during alignment.
 * When exactly one token is a standalone footnote, both tokens are kept with
 * the footnote first; when both are, the shorter one wins.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
 * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
 */
const handleStandaloneFootnotes = (tokenA, tokenB) => {
  const aFootnote = PATTERNS.footnoteStandalone.test(tokenA);
  const bFootnote = PATTERNS.footnoteStandalone.test(tokenB);
  if (aFootnote && bFootnote) {
    return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  }
  if (aFootnote) return [tokenA, tokenB];
  if (bFootnote) return [tokenB, tokenA];
  return null;
};
1173
/**
 * Removes simple footnote references of the form (¬[Arabic numerals]) — where
 * ¬ is the not-sign (U+00AC) — from Arabic text, then collapses any runs of
 * spaces left behind.
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text with footnote references removed and extra spaces normalized
 *
 * @example
 * ```typescript
 * removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية")
 * // Returns: "هذا النص يحتوي على حاشية"
 * ```
 */
const removeFootnoteReferencesSimple = (text) =>
  text
    .replace(/ ?\(\u00AC[\u0660-\u0669]+\) ?/g, " ")
    .replace(/ +/g, " ")
    .trim();
1189
/**
 * Removes single-digit footnote references and extended footnote formats from
 * Arabic text:
 * - ([single Arabic digit])                          e.g. (٣)
 * - ([single Arabic digit] [single Arabic letter])   e.g. (٣ م), (٥ ه), (٧ ب)
 * Runs of spaces left behind are collapsed afterwards.
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text with footnote references removed and extra spaces normalized
 *
 * @example
 * ```typescript
 * removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي")
 * // Returns: "هذا النص والآخر والثالث يحتوي على حواشي"
 * ```
 */
const removeSingleDigitFootnoteReferences = (text) =>
  text
    .replace(/ ?\([٠-٩]{1}(\s+[\u0600-\u06FF])?\) ?/g, " ")
    .replace(/ +/g, " ")
    .trim();
1207
/**
 * Standardizes a standalone Hijri year marker: a bare ه following digits is
 * rewritten as the conventional هـ (with tatweel), separated by one space.
 *
 * @param text - Input text to process
 * @returns Text with standardized Hijri symbols
 */
const standardizeHijriSymbol = (text) =>
  text.replace(
    /([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu,
    "$1 هـ"
  );
1215
/**
 * Standardizes the end-of-quotation marker: a whole-word اه is rewritten to
 * the canonical اهـ form (INTAHA_ACTUAL).
 *
 * @param text - Input text to process
 * @returns Text with standardized intaha markers
 */
const standardizeIntahaSymbol = (text) =>
  text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu, `$1${INTAHA_ACTUAL}`);
1223
+
1224
+ //#endregion
1225
+ //#region src/footnotes.ts
1226
// Literal emitted when a reference number was lost entirely.
const INVALID_FOOTNOTE = "()";
/**
 * Checks if the given text contains invalid footnote references.
 * Invalid footnotes include empty parentheses "()" or OCR-confused characters
 * like ".1OV9" that were misrecognized instead of Arabic numerals.
 *
 * @param text - Text to check for invalid footnote patterns
 * @returns True if text contains invalid footnote references, false otherwise
 * @example
 * hasInvalidFootnotes('This text has ()') // Returns true
 * hasInvalidFootnotes('This text has (١)') // Returns false
 * hasInvalidFootnotes('OCR mistake (O)') // Returns true
 */
const hasInvalidFootnotes = (text) => {
  // PATTERNS.invalidReferenceRegex carries the /g flag, which makes .test()
  // stateful: each call resumes from lastIndex, so back-to-back calls could
  // wrongly report false. Reset lastIndex to keep this check idempotent.
  PATTERNS.invalidReferenceRegex.lastIndex = 0;
  return PATTERNS.invalidReferenceRegex.test(text);
};
1242
// Shared formatter, reused across calls; the 'ar-SA' locale formats with
// Arabic-Indic digits.
const arabicFormatter = new Intl.NumberFormat("ar-SA");
/**
 * Converts a number to Arabic-Indic numerals (٠-٩) via the Intl.NumberFormat
 * API using the 'ar-SA' locale.
 *
 * @param num - The number to convert to Arabic numerals
 * @returns String representation using Arabic-Indic digits
 * @example
 * numberToArabic(123) // Returns '١٢٣'
 * numberToArabic(5) // Returns '٥'
 */
const numberToArabic = (num) => arabicFormatter.format(num);
1256
/**
 * Characters commonly mis-recognized by OCR mapped to the Arabic-Indic digit
 * they usually stand for. Hoisted to module scope (and frozen) so the table
 * is built once instead of on every call.
 */
const OCR_TO_ARABIC_DIGIT = Object.freeze({
  "1": "١",
  "9": "٩",
  ".": "٠",
  O: "٥",
  o: "٥",
  V: "٧",
  v: "٧"
});
/**
 * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.
 * Handles common OCR misrecognitions where Latin characters are mistaken for
 * Arabic digits.
 *
 * @param char - Single character that may be an OCR mistake
 * @returns Corresponding Arabic-Indic numeral, or the original character when
 *   no mapping exists
 * @example
 * ocrToArabic('O') // Returns '٥' (O often confused with ٥)
 * ocrToArabic('1') // Returns '١' (1 often confused with ١)
 * ocrToArabic('.') // Returns '٠' (dot often confused with ٠)
 */
const ocrToArabic = (char) => {
  // Object.hasOwn guards against prototype-chain hits (e.g. "constructor"),
  // which the previous inline-literal lookup did not.
  return Object.hasOwn(OCR_TO_ARABIC_DIGIT, char) ? OCR_TO_ARABIC_DIGIT[char] : char;
};
1278
/**
 * Parses the numerals in a reference string and converts them to a JavaScript
 * number. Parentheses are stripped; Arabic-Indic digits are mapped to their
 * Western equivalents; Western digits pass through unchanged; any other
 * character is ignored. (Previously an unmapped character concatenated the
 * literal text "undefined" into the digit buffer, so mixed input such as
 * '(١a٢)' parsed as 1 instead of 12.)
 *
 * @param arabicStr - String containing numerals, typically in format '(١٢٣)'
 * @returns Parsed number, or 0 if no digits could be parsed
 * @example
 * arabicToNumber('(١٢٣)') // Returns 123
 * arabicToNumber('(٥)') // Returns 5
 * arabicToNumber('invalid') // Returns 0
 */
const arabicToNumber = (arabicStr) => {
  const lookup = {
    "٠": "0",
    "١": "1",
    "٢": "2",
    "٣": "3",
    "٤": "4",
    "٥": "5",
    "٦": "6",
    "٧": "7",
    "٨": "8",
    "٩": "9"
  };
  let numStr = "";
  for (const char of arabicStr.replace(/[()]/g, "")) {
    if (Object.hasOwn(lookup, char)) {
      numStr += lookup[char];
    } else if (char >= "0" && char <= "9") {
      numStr += char; // already a Western digit
    }
    // every other character is skipped
  }
  const parsed = Number.parseInt(numStr, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
};
1308
/**
 * Extracts all footnote references from text lines, categorizing them by type
 * and location. Handles both Arabic-Indic numerals and OCR-confused characters
 * in body text and footnotes.
 *
 * @param lines - Array of text line objects with optional isFootnote flag
 * @returns Object containing categorized reference arrays:
 *   - bodyReferences: all valid references found in body text
 *   - footnoteReferences: all valid references found in footnotes
 *   - ocrConfusedInBody: OCR-confused references in body text (for tracking)
 *   - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)
 * @example
 * const lines = [
 *   { text: 'Body with (١) and (O)', isFootnote: false },
 *   { text: '(١) Footnote text', isFootnote: true }
 * ];
 * const refs = extractReferences(lines);
 * // refs.bodyReferences contains ['(١)', '(٥)'] — OCR 'O' converted to '٥'
 */
const extractReferences = (lines) => {
  const bodyLines = lines.filter((line) => !line.isFootnote);
  const footnoteLines = lines.filter((line) => line.isFootnote);
  // Gather every match of `regex` across a set of lines.
  const collect = (source, regex) => source.flatMap((line) => line.text.match(regex) || []);
  const arabicInBody = collect(bodyLines, PATTERNS.arabicReferenceRegex);
  const ocrInBody = collect(bodyLines, PATTERNS.ocrConfusedReferenceRegex);
  const arabicInFootnotes = collect(footnoteLines, PATTERNS.arabicFootnoteReferenceRegex);
  const ocrInFootnotes = collect(footnoteLines, PATTERNS.ocrConfusedFootnoteReferenceRegex);
  // Rewrite OCR-confused characters into canonical Arabic-Indic digits.
  const normalizeOcr = (ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char));
  return {
    bodyReferences: [...arabicInBody, ...ocrInBody.map(normalizeOcr)],
    footnoteReferences: [...arabicInFootnotes, ...ocrInFootnotes.map(normalizeOcr)],
    ocrConfusedInBody: ocrInBody,
    ocrConfusedInFootnotes: ocrInFootnotes
  };
};
1340
/**
 * Determines if footnote reference correction is needed by checking for:
 * 1. Invalid footnote patterns (empty parentheses, OCR mistakes)
 * 2. Mismatched sets of references between body text and footnotes
 * 3. Different counts of references in body vs footnotes
 *
 * @param lines - Array of text line objects to analyze
 * @param references - Extracted reference data from extractReferences()
 * @returns True if correction is needed, false if references are already correct
 * @example
 * const lines = [{ text: 'Text with ()', isFootnote: false }];
 * const refs = extractReferences(lines);
 * needsCorrection(lines, refs) // Returns true due to invalid "()" reference
 */
const needsCorrection = (lines, references) => {
  if (lines.some((line) => hasInvalidFootnotes(line.text))) return true;
  const bodySet = new Set(references.bodyReferences);
  const footnoteSet = new Set(references.footnoteReferences);
  if (bodySet.size !== footnoteSet.size) return true;
  // Equal sizes: the sets differ iff some body ref is missing from footnotes.
  return [...bodySet].some((ref) => !footnoteSet.has(ref));
};
1362
/**
 * Corrects footnote references in an array of text lines by:
 * 1. Converting OCR-confused characters to proper Arabic numerals
 * 2. Filling in empty "()" references with appropriate numbers
 * 3. Ensuring footnote references in body text match those in footnotes
 * 4. Generating new reference numbers when needed
 *
 * @param lines - Array of text line objects, each with optional isFootnote flag
 * @returns Array of corrected text lines with proper footnote references
 * @example
 * const lines = [
 *   { text: 'Main text with ()', isFootnote: false },
 *   { text: '() This is a footnote', isFootnote: true }
 * ];
 * const corrected = correctReferences(lines);
 * // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
 */
const correctReferences = (lines) => {
  // Fast path: references are already consistent — return input unchanged.
  if (!needsCorrection(lines, extractReferences(lines))) return lines;
  // Pass 1: canonicalize OCR-confused references ((O), (1V), ...) into
  // Arabic-Indic digits so the set comparisons below operate on uniform text.
  const sanitizedLines = lines.map((line) => {
    let updatedText = line.text;
    updatedText = updatedText.replace(/\([.1OV9]+\)/g, (match) => {
      return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));
    });
    return {
      ...line,
      text: updatedText
    };
  });
  const cleanReferences = extractReferences(sanitizedLines);
  const bodyRefSet = new Set(cleanReferences.bodyReferences);
  const footnoteRefSet = new Set(cleanReferences.footnoteReferences);
  const uniqueBodyRefs = [...new Set(cleanReferences.bodyReferences)];
  const uniqueFootnoteRefs = [...new Set(cleanReferences.footnoteReferences)];
  // References present on one side but missing on the other. These act as
  // FIFO queues: they are consumed via shift() when filling "()" below.
  const bodyRefsForFootnotes = uniqueBodyRefs.filter((ref) => !footnoteRefSet.has(ref));
  const footnoteRefsForBody = uniqueFootnoteRefs.filter((ref) => !bodyRefSet.has(ref));
  const allRefs = [...bodyRefSet, ...footnoteRefSet];
  // The next freshly minted number is (largest existing reference) + 1.
  const referenceCounter = { count: (allRefs.length > 0 ? Math.max(0, ...allRefs.map((ref) => arabicToNumber(ref))) : 0) + 1 };
  // Pass 2: fill each empty "()" — prefer an unmatched reference from the
  // opposite side (body ref for a footnote line and vice versa); otherwise
  // mint a brand-new number.
  return sanitizedLines.map((line) => {
    if (!line.text.includes(INVALID_FOOTNOTE)) return line;
    let updatedText = line.text;
    updatedText = updatedText.replace(/\(\)/g, () => {
      if (line.isFootnote) {
        const availableRef = bodyRefsForFootnotes.shift();
        if (availableRef) return availableRef;
      } else {
        const availableRef = footnoteRefsForBody.shift();
        if (availableRef) return availableRef;
      }
      const newRef = `(${numberToArabic(referenceCounter.count)})`;
      referenceCounter.count++;
      return newRef;
    });
    return {
      ...line,
      text: updatedText
    };
  });
};
1421
+
1422
+ //#endregion
1423
//#region src/utils/ahocorasick.ts
/**
 * Single state of the Aho-Corasick automaton trie.
 */
var ACNode = class {
  /** Transition map from characters to next node indices */
  next = /* @__PURE__ */ new Map();
  /** Failure (suffix) link followed on a mismatch */
  link = 0;
  /** Pattern IDs whose match ends at this state */
  out = [];
};
/**
 * Aho-Corasick automaton for efficient multi-pattern string matching.
 * Provides O(n + m + z) time where n is text length, m is total pattern
 * length, and z is the number of matches.
 */
var AhoCorasick = class {
  /** Flat array of automaton states; index 0 is the root */
  nodes = [new ACNode()];
  /**
   * Inserts a pattern into the trie.
   *
   * @param pattern - Pattern string to add
   * @param id - Unique identifier for this pattern
   */
  add(pattern, id) {
    let state = 0;
    // Iterate UTF-16 code units (matching String indexing elsewhere).
    for (let i = 0; i < pattern.length; i += 1) {
      const symbol = pattern.charAt(i);
      let target = this.nodes[state].next.get(symbol);
      if (target === undefined) {
        target = this.nodes.length;
        this.nodes[state].next.set(symbol, target);
        this.nodes.push(new ACNode());
      }
      state = target;
    }
    this.nodes[state].out.push(id);
  }
  /**
   * Builds failure links for the automaton using BFS.
   * Must be called after adding all patterns and before searching.
   */
  build() {
    const queue = [];
    // Depth-1 states fail back to the root.
    for (const child of this.nodes[0].next.values()) {
      this.nodes[child].link = 0;
      queue.push(child);
    }
    let head = 0;
    while (head < queue.length) {
      const state = queue[head];
      head += 1;
      for (const [symbol, target] of this.nodes[state].next) {
        queue.push(target);
        // Walk failure links until a state with a `symbol` transition is found.
        let fallback = this.nodes[state].link;
        while (fallback !== 0 && !this.nodes[fallback].next.has(symbol)) {
          fallback = this.nodes[fallback].link;
        }
        const hop = this.nodes[fallback].next.get(symbol);
        this.nodes[target].link = hop === undefined ? 0 : hop;
        // Inherit outputs reachable through the failure link.
        const inherited = this.nodes[this.nodes[target].link].out;
        if (inherited.length) this.nodes[target].out.push(...inherited);
      }
    }
  }
  /**
   * Finds all pattern matches in the given text.
   *
   * @param text - Text to search in
   * @param onMatch - Callback invoked for each match with the pattern ID and
   *   the (exclusive) end position of the match
   */
  find(text, onMatch) {
    let state = 0;
    for (let i = 0; i < text.length; i += 1) {
      const symbol = text.charAt(i);
      while (state !== 0 && !this.nodes[state].next.has(symbol)) {
        state = this.nodes[state].link;
      }
      const target = this.nodes[state].next.get(symbol);
      state = target === undefined ? 0 : target;
      for (const patternId of this.nodes[state].out) onMatch(patternId, i + 1);
    }
  }
};
/**
 * Builds an Aho-Corasick automaton for exact pattern matching. Empty patterns
 * are skipped (their IDs simply never match).
 *
 * @param patterns - Array of patterns to search for
 * @returns Constructed and built Aho-Corasick automaton ready for searching
 *
 * @example
 * ```typescript
 * const patterns = ['hello', 'world', 'hell'];
 * const ac = buildAhoCorasick(patterns);
 * ac.find('hello world', (patternId, endPos) => {
 *   console.log(`Found pattern ${patternId} ending at position ${endPos}`);
 * });
 * ```
 */
const buildAhoCorasick = (patterns) => {
  const automaton = new AhoCorasick();
  patterns.forEach((pattern, patternId) => {
    if (pattern.length > 0) automaton.add(pattern, patternId);
  });
  automaton.build();
  return automaton;
};
1529
+
1530
+ //#endregion
1531
//#region src/utils/constants.ts
/**
 * Default tuning knobs for the fuzzy page-matching pipeline.
 * NOTE(review): field descriptions are inferred from how the values are used
 * elsewhere in this bundle — confirm against the package's public docs.
 */
const DEFAULT_POLICY = {
  enableFuzzy: true, // fall back to fuzzy matching when exact matching fails
  gramsPerExcerpt: 5, // rare q-grams sampled per excerpt (see QGramIndex.pickRare)
  log: () => {}, // no-op logger by default
  maxCandidatesPerExcerpt: 40, // cap on fuzzy candidates evaluated per excerpt — TODO confirm
  maxEditAbs: 3, // absolute edit-distance bound — presumably combined with maxEditRel; verify
  maxEditRel: .1, // relative edit-distance bound (fraction of excerpt length) — TODO confirm
  q: 4, // q-gram length used by the index
  seamLen: 512 // characters taken from each side of a page boundary (see createSeams)
};
1542
+
1543
+ //#endregion
1544
//#region src/utils/fuzzyUtils.ts
// Upper bound on how many characters may be trimmed from a window that
// crosses a page boundary (see generateWindows).
const SEAM_GAP_CEILING = 200;
// Cap on the extra edit-distance allowance granted to boundary/seam windows
// (see calculateAcceptance).
const SEAM_BONUS_CAP = 80;
1547
/**
 * Concatenates normalized pages into one searchable "book" string, separating
 * consecutive pages with a single space and recording where each page starts
 * and how long it is.
 *
 * @param pagesN - Array of normalized page texts
 * @returns Object with the joined `book` text plus per-page `starts` offsets
 *   and `lens` lengths
 */
function buildBook(pagesN) {
  const starts = [];
  const lens = [];
  let book = "";
  pagesN.forEach((page, index) => {
    starts.push(book.length);
    lens.push(page.length);
    book += page;
    // Single-space separator between adjacent pages (not after the last one).
    if (index + 1 < pagesN.length) book += " ";
  });
  return {
    book,
    lens,
    starts
  };
}
1572
/**
 * Locates the page containing a given character position via binary search
 * over the ascending page-start offsets.
 *
 * @param pos - Character position within the concatenated book
 * @param pageStarts - Ascending array of page start offsets
 * @returns Index of the last page whose start is <= pos (0 when none is)
 */
function posToPage(pos, pageStarts) {
  let low = 0;
  let high = pageStarts.length - 1;
  let found = 0;
  while (low <= high) {
    const mid = (low + high) >> 1;
    if (pageStarts[mid] <= pos) {
      found = mid;
      low = mid + 1;
    } else {
      high = mid - 1;
    }
  }
  return found;
}
1588
/**
 * Performs exact matching using the Aho-Corasick algorithm to find all
 * occurrences of patterns in the concatenated book text. Each excerpt is
 * assigned the page on which its first match starts.
 *
 * @param book - Concatenated text from all pages
 * @param pageStarts - Array of starting positions for each page in the book
 * @param patterns - Array of deduplicated patterns to search for
 * @param patIdToOrigIdxs - Mapping from pattern IDs to original excerpt indices
 * @param excerptsCount - Number of original excerpts
 * @returns Object containing the per-excerpt result pages (-1 when unmatched)
 *   and the exact-match flags
 */
function findExactMatches(book, pageStarts, patterns, patIdToOrigIdxs, excerptsCount) {
  const result = new Int32Array(excerptsCount).fill(-1);
  const seenExact = new Uint8Array(excerptsCount);
  const automaton = buildAhoCorasick(patterns);
  automaton.find(book, (pid, endPos) => {
    // Map the match's start offset back to the page it falls on.
    const startPage = posToPage(endPos - patterns[pid].length, pageStarts);
    for (const origIdx of patIdToOrigIdxs[pid]) {
      if (seenExact[origIdx]) continue; // keep only the first match per excerpt
      result[origIdx] = startPage;
      seenExact[origIdx] = 1;
    }
  });
  return {
    result,
    seenExact
  };
}
1615
/**
 * Deduplicates excerpt strings into a unique pattern list, keeping a mapping
 * from each pattern back to every original excerpt index that produced it.
 *
 * @param excerptsN - Normalized excerpt strings (may contain duplicates)
 * @returns keyToPatId map, patIdToOrigIdxs table, and deduplicated patterns
 */
function deduplicateExcerpts(excerptsN) {
  const keyToPatId = /* @__PURE__ */ new Map();
  const patIdToOrigIdxs = [];
  const patterns = [];
  excerptsN.forEach((excerpt, index) => {
    const existing = keyToPatId.get(excerpt);
    if (existing === undefined) {
      // First occurrence: register a fresh pattern ID.
      keyToPatId.set(excerpt, patterns.length);
      patterns.push(excerpt);
      patIdToOrigIdxs.push([index]);
    } else {
      patIdToOrigIdxs[existing].push(index);
    }
  });
  return {
    keyToPatId,
    patIdToOrigIdxs,
    patterns
  };
}
1638
/**
 * Calculates the fuzzy match score for a candidate using bounded Levenshtein
 * distance: a window (plus slack) is extracted around the candidate position
 * and the best edit distance over all window variants is returned.
 *
 * @param excerpt - Text excerpt to match
 * @param candidate - Candidate position to evaluate
 * @param pagesN - Array of normalized page texts
 * @param seams - Array of seam data
 * @param maxDist - Maximum edit distance to consider
 * @returns Best match ({ acceptance, dist }) if within bounds, null otherwise
 */
const calculateFuzzyScore = (excerpt, candidate, pagesN, seams, maxDist) => {
  const L = excerpt.length;
  // Slack grows with excerpt length but never beyond maxDist.
  const extra = Math.min(maxDist, Math.max(6, Math.ceil(L * .12)));
  const start0 = candidate.start - Math.floor(extra / 2);
  const base = candidate.seam ? seams[candidate.page]?.text : pagesN[candidate.page];
  if (!base) return null;
  const buildWindow = createWindowBuilder(candidate, pagesN, seams, start0, L, extra);
  const windows = generateWindows(buildWindow, candidate, base, start0, L, extra);
  const acceptance = calculateAcceptance(candidate, base, start0, L, extra, maxDist);
  return findBestMatch(windows, excerpt, acceptance);
};
1658
/**
 * Creates a window-builder closure for the given candidate. Seam candidates
 * always build from the seam text (trim arguments are irrelevant for them);
 * page candidates build from page text with optional head/tail trimming.
 */
const createWindowBuilder = (candidate, pagesN, seams, start0, L, extra) => {
  if (candidate.seam) {
    return () => buildSeamWindow(seams, candidate.page, start0, L, extra);
  }
  return (trimTailEndBy = 0, trimHeadStartBy = 0) =>
    buildPageWindow(pagesN, candidate.page, start0, L, extra, trimTailEndBy, trimHeadStartBy);
};
1667
/**
 * Builds a window from seam text.
 *
 * @param seams - Array of seam records ({ text, ... })
 * @param page - Seam index to read from
 * @param start0 - Requested window start (clamped to 0)
 * @param L - Excerpt length the window should cover
 * @param extra - Additional slack characters to include
 * @returns The window slice, or null when the seam is missing or empty
 */
const buildSeamWindow = (seams, page, start0, L, extra) => {
  const seamText = seams[page]?.text;
  if (!seamText) return null;
  const from = Math.max(0, start0);
  const to = Math.min(seamText.length, from + L + extra);
  if (to <= from) return null;
  return seamText.slice(from, to);
};
1678
/**
 * Builds a window from page text, potentially spanning multiple pages.
 *
 * @param pagesN - Array of normalized page texts
 * @param page - Index of the anchor page
 * @param start0 - Window start relative to the anchor page (may be negative)
 * @param L - Excerpt length the window should cover
 * @param extra - Additional slack characters to include
 * @param trimTailEndBy - Characters to drop from the anchor page's tail
 * @param trimHeadStartBy - Characters NOT to borrow from previous pages
 * @returns Window string, or null when nothing could be collected
 */
const buildPageWindow = (pagesN, page, start0, L, extra, trimTailEndBy, trimHeadStartBy) => {
  const base = pagesN[page];
  if (!base) return null;
  const desired = L + extra;
  let s0 = start0;
  let window = "";
  // A negative start means the window begins on earlier page(s): borrow their
  // trailing characters first, then continue from the top of the anchor page.
  if (s0 < 0) {
    const needFromPrev = Math.max(0, -s0 - trimHeadStartBy);
    if (needFromPrev > 0) window += buildPreviousPagesContent(pagesN, page, needFromPrev);
    s0 = 0;
  }
  // Take what is still needed from the anchor page (minus any tail trim).
  const end0 = Math.min(base.length - trimTailEndBy, Math.max(0, s0) + desired - window.length);
  if (end0 > s0) window += base.slice(Math.max(0, s0), end0);
  // Top up from following pages until the desired length is reached.
  window += buildFollowingPagesContent(pagesN, page, desired - window.length);
  return window.length ? window : null;
};
1697
/**
 * Collects up to `needed` trailing characters from the pages preceding
 * `currentPage`, walking backwards. Chunks from different pages are joined
 * with single spaces and the result carries a trailing space.
 *
 * @param pagesN - Array of normalized page texts
 * @param currentPage - Anchor page; borrowing starts at currentPage - 1
 * @param needed - Number of characters to borrow
 * @returns Borrowed text ending in a space, or "" when nothing was borrowed
 */
const buildPreviousPagesContent = (pagesN, currentPage, needed) => {
  const chunks = [];
  let remaining = needed;
  for (let page = currentPage - 1; remaining > 0 && page >= 0; page--) {
    const source = pagesN[page];
    if (!source) break;
    const take = Math.min(remaining, source.length);
    const chunk = source.slice(source.length - take);
    chunks.unshift(chunk); // keep document order
    remaining -= chunk.length;
  }
  return chunks.length ? `${chunks.join(" ")} ` : "";
};
1715
/**
 * Collects up to `remaining` leading characters from the pages after
 * `currentPage`, walking forwards. Each borrowed chunk is prefixed with a
 * single space.
 *
 * @param pagesN - Array of normalized page texts
 * @param currentPage - Anchor page; borrowing starts at currentPage + 1
 * @param remaining - Number of characters to borrow
 * @returns Borrowed text (possibly ""), each chunk preceded by a space
 */
const buildFollowingPagesContent = (pagesN, currentPage, remaining) => {
  const chunks = [];
  let budget = remaining;
  for (let page = currentPage + 1; budget > 0 && page < pagesN.length; page++) {
    const source = pagesN[page];
    if (!source) break;
    const addition = source.slice(0, budget);
    if (!addition.length) break;
    chunks.push(` ${addition}`);
    budget -= addition.length;
  }
  return chunks.join("");
};
1732
/**
 * Generates all window variants to try for a candidate: the primary window,
 * plus — for page candidates that cross a page boundary — variants with the
 * anchor page's tail or head trimmed (capped by SEAM_GAP_CEILING).
 *
 * @param buildWindow - Window builder from createWindowBuilder
 * @param candidate - Candidate being evaluated
 * @param base - Base text the candidate points into
 * @param start0 - Window start (may be negative)
 * @param L - Excerpt length
 * @param extra - Slack characters
 * @returns Array of non-empty window strings
 */
const generateWindows = (buildWindow, candidate, base, start0, L, extra) => {
  const desired = L + extra;
  const onPage = !candidate.seam;
  const crossesEnd = onPage && start0 + desired > base.length;
  const crossesStart = onPage && start0 < 0;
  const windows = [];
  const primary = buildWindow(0, 0);
  if (primary) windows.push(primary);
  if (crossesEnd) {
    const cut = Math.min(SEAM_GAP_CEILING, Math.max(0, base.length - Math.max(0, start0)));
    if (cut > 0) {
      const tailTrimmed = buildWindow(cut, 0);
      if (tailTrimmed) windows.push(tailTrimmed);
    }
  }
  if (crossesStart) {
    const headTrimmed = buildWindow(0, Math.min(SEAM_GAP_CEILING, -start0));
    if (headTrimmed) windows.push(headTrimmed);
  }
  return windows;
};
1755
/**
 * Calculates the acceptance threshold for edit distance. Windows that touch a
 * page boundary (seam candidates, or page windows starting before / ending
 * past the page) get a larger, capped bonus; plain windows get a 1-2 edit
 * slack to absorb normalization artifacts.
 */
const calculateAcceptance = (candidate, base, start0, L, extra, maxDist) => {
  // Equivalent to: seam || crossesStart || crossesEnd.
  const spansBoundary = candidate.seam || start0 < 0 || start0 + L + extra > base.length;
  if (spansBoundary) {
    return maxDist + Math.min(SEAM_BONUS_CAP, Math.ceil(L * .08));
  }
  return maxDist + Math.min(2, Math.max(1, Math.ceil(L * .005)));
};
1765
/**
 * Finds the best (lowest) bounded-Levenshtein distance among all windows.
 *
 * @param windows - Window strings to compare against the excerpt
 * @param excerpt - Excerpt being matched
 * @param acceptance - Maximum distance considered a match
 * @returns { acceptance, dist } for the best window, or null if none qualifies
 */
const findBestMatch = (windows, excerpt, acceptance) => {
  let bestDist = null;
  for (const window of windows) {
    const dist = boundedLevenshtein(excerpt, window, acceptance);
    if (dist > acceptance) continue;
    if (bestDist === null || dist < bestDist) bestDist = dist;
  }
  if (bestDist === null) return null;
  return {
    acceptance,
    dist: bestDist
  };
};
1779
+
1780
+ //#endregion
1781
//#region src/utils/qgram.ts
/**
 * Q-gram index for efficient fuzzy string matching candidate generation.
 * Maintains an inverted index of q-grams to their occurrence positions.
 */
var QGramIndex = class {
  /** Length of q-grams to index */
  q;
  /** Inverted index mapping q-grams to their postings */
  map = /* @__PURE__ */ new Map();
  /** Frequency count for each q-gram in the corpus */
  gramFreq = /* @__PURE__ */ new Map();
  /**
   * Creates a new Q-gram index with the specified gram length.
   * @param q - Length of q-grams to index (typically 3-5)
   */
  constructor(q) {
    this.q = q;
  }
  /**
   * Adds text to the index, extracting q-grams and building postings.
   *
   * @param page - Page number or identifier for this text
   * @param text - Text content to index
   * @param seam - Whether this text represents a seam (cross-page boundary)
   */
  addText(page, text, seam) {
    const q = this.q;
    const m = text.length;
    // Texts shorter than q contribute no grams.
    if (m < q) return;
    for (let i = 0; i + q <= m; i++) {
      const gram = text.slice(i, i + q);
      let postings = this.map.get(gram);
      if (!postings) {
        postings = [];
        this.map.set(gram, postings);
      }
      postings.push({
        page,
        pos: i,
        seam
      });
      this.gramFreq.set(gram, (this.gramFreq.get(gram) ?? 0) + 1);
    }
  }
  /**
   * Picks the rarest grams from an excerpt that exist in the index.
   *
   * @param excerpt - Excerpt to sample grams from
   * @param gramsPerExcerpt - Number of grams to return (floored, minimum 1)
   * @returns Selected grams with their offsets within the excerpt
   */
  pickRare(excerpt, gramsPerExcerpt) {
    gramsPerExcerpt = Math.max(1, Math.floor(gramsPerExcerpt));
    const items = [];
    const seen = /* @__PURE__ */ new Set();
    const q = this.q;
    // Collect each distinct gram of the excerpt with its corpus frequency.
    // Grams absent from the corpus get a sentinel frequency (2^31 - 1) so
    // they sort last.
    for (let i = 0; i + q <= excerpt.length; i++) {
      const gram = excerpt.slice(i, i + q);
      if (seen.has(gram)) continue;
      seen.add(gram);
      const freq = this.gramFreq.get(gram) ?? 2147483647;
      items.push({
        freq,
        gram,
        offset: i
      });
    }
    // Array.prototype.sort is stable, so equal-frequency grams keep their
    // earliest-offset order — this ordering is relied upon by callers.
    items.sort((a, b) => a.freq - b.freq);
    const result = [];
    // Primary pass: rarest indexed grams first.
    for (const it of items) if (this.map.has(it.gram)) {
      result.push({
        gram: it.gram,
        offset: it.offset
      });
      if (result.length >= gramsPerExcerpt) return result;
    }
    // Fallback pass, scanning from the most-frequent end.
    // NOTE(review): when the primary pass completes without returning early,
    // `result` already holds every indexed gram, so the `!chosen.has` filter
    // makes this pass a no-op — it appears to be dead code; verify intent.
    if (result.length < gramsPerExcerpt) {
      const chosen = new Set(result.map((r) => r.gram));
      for (let i = items.length - 1; i >= 0 && result.length < gramsPerExcerpt; i--) {
        const it = items[i];
        if (this.map.has(it.gram) && !chosen.has(it.gram)) {
          result.push({
            gram: it.gram,
            offset: it.offset
          });
          chosen.add(it.gram);
        }
      }
    }
    return result;
  }
  /**
   * Returns the postings list for a gram, or undefined when not indexed.
   */
  getPostings(gram) {
    return this.map.get(gram);
  }
};
1873
+
1874
+ //#endregion
1875
+ //#region src/fuzzy.ts
1876
/**
 * Builds "seam" strings that stitch the tail of each page to the head of
 * the next, so excerpts spanning a page break can still be located.
 *
 * @param pagesN - Normalized page texts
 * @param seamLen - Characters taken from each side of the boundary
 * @returns One `{ startPage, text }` entry per adjacent page pair
 */
function createSeams(pagesN, seamLen) {
  const seams = [];
  for (let p = 1; p < pagesN.length; p++) {
    const tail = pagesN[p - 1].slice(-seamLen);
    const head = pagesN[p].slice(0, seamLen);
    seams.push({ startPage: p - 1, text: `${tail} ${head}` });
  }
  return seams;
}
1895
/**
 * Constructs the q-gram index over all pages plus the cross-page seams.
 * Seam texts are indexed under their seam index with the seam flag set.
 *
 * @param pagesN - Normalized page texts
 * @param seams - Seam entries from createSeams
 * @param q - Gram length to index
 * @returns Populated QGramIndex
 */
function buildQGramIndex(pagesN, seams, q) {
  const qidx = new QGramIndex(q);
  pagesN.forEach((text, p) => qidx.addText(p, text, false));
  seams.forEach((seam, p) => qidx.addText(p, seam.text, true));
  return qidx;
}
1910
/**
 * Generates fuzzy-match candidate positions from the rarest q-grams of the
 * excerpt. Each posting of a seed gram is projected back to the excerpt's
 * start position; duplicates are removed and the candidate count is capped.
 *
 * @param excerpt - Text excerpt to find candidates for
 * @param qidx - Q-gram index over pages and seams
 * @param cfg - Match policy (gramsPerExcerpt, maxCandidatesPerExcerpt)
 * @returns Candidate `{ page, seam, start }` positions
 */
function generateCandidates(excerpt, qidx, cfg) {
  const seeds = qidx.pickRare(excerpt, cfg.gramsPerExcerpt);
  if (seeds.length === 0) return [];
  // Reject starts projected too far before the text (>25% of the excerpt).
  const minStart = -Math.floor(excerpt.length * 0.25);
  const seenKeys = new Set();
  const candidates = [];
  for (const seed of seeds) {
    const posts = qidx.getPostings(seed.gram);
    if (!posts) continue;
    for (const post of posts) {
      const projected = post.pos - seed.offset;
      if (projected < minStart) continue;
      const start = projected < 0 ? 0 : projected;
      const key = `${post.page}:${start}:${post.seam ? 1 : 0}`;
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);
      candidates.push({ page: post.page, seam: post.seam, start });
      if (candidates.length >= cfg.maxCandidatesPerExcerpt) return candidates;
    }
  }
  return candidates;
}
1945
/**
 * Evaluates every candidate and keeps the best one: lowest edit distance,
 * with earlier page as tie-breaker. Stops early on a perfect match.
 *
 * @param excerpt - Text excerpt to match
 * @param candidates - Candidate positions to evaluate
 * @param pagesN - Normalized page texts
 * @param seams - Seam data for cross-page matching
 * @param cfg - Match policy configuration
 * @returns Best `{ dist, page }` or null when nothing qualifies
 */
function findBestFuzzyMatch(excerpt, candidates, pagesN, seams, cfg) {
  if (excerpt.length === 0) return null;
  const maxDist = calculateMaxDistance(excerpt, cfg);
  cfg.log("maxDist", maxDist);
  const keyset = new Set();
  let best = null;
  for (const cand of candidates) {
    if (shouldSkipCandidate(cand, keyset)) continue;
    const hit = evaluateCandidate(cand, excerpt, pagesN, seams, maxDist, cfg);
    if (!hit) continue;
    best = updateBestMatch(best, hit, cand);
    cfg.log("findBest best", best);
    // A zero-distance match cannot be beaten; stop scanning.
    if (hit.dist === 0) break;
  }
  return best;
}
1972
/**
 * Computes the edit-distance budget for an excerpt: the larger of the
 * absolute cap and the relative cap scaled by excerpt length (rounded up).
 *
 * @param excerpt - The excerpt being matched
 * @param cfg - Policy providing maxEditAbs and maxEditRel
 * @returns Maximum permitted edit distance
 */
function calculateMaxDistance(excerpt, cfg) {
  const relative = Math.ceil(cfg.maxEditRel * excerpt.length);
  return Math.max(cfg.maxEditAbs, relative);
}
1982
/**
 * Deduplicates candidates: returns true when this candidate's serialized
 * key was already seen, otherwise records it and returns false.
 *
 * @param candidate - Candidate under consideration
 * @param keyset - Mutable set of serialized candidate keys
 * @returns True when the candidate is a duplicate
 */
function shouldSkipCandidate(candidate, keyset) {
  const { page, start, seam } = candidate;
  const key = `${page}:${start}:${seam ? 1 : 0}`;
  if (keyset.has(key)) return true;
  keyset.add(key);
  return false;
}
1995
/**
 * Scores a candidate via calculateFuzzyScore and applies the acceptance
 * check. A missing score falls back to maxDist as the acceptance bound.
 *
 * @param candidate - Candidate segment to evaluate
 * @param excerpt - Normalized excerpt being matched
 * @param pagesN - Normalized page texts
 * @param seams - Seam data for cross-page matching
 * @param maxDist - Edit-distance budget for this excerpt
 * @param cfg - Match policy (used for logging)
 * @returns `{ acceptance, dist }` when accepted, otherwise null
 */
function evaluateCandidate(candidate, excerpt, pagesN, seams, maxDist, cfg) {
  const scored = calculateFuzzyScore(excerpt, candidate, pagesN, seams, maxDist);
  const dist = scored?.dist ?? null;
  const acceptance = scored?.acceptance ?? maxDist;
  cfg.log("dist", dist);
  if (!isValidMatch(dist, acceptance)) return null;
  return { acceptance, dist };
}
2016
/**
 * Acceptance predicate: a match is valid when a distance was computed and
 * it does not exceed the acceptance threshold.
 *
 * @param dist - Computed edit distance, or null when unavailable
 * @param acceptance - Maximum acceptable distance
 * @returns True when the match should be accepted
 */
function isValidMatch(dist, acceptance) {
  if (dist === null) return false;
  return dist <= acceptance;
}
2026
/**
 * Folds a freshly-accepted candidate into the running best match,
 * delegating the ranking decision to isBetterMatch.
 *
 * @param current - Previously best match, or null/undefined
 * @param match - Latest accepted match metrics
 * @param candidate - Candidate metadata for the latest match
 * @returns The preferred match after considering the candidate
 */
function updateBestMatch(current, match, candidate) {
  const contender = { dist: match.dist, page: candidate.page };
  if (!current) return contender;
  const wins = isBetterMatch(match.dist, candidate.page, current.dist, current.page);
  return wins ? contender : current;
}
2042
/**
 * Ranking rule for fuzzy matches: strictly lower edit distance wins; on a
 * distance tie the earlier page wins.
 *
 * @param newDist - Edit distance of the new match
 * @param newPage - Page index of the new match
 * @param bestDist - Edit distance of the current best match
 * @param bestPage - Page index of the current best match
 * @returns True when the new match should replace the current best
 */
function isBetterMatch(newDist, newPage, bestDist, bestPage) {
  if (newDist !== bestDist) return newDist < bestDist;
  return newPage < bestPage;
}
2054
/**
 * Runs the fuzzy pass over every excerpt that missed an exact match,
 * writing the best page into `result` and marking the excerpt as matched.
 * No-op when fuzzy matching is disabled by policy.
 *
 * @param excerptsN - Normalized excerpts
 * @param pagesN - Normalized page texts
 * @param seenExact - Per-excerpt flags (1 = already matched); mutated
 * @param result - Per-excerpt page result array; mutated
 * @param cfg - Match policy configuration
 */
function performFuzzyMatching(excerptsN, pagesN, seenExact, result, cfg) {
  if (!cfg.enableFuzzy) return;
  const seams = createSeams(pagesN, cfg.seamLen);
  const qidx = buildQGramIndex(pagesN, seams, cfg.q);
  excerptsN.forEach((excerpt, i) => {
    if (seenExact[i]) return;
    cfg.log("excerpt", excerpt);
    // Excerpts shorter than one q-gram cannot seed any candidates.
    if (!excerpt || excerpt.length < cfg.q) return;
    const candidates = generateCandidates(excerpt, qidx, cfg);
    cfg.log("candidates", candidates);
    if (candidates.length === 0) return;
    const best = findBestFuzzyMatch(excerpt, candidates, pagesN, seams, cfg);
    cfg.log("best", best);
    if (!best) return;
    result[i] = best.page;
    seenExact[i] = 1;
  });
}
2084
/**
 * Finds the single best page per excerpt: normalize, run exact matching
 * via Aho-Corasick over the concatenated book, then fall back to fuzzy
 * matching only for excerpts still unmatched.
 *
 * @param pages - Page texts to search within
 * @param excerpts - Text excerpts to locate
 * @param policy - Optional matching policy overrides
 * @returns One page index per excerpt (-1 when no match found)
 *
 * @example
 * ```typescript
 * const pages = ['Hello world', 'Goodbye world'];
 * const excerpts = ['Hello', 'Good bye']; // Note the typo
 * const matches = findMatches(pages, excerpts, { enableFuzzy: true });
 * // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
 * ```
 */
function findMatches(pages, excerpts, policy = {}) {
  const cfg = { ...DEFAULT_POLICY, ...policy };
  const log = policy.log;
  const pagesN = pages.map((page) => sanitizeArabic(page, "aggressive"));
  const excerptsN = excerpts.map((excerpt) => sanitizeArabic(excerpt, "aggressive"));
  if (log) {
    log("pages", pages);
    log("excerpts", excerpts);
    log("pagesN", pagesN);
    log("excerptsN", excerptsN);
  }
  const { patIdToOrigIdxs, patterns } = deduplicateExcerpts(excerptsN);
  const { book, starts: pageStarts } = buildBook(pagesN);
  const { result, seenExact } = findExactMatches(book, pageStarts, patterns, patIdToOrigIdxs, excerpts.length);
  if (log) {
    log("findExactMatches result", result);
    log("seenExact", seenExact);
  }
  // Only pay for the fuzzy pass when at least one excerpt is unmatched.
  const allExact = seenExact.every((flag) => flag === 1);
  if (!allExact) performFuzzyMatching(excerptsN, pagesN, seenExact, result, cfg);
  if (log) log("performFuzzyMatching result", result);
  return Array.from(result);
}
2125
/**
 * Records every exact (Aho-Corasick) hit into the per-excerpt hit maps.
 * An exact hit always overwrites a non-exact entry for the same page.
 *
 * @param book - Concatenated text of all pages
 * @param pageStarts - Start offset of each page inside `book`
 * @param patterns - Deduplicated search patterns
 * @param patIdToOrigIdxs - Pattern id -> original excerpt indices
 * @param hitsByExcerpt - Per-excerpt page->hit maps; mutated
 */
function recordExactMatches(book, pageStarts, patterns, patIdToOrigIdxs, hitsByExcerpt) {
  const automaton = buildAhoCorasick(patterns);
  automaton.find(book, (pid, endPos) => {
    const startPage = posToPage(endPos - patterns[pid].length, pageStarts);
    for (const origIdx of patIdToOrigIdxs[pid]) {
      const hits = hitsByExcerpt[origIdx];
      const existing = hits.get(startPage);
      if (existing?.exact) continue;
      hits.set(startPage, { exact: true, score: 1, seam: false });
    }
  });
}
2149
/**
 * Scores one fuzzy candidate and upgrades the page's hit entry when the
 * new score is better. Exact entries are never displaced; duplicates are
 * skipped via the shared keyset.
 *
 * @param candidate - Candidate position to evaluate
 * @param excerpt - Excerpt being matched
 * @param pagesN - Normalized page texts
 * @param seams - Seam data
 * @param maxDist - Edit-distance budget
 * @param hits - Page->hit map for this excerpt; mutated
 * @param keyset - Dedup set of serialized candidate keys; mutated
 */
function processFuzzyCandidate(candidate, excerpt, pagesN, seams, maxDist, hits, keyset) {
  const key = `${candidate.page}:${candidate.start}:${candidate.seam ? 1 : 0}`;
  if (keyset.has(key)) return;
  keyset.add(key);
  const scored = calculateFuzzyScore(excerpt, candidate, pagesN, seams, maxDist);
  if (!scored) return;
  const { dist, acceptance } = scored;
  if (dist > acceptance) return;
  // Normalize to [0, 1]: 1 is a perfect match, 0 sits at the threshold.
  const score = 1 - dist / acceptance;
  const existing = hits.get(candidate.page);
  const improves = !existing || (!existing.exact && score > existing.score);
  if (improves) hits.set(candidate.page, { exact: false, score, seam: candidate.seam });
}
2177
/**
 * Runs fuzzy candidate generation and scoring for one excerpt, skipping
 * excerpts that already own an exact hit or are shorter than one q-gram.
 *
 * @param excerptIndex - Index of the excerpt being processed
 * @param excerpt - Excerpt text to match
 * @param pagesN - Normalized page texts
 * @param seams - Seam data
 * @param qidx - Q-gram index for candidate generation
 * @param hitsByExcerpt - Per-excerpt page->hit maps; mutated
 * @param cfg - Match policy configuration
 */
function processSingleExcerptFuzzy(excerptIndex, excerpt, pagesN, seams, qidx, hitsByExcerpt, cfg) {
  const hits = hitsByExcerpt[excerptIndex];
  let hasExact = false;
  for (const hit of hits.values()) {
    if (hit.exact) {
      hasExact = true;
      break;
    }
  }
  if (hasExact) return;
  if (!excerpt || excerpt.length < cfg.q) return;
  const candidates = generateCandidates(excerpt, qidx, cfg);
  if (candidates.length === 0) return;
  const maxDist = Math.max(cfg.maxEditAbs, Math.ceil(cfg.maxEditRel * excerpt.length));
  const keyset = new Set();
  for (const candidate of candidates) {
    processFuzzyCandidate(candidate, excerpt, pagesN, seams, maxDist, hits, keyset);
  }
}
2199
/**
 * Fuzzy pass for findMatchesAll: builds seams and the q-gram index once,
 * then scores candidates for every excerpt lacking an exact hit.
 *
 * @param excerptsN - Normalized excerpts
 * @param pagesN - Normalized page texts
 * @param hitsByExcerpt - Per-excerpt page->hit maps; mutated
 * @param cfg - Match policy configuration
 */
function recordFuzzyMatches(excerptsN, pagesN, hitsByExcerpt, cfg) {
  const seams = createSeams(pagesN, cfg.seamLen);
  const qidx = buildQGramIndex(pagesN, seams, cfg.q);
  excerptsN.forEach((excerpt, i) => {
    processSingleExcerptFuzzy(i, excerpt, pagesN, seams, qidx, hitsByExcerpt, cfg);
  });
}
2213
/**
 * Post-processes one excerpt's hit map into a ranked page list: adjacent
 * seam pairs are collapsed, redundant seams dropped, then pages are ranked
 * (exact hits first in page order, fuzzy hits by descending score).
 *
 * @param hits - Page->hit map with quality scores; mutated during cleanup
 * @returns Page numbers sorted by match quality
 */
const sortMatches = (hits) => {
  if (hits.size === 0) return [];
  collapseAdjacentSeams(hits);
  removeWeakSeams(hits);
  return rankHits(hits);
};
2226
/**
 * Walks pages in ascending order and, whenever two consecutive pages both
 * carry seam hits, deletes the weaker of the pair.
 *
 * @param hits - Mutable page->hit map possibly containing seam entries
 */
const collapseAdjacentSeams = (hits) => {
  const sortedPages = [...hits.keys()].sort((x, y) => x - y);
  for (const page of sortedPages) {
    const hit = hits.get(page);
    const next = hits.get(page + 1);
    if (!shouldCollapseSeams(hit, next)) continue;
    hits.delete(selectWeakerSeam(page, hit, next));
  }
};
2242
/**
 * True only when both neighboring hits exist and are seam matches.
 *
 * @param hit1 - First hit of the pair (may be undefined)
 * @param hit2 - Second hit of the pair (may be undefined)
 * @returns Whether the pair should be collapsed to one seam
 */
const shouldCollapseSeams = (hit1, hit2) => {
  if (!hit1 || !hit2) return false;
  return Boolean(hit1.seam) && Boolean(hit2.seam);
};
2252
/**
 * Chooses which of two adjacent seam pages to discard: the first page when
 * the second seam scores strictly higher, otherwise the second page (ties
 * keep the earlier page).
 *
 * @param page1 - Page index of the first seam hit
 * @param hit1 - First seam hit entry
 * @param hit2 - Second seam hit entry (on page1 + 1)
 * @returns Page index to remove from the hits map
 */
const selectWeakerSeam = (page1, hit1, hit2) =>
  hit2.score > hit1.score ? page1 : page1 + 1;
2265
/**
 * Deletes seam hits that are redundant next to their following-page
 * neighbor (e.g. the neighbor is exact or a stronger non-seam hit).
 *
 * @param hits - Mutable page->hit map possibly containing redundant seams
 */
const removeWeakSeams = (hits) => {
  // Snapshot entries first: we mutate the map while scanning.
  for (const [page, hit] of [...hits.entries()]) {
    if (!hit.seam) continue;
    if (isSeamRedundant(hit, hits.get(page + 1))) hits.delete(page);
  }
};
2274
/**
 * A seam hit is redundant when its next-page neighbor is an exact hit, or
 * a non-seam hit that scores at least as high.
 *
 * @param seamHit - Seam hit under review
 * @param neighbor - Hit on the following page, if any
 * @returns True when the seam hit should be removed
 */
const isSeamRedundant = (seamHit, neighbor) => {
  if (!neighbor) return false;
  if (neighbor.exact) return true;
  return !neighbor.seam && neighbor.score >= seamHit.score;
};
2285
/**
 * Ranks hits: exact matches first in ascending page order, then fuzzy
 * matches by descending score (ascending page breaks ties).
 *
 * @param hits - Page->hit map to rank
 * @returns Page numbers ordered by relevance
 */
const rankHits = (hits) => {
  const exact = [];
  const fuzzy = [];
  for (const [page, hit] of hits.entries()) {
    (hit.exact ? exact : fuzzy).push([page, hit]);
  }
  exact.sort(([pa], [pb]) => pa - pb);
  fuzzy.sort(([pa, ha], [pb, hb]) => hb.score - ha.score || pa - pb);
  return exact.concat(fuzzy).map(([page]) => page);
};
2300
/**
 * Finds ALL matching pages per excerpt, ranked by quality: exact matches
 * via Aho-Corasick, then (when enabled) fuzzy matches, cleaned up and
 * sorted by sortMatches.
 *
 * @param pages - Page texts to search within
 * @param excerpts - Text excerpts to locate
 * @param policy - Optional matching policy overrides
 * @returns One ranked page-index array per excerpt
 *
 * @example
 * ```typescript
 * const pages = ['Hello world', 'Hello there', 'Goodbye world'];
 * const excerpts = ['Hello'];
 * const matches = findMatchesAll(pages, excerpts);
 * // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
 * ```
 */
function findMatchesAll(pages, excerpts, policy = {}) {
  const cfg = { ...DEFAULT_POLICY, ...policy };
  const log = policy.log;
  const pagesN = pages.map((page) => sanitizeArabic(page, "aggressive"));
  const excerptsN = excerpts.map((excerpt) => sanitizeArabic(excerpt, "aggressive"));
  if (log) {
    log("pages", pages);
    log("excerpts", excerpts);
    log("pagesN", pagesN);
    log("excerptsN", excerptsN);
  }
  const { patIdToOrigIdxs, patterns } = deduplicateExcerpts(excerptsN);
  const { book, starts: pageStarts } = buildBook(pagesN);
  const hitsByExcerpt = excerpts.map(() => new Map());
  recordExactMatches(book, pageStarts, patterns, patIdToOrigIdxs, hitsByExcerpt);
  if (cfg.enableFuzzy) recordFuzzyMatches(excerptsN, pagesN, hitsByExcerpt, cfg);
  return hitsByExcerpt.map((hits) => sortMatches(hits));
}
2337
+
2338
+ //#endregion
2339
+ //#region src/noise.ts
2340
/**
 * Heuristic noise detector for Arabic OCR output. Flags empty/tiny input,
 * known artifact patterns, excessive character repetition, Latin-only
 * text, too-thin Arabic content, and suspicious non-Arabic content.
 *
 * @param text - Input string to analyze
 * @returns true when the text is likely noise, false for valid content
 *
 * @example
 * ```typescript
 * import { isArabicTextNoise } from 'baburchi';
 *
 * console.log(isArabicTextNoise('---')); // true (formatting artifact)
 * console.log(isArabicTextNoise('السلام عليكم')); // false (valid Arabic)
 * console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
 * ```
 */
const isArabicTextNoise = (text) => {
  if (!text) return true;
  const trimmed = text.trim();
  const length = trimmed.length;
  // Whitespace-only input trims to length 0 and is caught here too.
  if (length < 2) return true;
  if (isBasicNoisePattern(trimmed)) return true;
  const charStats = analyzeCharacterStats(trimmed);
  if (hasExcessiveRepetition(charStats, length)) return true;
  const containsArabic = PATTERNS.arabicCharacters.test(trimmed);
  if (containsArabic) return !isValidArabicContent(charStats, length);
  if (/[a-zA-Z]/.test(trimmed)) return true;
  return isNonArabicNoise(charStats, length, trimmed);
};
2370
/**
 * Tallies character composition (Arabic, digits, Latin, spaces,
 * punctuation, other symbols) and per-character frequency for the input.
 * Categories are mutually exclusive and checked in the listed order.
 *
 * @param text - Text string to analyze
 * @returns Stats object with counts and a charFreq Map
 *
 * @example
 * ```typescript
 * import { analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('مرحبا 123!');
 * console.log(stats.arabicCount); // 5
 * console.log(stats.digitCount); // 3
 * console.log(stats.symbolCount); // 1
 * ```
 */
function analyzeCharacterStats(text) {
  const stats = {
    arabicCount: 0,
    charFreq: new Map(),
    digitCount: 0,
    latinCount: 0,
    punctuationCount: 0,
    spaceCount: 0,
    symbolCount: 0
  };
  // for...of iterates Unicode code points, matching Array.from(text).
  for (const char of text) {
    stats.charFreq.set(char, (stats.charFreq.get(char) ?? 0) + 1);
    if (PATTERNS.arabicCharacters.test(char)) {
      stats.arabicCount += 1;
    } else if (/\d/.test(char)) {
      stats.digitCount += 1;
    } else if (/[a-zA-Z]/.test(char)) {
      stats.latinCount += 1;
    } else if (/\s/.test(char)) {
      stats.spaceCount += 1;
    } else if (/[.,;:()[\]{}"""''`]/.test(char)) {
      stats.punctuationCount += 1;
    } else {
      stats.symbolCount += 1;
    }
  }
  return stats;
}
2410
/**
 * Detects runs of repetitive filler characters (!, ., -, =, _) that
 * dominate the text: any such character appearing 5+ times counts, and
 * noise is declared when they exceed 40% of the length.
 *
 * @param charStats - Output of analyzeCharacterStats
 * @param textLength - Length of the original text
 * @returns true when repetition dominates the text
 *
 * @example
 * ```typescript
 * import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!!!');
 * console.log(hasExcessiveRepetition(stats, 5)); // true
 * ```
 */
function hasExcessiveRepetition(charStats, textLength) {
  const repetitive = new Set(["!", ".", "-", "=", "_"]);
  let repeated = 0;
  for (const [char, count] of charStats.charFreq) {
    if (count >= 5 && repetitive.has(char)) repeated += count;
  }
  return repeated / textLength > 0.4;
}
2442
/**
 * Matches the text against a fixed list of whole-string noise patterns:
 * separator runs, dot/bang runs, uppercase-only, digit/dash runs, em-dash
 * runs, and a Devanagari artifact cluster.
 *
 * @param text - Text to check against the noise patterns
 * @returns true when any pattern matches the whole text
 *
 * @example
 * ```typescript
 * import { isBasicNoisePattern } from 'baburchi';
 *
 * console.log(isBasicNoisePattern('---')); // true
 * console.log(isBasicNoisePattern('- 77')); // true
 * console.log(isBasicNoisePattern('hello world')); // false
 * ```
 */
function isBasicNoisePattern(text) {
  const noisePatterns = [
    /^[-=_━≺≻\s]*$/,
    /^[.\s]*$/,
    /^[!\s]*$/,
    /^[A-Z\s]*$/,
    /^[-\d\s]*$/,
    /^\d+\s*$/,
    /^[A-Z]\s*$/,
    /^[—\s]*$/,
    /^[्र\s-]*$/
  ];
  for (const pattern of noisePatterns) {
    if (pattern.test(text)) return true;
  }
  return false;
}
2474
/**
 * Classifies text with no Arabic letters: rejects content-free or
 * spacing-dominated strings and symbol-heavy strings, while whitelisting
 * Arabic-Indic digit runs and 3-4 digit numbers (e.g. years).
 *
 * @param charStats - Output of analyzeCharacterStats
 * @param textLength - Length of the original text
 * @param text - Original text for extra pattern checks
 * @returns true when the content is likely noise
 *
 * @example
 * ```typescript
 * import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!');
 * console.log(isNonArabicNoise(stats, 3, '!!!')); // true
 * ```
 */
function isNonArabicNoise(charStats, textLength, text) {
  const contentChars = charStats.arabicCount + charStats.latinCount + charStats.digitCount;
  if (contentChars === 0) return true;
  if (isSpacingNoise(charStats, contentChars, textLength)) return true;
  // Arabic-Indic digit sequences of 3+ are treated as meaningful numbers.
  if (/[٠-٩]/.test(text) && charStats.digitCount >= 3) return false;
  const junk = charStats.symbolCount + Math.max(0, charStats.punctuationCount - 5);
  if (junk / Math.max(contentChars, 1) > 2) return true;
  const isLongNumber = /^\d+$/.test(text) && charStats.digitCount >= 3;
  if (textLength <= 5 && charStats.arabicCount === 0 && !isLongNumber) return true;
  // 3-4 digit numbers (years, page numbers) are kept.
  if (/^\d{3,4}$/.test(text)) return false;
  return textLength <= 10;
}
2505
/**
 * Flags spacing patterns typical of OCR artifacts: short strings of
 * space-separated single characters, short space-heavy non-Arabic text,
 * and text that is more than 60% whitespace.
 *
 * @param charStats - Output of analyzeCharacterStats
 * @param contentChars - Arabic + Latin + digit character count
 * @param textLength - Length of the original text
 * @returns true when spacing indicates noise
 *
 * @example
 * ```typescript
 * import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats(' a ');
 * const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
 * console.log(isSpacingNoise(stats, contentChars, 3)); // true
 * ```
 */
function isSpacingNoise(charStats, contentChars, textLength) {
  const { arabicCount, spaceCount } = charStats;
  // "a b c" shape: every content char separated by a space, few chars total.
  const spacedSingles = spaceCount > 0 && contentChars === spaceCount + 1 && contentChars <= 5;
  if (spacedSingles) return true;
  if (arabicCount === 0 && spaceCount >= 2 && textLength <= 10) return true;
  return spaceCount / textLength > 0.6;
}
2535
/**
 * Decides whether Arabic text carries enough substance to keep: three or
 * more Arabic letters always pass; smaller counts pass only in short
 * strings with digits or little punctuation.
 *
 * @param charStats - Output of analyzeCharacterStats
 * @param textLength - Length of the original text
 * @returns true when the Arabic content appears meaningful
 *
 * @example
 * ```typescript
 * import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
 *
 * const validStats = analyzeCharacterStats('السلام عليكم');
 * console.log(isValidArabicContent(validStats, 12)); // true
 *
 * const shortStats = analyzeCharacterStats('ص');
 * console.log(isValidArabicContent(shortStats, 1)); // false
 * ```
 */
function isValidArabicContent(charStats, textLength) {
  const { arabicCount, digitCount, punctuationCount } = charStats;
  if (arabicCount >= 3) return true;
  if (arabicCount >= 1 && digitCount > 0 && textLength <= 20) return true;
  if (arabicCount >= 2 && punctuationCount <= 2 && textLength <= 10) return true;
  return arabicCount >= 1 && textLength <= 5 && punctuationCount <= 1;
}
2565
+
2566
+ //#endregion
2567
+ //#region src/typos.ts
2568
/**
 * Picks the preferred token(s) for one aligned pair produced during typo
 * correction. Falls through a series of heuristics: gap handling, normalized
 * equality, footnote rules, known typo symbols, and finally a similarity vote.
 *
 * @param originalToken - Token from the original OCR text, or null for a gap.
 * @param altToken - Token from the alternative OCR text, or null for a gap.
 * @param options - Carries similarityThreshold and the typoSymbols list.
 * @returns The chosen token(s); usually a single-element array.
 */
const selectBestTokens = (originalToken, altToken, { similarityThreshold, typoSymbols }) => {
  // A gap on either side leaves only one candidate to keep.
  if (originalToken === null) return [altToken];
  if (altToken === null) return [originalToken];
  // Identical after normalization: keep the original spelling.
  const normalizedOriginal = sanitizeArabic(originalToken);
  const normalizedAlt = sanitizeArabic(altToken);
  if (normalizedOriginal === normalizedAlt) return [originalToken];
  // Footnote-specific selection rules take precedence over similarity.
  const embeddedChoice = handleFootnoteSelection(originalToken, altToken);
  if (embeddedChoice) return embeddedChoice;
  const standaloneChoice = handleStandaloneFootnotes(originalToken, altToken);
  if (standaloneChoice) return standaloneChoice;
  // A recognized typo symbol on either side wins outright.
  if (typoSymbols.includes(originalToken) || typoSymbols.includes(altToken)) {
    const matchedSymbol = typoSymbols.find((symbol) => symbol === originalToken || symbol === altToken);
    return matchedSymbol ? [matchedSymbol] : [originalToken];
  }
  // Otherwise vote by similarity: a close-enough original is trusted.
  const keepOriginal = calculateSimilarity(normalizedOriginal, normalizedAlt) > similarityThreshold;
  return [keepOriginal ? originalToken : altToken];
};
2592
/**
 * Post-processes a token stream: collapses near-duplicate neighbours and
 * fuses adjacent footnote fragments.
 *
 * Walking left to right, each token is compared with the last token kept.
 * Near-duplicates (per areSimilarAfterNormalization) are dropped, preferring
 * the shorter spelling; handleFootnoteFusion may instead merge a token into
 * the previously kept one.
 *
 * @param tokens - Tokens to clean up.
 * @param highSimilarityThreshold - Similarity (0.0 to 1.0) above which two
 *   neighbouring tokens count as duplicates.
 * @returns Tokens with duplicates removed and footnotes fused.
 */
const removeDuplicateTokens = (tokens, highSimilarityThreshold) => {
  if (tokens.length === 0) return tokens;
  const kept = [];
  for (const token of tokens) {
    const last = kept.at(-1);
    if (last === undefined) {
      // Nothing kept yet — the first token always survives.
      kept.push(token);
      continue;
    }
    if (areSimilarAfterNormalization(last, token, highSimilarityThreshold)) {
      // Near-duplicate neighbours: keep whichever spelling is shorter.
      if (token.length < last.length) kept[kept.length - 1] = token;
    } else if (!handleFootnoteFusion(kept, last, token)) {
      // Not fused into the previous token either, so append it.
      kept.push(token);
    }
  }
  return kept;
};
2619
/**
 * Aligns two OCR readings of the same text and merges them into a single
 * corrected string. Tokenizes both inputs, aligns the token sequences with
 * the Needleman-Wunsch algorithm, picks the best token from each aligned
 * pair, then deduplicates the result.
 *
 * @param originalText - Original OCR text that may contain typos.
 * @param altText - Reference text from the alternate OCR for comparison.
 * @param options - Configuration options for alignment and selection.
 * @returns Corrected text with typos fixed.
 */
const processTextAlignment = (originalText, altText, options) => {
  const { typoSymbols, similarityThreshold, highSimilarityThreshold } = options;
  const originalTokens = tokenizeText(originalText, typoSymbols);
  const altTokens = tokenizeText(altText, typoSymbols);
  const alignedPairs = alignTokenSequences(originalTokens, altTokens, typoSymbols, similarityThreshold);
  const merged = alignedPairs.flatMap(([original, alt]) => selectBestTokens(original, alt, options));
  return removeDuplicateTokens(merged, highSimilarityThreshold).join(" ");
};
2632
/**
 * Convenience wrapper around {@link processTextAlignment} that accepts
 * partial options, filling in the default thresholds
 * (highSimilarityThreshold 0.8, similarityThreshold 0.6).
 *
 * @param original - The source text that may contain typographical errors.
 * @param correction - The reference text used to correct the {@link original} text.
 * @param options - Partial typo correction options combined with required typo symbols.
 * @returns The corrected text generated from the alignment process.
 */
const fixTypo = (original, correction, options) => {
  const { highSimilarityThreshold = 0.8, similarityThreshold = 0.6, typoSymbols } = options;
  return processTextAlignment(original, correction, {
    highSimilarityThreshold,
    similarityThreshold,
    typoSymbols
  });
};
2647
+
2648
+ //#endregion
2649
+ export { BRACKETS, CLOSE_BRACKETS, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
3
2650
  //# sourceMappingURL=index.js.map