npm - baburchi - Versions diffs - 1.7.2 → 1.8.0 - Mend

baburchi 1.7.2 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.js (changed)

@@ -86,10 +86,11 @@ const CHAR_TA_MARBUTAH = 1577;
 const CHAR_MADDA_ABOVE = 1619;
 const CHAR_HAMZA_ABOVE_MARK = 1620;
 const CHAR_HAMZA_BELOW_MARK = 1621;
+const CHAR_DAGGER_ALIF = 1648;
 let sharedBuffer = new Uint16Array(2048);
 const decoder = new TextDecoder("utf-16le");
 const isDiacritic = (code) => {
-	return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code === 1648 || code >= 1750 && code <= 1773;
+	return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code === CHAR_DAGGER_ALIF || code >= 1750 && code <= 1773;
 };
 const isZeroWidth = (code) => {
 	return code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
@@ -133,264 +134,292 @@ const resolveTatweelMode = (presetValue, override) => {
 	return override;
 };
 /**
-* Internal sanitization logic that applies all transformations to a single string.
-* Uses single-pass character transformation for maximum performance when possible.
-* This function assumes all options have been pre-resolved for maximum performance.
+* Emits a single space into the output buffer, respecting the collapse-whitespace flag.
+*
+* @param ctx - Mutable loop state; `bufIdx` and `lastWasSpace` may be updated.
+* @param collapseWS - When true, suppress consecutive spaces and leading spaces.
+*/
+const emitSpace = (ctx, collapseWS) => {
+	if (collapseWS) {
+		if (!ctx.lastWasSpace && ctx.bufIdx > 0) {
+			ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
+			ctx.lastWasSpace = true;
+		}
+	} else {
+		ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
+		ctx.lastWasSpace = false;
+	}
+};
+/**
+* Applies letter-level normalization to a single code point:
+* alif variants → bare alif, alif maqsurah → ya, ta marbutah → ha.
+*
+* @param code - Input code point.
+* @param normAlif - Whether to collapse alif variants.
+* @param maqToYa - Whether to replace ى with ي.
+* @param taToHa - Whether to replace ة with ه.
+* @returns The (possibly mapped) output code point.
+*/
+const normalizeCode = (code, normAlif, maqToYa, taToHa) => {
+	let out = code;
+	if (normAlif) {
+		if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) out = CHAR_ALIF;
+	}
+	if (maqToYa && code === CHAR_ALIF_MAQSURAH) out = CHAR_YA;
+	if (taToHa && code === CHAR_TA_MARBUTAH) out = CHAR_HA;
+	return out;
+};
+/**
+* Handles ASCII and control whitespace (code ≤ 32).
+* Returns `false` for non-whitespace characters.
+*/
+const processWhitespace = (code, ctx, opts) => {
+	if (code > 32) return false;
+	if (opts.lettersOnly) return true;
+	if (opts.collapseWS) {
+		if (!ctx.lastWasSpace && ctx.bufIdx > 0) {
+			ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
+			ctx.lastWasSpace = true;
+		}
+	} else {
+		ctx.buffer[ctx.bufIdx++] = code;
+		ctx.lastWasSpace = false;
+	}
+	return true;
+};
+/**
+* Performs inline NFC canonical composition for Arabic combining marks.
+* Only handles the five compositions relevant to Arabic OCR:
+*   ا + ◌ٓ → آ,  ا + ◌ٔ → أ,  ا + ◌ٕ → إ,  و + ◌ٔ → ؤ,  ي + ◌ٔ → ئ
+*
+* Called only when `nfc` is enabled. Returns `false` when the mark cannot be
+* composed (it will then be emitted as a standalone character by the fallthrough).
+*/
+const processNfc = (code, ctx) => {
+	if (code !== CHAR_MADDA_ABOVE && code !== CHAR_HAMZA_ABOVE_MARK && code !== CHAR_HAMZA_BELOW_MARK) return false;
+	const prevIdx = ctx.bufIdx - 1;
+	if (prevIdx < 0) return false;
+	const prev = ctx.buffer[prevIdx];
+	let composed = 0;
+	if (prev === CHAR_ALIF) if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
+	else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
+	else composed = CHAR_ALIF_HAMZA_BELOW;
+	else if (code === CHAR_HAMZA_ABOVE_MARK) {
+		if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
+		else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
+	}
+	if (composed === 0) return false;
+	ctx.buffer[prevIdx] = composed;
+	return true;
+};
+/**
+* Strips zero-width controls (U+200B–U+FEFF range).
+* When `zwAsSpace` is set, emits a space in place of the removed character.
+* Called only when `stripZW` is enabled.
+*/
+const processZeroWidth = (code, ctx, opts) => {
+	if (!isZeroWidth(code)) return false;
+	if (opts.zwAsSpace) emitSpace(ctx, opts.collapseWS);
+	return true;
+};
+/**
+* Removes the Hijri date marker "هـ" (or bare "ه" when tatweel has already been
+* stripped) when it immediately follows a date-like token (digits/slashes/hyphens).
+*
+* May advance `ctx.i` by one to also consume an attached tatweel.
+* Called only when `removeHijri` is enabled.
+*/
+const processHijriMarker = (code, ctx, opts) => {
+	if (code !== CHAR_HA) return false;
+	const { text, len } = ctx;
+	const origI = ctx.i;
+	let nextIdx = origI + 1;
+	const hasTatweel = nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL;
+	if (hasTatweel) nextIdx++;
+	let isBoundary = nextIdx >= len;
+	if (!isBoundary) {
+		const nextCode = text.charCodeAt(nextIdx);
+		isBoundary = nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45;
+	}
+	if (!isBoundary) return false;
+	let backIdx = origI - 1;
+	while (backIdx >= 0 && (text.charCodeAt(backIdx) <= 32 || isZeroWidth(text.charCodeAt(backIdx)))) backIdx--;
+	if (backIdx < 0 || !isDigit(text.charCodeAt(backIdx))) return false;
+	if (hasTatweel) ctx.i = origI + 1;
+	return true;
+};
+/**
+* Strips tatweel (ـ U+0640) according to the resolved mode.
+*
+* - `'all'`: always remove.
+* - `'safe'`: remove unless immediately preceded by a digit or ه
+*   (preserves date suffixes like "هـ" and list markers like "4ـ").
+*
+* Called only when `tatweelMode !== false`.
+*/
+const processTatweel = (code, ctx, tatweelMode) => {
+	if (code !== CHAR_TATWEEL) return false;
+	if (tatweelMode === "all") return true;
+	let backIdx = ctx.bufIdx - 1;
+	while (backIdx >= 0 && ctx.buffer[backIdx] === CHAR_SPACE) backIdx--;
+	if (backIdx < 0) return true;
+	const prev = ctx.buffer[backIdx];
+	return !(isDigit(prev) || prev === CHAR_HA);
+};
+/**
+* Replaces Latin letters, Western digits, and recognised symbols with a space.
+* Also collapses runs of double-slashes ("//") common in URLs.
+*
+* Called only when `stripNoise` is enabled and letter-filtering is not already
+* handling cleanup (`lettersSpacesOnly` / `lettersOnly` take care of it themselves).
+*
+* May advance `ctx.i` to consume a run of slashes.
+*/
+const processNoise = (code, ctx, opts) => {
+	if (opts.lettersSpacesOnly || opts.lettersOnly) return false;
+	if (isLatinOrDigit(code) || isSymbol(code)) {
+		emitSpace(ctx, opts.collapseWS);
+		return true;
+	}
+	if (code === 47 && ctx.i + 1 < ctx.len && ctx.text.charCodeAt(ctx.i + 1) === 47) {
+		while (ctx.i + 1 < ctx.len && ctx.text.charCodeAt(ctx.i + 1) === 47) ctx.i++;
+		emitSpace(ctx, opts.collapseWS);
+		return true;
+	}
+	return false;
+};
+/**
+* Matches footnote pattern 1: `(¬٣)` or `(¬٣ )` — a negation sign followed by
+* Arabic-Indic digits and an optional space before the closing parenthesis.
+*
+* @param text - Full source string.
+* @param len - Length of `text`.
+* @param startPos - Index of the first character **after** ¬.
+* @returns Index of the closing `)` on match, or -1 on no match.
+*/
+const matchFootnotePattern1 = (text, len, startPos) => {
+	let pos = startPos;
+	let hasDigits = false;
+	while (pos < len && text.charCodeAt(pos) >= 1632 && text.charCodeAt(pos) <= 1641) {
+		hasDigits = true;
+		pos++;
+	}
+	if (!hasDigits || pos >= len) return -1;
+	const closing = text.charCodeAt(pos);
+	if (closing === 41) return pos;
+	if (closing === CHAR_SPACE && pos + 1 < len && text.charCodeAt(pos + 1) === 41) return pos + 1;
+	return -1;
+};
+/**
+* Matches footnote pattern 2: `(٣)` or `(٣ X)` — a single Arabic-Indic digit,
+* optionally followed by a space and one Arabic letter, then a closing parenthesis.
+*
+* @param text - Full source string.
+* @param len - Length of `text`.
+* @param digitPos - Index of the Arabic-Indic digit character.
+* @returns Index of the closing `)` on match, or -1 on no match.
+*/
+const matchFootnotePattern2 = (text, len, digitPos) => {
+	const afterDigit = digitPos + 1;
+	if (afterDigit >= len) return -1;
+	const c2 = text.charCodeAt(afterDigit);
+	if (c2 === 41) return afterDigit;
+	if (c2 !== CHAR_SPACE) return -1;
+	const afterSpace = afterDigit + 1;
+	if (afterSpace >= len) return -1;
+	const c3 = text.charCodeAt(afterSpace);
+	if (c3 < 1536 || c3 > 1791) return -1;
+	const closingIdx = afterSpace + 1;
+	if (closingIdx >= len || text.charCodeAt(closingIdx) !== 41) return -1;
+	return closingIdx;
+};
+/**
+* Removes inline footnote references of the form `(٣)`, `(٣ م)`, or `(¬٣)`.
+* Replaces the entire token (including parens) with a single space.
+*
+* Called only when `removeFootnotes` is enabled and letter-filtering is inactive.
+* May advance `ctx.i` past the consumed token.
+*/
+const processFootnote = (code, ctx, opts) => {
+	if (opts.lettersSpacesOnly || opts.lettersOnly || code !== 40) return false;
+	const { text, len } = ctx;
+	let nextIdx = ctx.i + 1;
+	if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
+	if (nextIdx >= len) return false;
+	const c1 = text.charCodeAt(nextIdx);
+	let endIdx = -1;
+	if (c1 === 172) endIdx = matchFootnotePattern1(text, len, nextIdx + 1);
+	else if (c1 >= 1632 && c1 <= 1641) endIdx = matchFootnotePattern2(text, len, nextIdx);
+	if (endIdx < 0) return false;
+	ctx.i = endIdx;
+	emitSpace(ctx, opts.collapseWS);
+	return true;
+};
+/**
+* Handles letter filtering for the `lettersSpacesOnly` / `lettersOnly` modes.
+* Non-Arabic characters are either dropped (lettersOnly) or replaced with a space.
+* Arabic letters are emitted after normalization.
+*
+* Returns `false` when neither mode is active, allowing the default emit to run.
+*/
+const processLetterFilter = (code, ctx, opts) => {
+	if (!opts.lettersSpacesOnly && !opts.lettersOnly) return false;
+	if (!isArabicLetter(code)) {
+		if (!opts.lettersOnly) emitSpace(ctx, opts.collapseWS);
+		return true;
+	}
+	ctx.buffer[ctx.bufIdx++] = normalizeCode(code, opts.normAlif, opts.maqToYa, opts.taToHa);
+	ctx.lastWasSpace = false;
+	return true;
+};
+/**
+* Internal sanitization logic. Iterates once over the source string, dispatching
+* each character through a series of focused step handlers. All options must be
+* pre-resolved; no allocations occur beyond the context object and any buffer growth.
 */
 const applySanitization = (input, options) => {
 	if (!input) return "";
-	const { nfc, stripZW, zwAsSpace, removeHijri, removeDia, tatweelMode, normAlif, maqToYa, taToHa, removeFootnotes, lettersSpacesOnly, stripNoise, lettersOnly, collapseWS, doTrim } = options;
-	/**
-	* NFC Normalization (Fast Path)
-	*
-	* `String.prototype.normalize('NFC')` is extremely expensive under high throughput.
-	* For Arabic OCR text, the main canonical compositions we care about are:
-	* - ا + ◌ٓ (U+0653) → آ
-	* - ا + ◌ٔ (U+0654) → أ
-	* - ا + ◌ٕ (U+0655) → إ
-	* - و + ◌ٔ (U+0654) → ؤ
-	* - ي + ◌ٔ (U+0654) → ئ
-	*
-	* We implement these compositions inline during the main loop, avoiding full NFC
-	* normalization in the common case while preserving behavior needed by our sanitizer.
-	*/
+	const { nfc, stripZW, removeHijri, removeDia, tatweelMode, stripNoise, removeFootnotes, normAlif, maqToYa, taToHa, doTrim } = options;
 	const text = input;
 	const len = text.length;
 	if (len > sharedBuffer.length) sharedBuffer = new Uint16Array(len + 1024);
-	const buffer = sharedBuffer;
-	let bufIdx = 0;
-	let lastWasSpace = false;
+	const ctx = {
+		buffer: sharedBuffer,
+		bufIdx: 0,
+		i: 0,
+		lastWasSpace: false,
+		len,
+		text
+	};
 	let start = 0;
 	if (doTrim) while (start < len && text.charCodeAt(start) <= 32) start++;
 	for (let i = start; i < len; i++) {
+		ctx.i = i;
 		const code = text.charCodeAt(i);
-		if (code <= 32) {
-			if (lettersOnly) continue;
-			if (collapseWS) {
-				if (!lastWasSpace && bufIdx > 0) {
-					buffer[bufIdx++] = CHAR_SPACE;
-					lastWasSpace = true;
-				}
-			} else {
-				buffer[bufIdx++] = CHAR_SPACE;
-				lastWasSpace = false;
-			}
-			continue;
-		}
-		if (nfc) {
-			if (code === CHAR_MADDA_ABOVE || code === CHAR_HAMZA_ABOVE_MARK || code === CHAR_HAMZA_BELOW_MARK) {
-				const prevIdx = bufIdx - 1;
-				if (prevIdx >= 0) {
-					const prev = buffer[prevIdx];
-					let composed = 0;
-					if (prev === CHAR_ALIF) if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
-					else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
-					else composed = CHAR_ALIF_HAMZA_BELOW;
-					else if (code === CHAR_HAMZA_ABOVE_MARK) {
-						if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
-						else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
-					}
-					if (composed !== 0) {
-						buffer[prevIdx] = composed;
-						continue;
-					}
-				}
-			}
-		}
-		if (stripZW && isZeroWidth(code)) {
-			if (zwAsSpace) if (collapseWS) {
-				if (!lastWasSpace && bufIdx > 0) {
-					buffer[bufIdx++] = CHAR_SPACE;
-					lastWasSpace = true;
-				}
-			} else {
-				buffer[bufIdx++] = CHAR_SPACE;
-				lastWasSpace = false;
-			}
+		if (processWhitespace(code, ctx, options)) continue;
+		if (nfc && processNfc(code, ctx)) continue;
+		if (stripZW && processZeroWidth(code, ctx, options)) continue;
+		if (removeHijri && processHijriMarker(code, ctx, options)) {
+			i = ctx.i;
 			continue;
 		}
-		if (removeHijri && code === CHAR_HA) {
-			let nextIdx = i + 1;
-			if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL) nextIdx++;
-			let isBoundary = false;
-			if (nextIdx >= len) isBoundary = true;
-			else {
-				const nextCode = text.charCodeAt(nextIdx);
-				if (nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45) isBoundary = true;
-			}
-			if (isBoundary) {
-				let backIdx = i - 1;
-				while (backIdx >= 0) {
-					const c = text.charCodeAt(backIdx);
-					if (c <= 32 || isZeroWidth(c)) backIdx--;
-					else break;
-				}
-				if (backIdx >= 0 && isDigit(text.charCodeAt(backIdx))) {
-					if (nextIdx > i + 1) i++;
-					continue;
-				}
-			}
-		}
 		if (removeDia && isDiacritic(code)) continue;
-		if (code === CHAR_TATWEEL) {
-			if (tatweelMode === "all") continue;
-			if (tatweelMode === "safe") {
-				let backIdx = bufIdx - 1;
-				while (backIdx >= 0 && buffer[backIdx] === CHAR_SPACE) backIdx--;
-				if (backIdx >= 0) {
-					const prev = buffer[backIdx];
-					if (isDigit(prev) || prev === CHAR_HA) {} else continue;
-				} else continue;
-			}
-		}
-		if (stripNoise && !lettersSpacesOnly && !lettersOnly) {
-			if (isLatinOrDigit(code) || isSymbol(code)) {
-				if (collapseWS) {
-					if (!lastWasSpace && bufIdx > 0) {
-						buffer[bufIdx++] = CHAR_SPACE;
-						lastWasSpace = true;
-					}
-				} else {
-					buffer[bufIdx++] = CHAR_SPACE;
-					lastWasSpace = false;
-				}
-				continue;
-			}
-			if (code === 47 && i + 1 < len && text.charCodeAt(i + 1) === 47) {
-				while (i + 1 < len && text.charCodeAt(i + 1) === 47) i++;
-				if (collapseWS) {
-					if (!lastWasSpace && bufIdx > 0) {
-						buffer[bufIdx++] = CHAR_SPACE;
-						lastWasSpace = true;
-					}
-				} else {
-					buffer[bufIdx++] = CHAR_SPACE;
-					lastWasSpace = false;
-				}
-				continue;
-			}
-		}
-		if (removeFootnotes && !lettersSpacesOnly && !lettersOnly && code === 40) {
-			let nextIdx = i + 1;
-			if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
-			if (nextIdx < len) {
-				const c1 = text.charCodeAt(nextIdx);
-				if (c1 === 172) {
-					nextIdx++;
-					let hasDigits = false;
-					while (nextIdx < len) {
-						const c = text.charCodeAt(nextIdx);
-						if (c >= 1632 && c <= 1641) {
-							hasDigits = true;
-							nextIdx++;
-						} else break;
-					}
-					if (hasDigits && nextIdx < len) {
-						if (text.charCodeAt(nextIdx) === 41) {
-							i = nextIdx;
-							if (collapseWS) {
-								if (!lastWasSpace && bufIdx > 0) {
-									buffer[bufIdx++] = CHAR_SPACE;
-									lastWasSpace = true;
-								}
-							} else {
-								buffer[bufIdx++] = CHAR_SPACE;
-								lastWasSpace = false;
-							}
-							continue;
-						}
-						if (text.charCodeAt(nextIdx) === CHAR_SPACE) {
-							nextIdx++;
-							if (nextIdx < len && text.charCodeAt(nextIdx) === 41) {
-								i = nextIdx;
-								if (collapseWS) {
-									if (!lastWasSpace && bufIdx > 0) {
-										buffer[bufIdx++] = CHAR_SPACE;
-										lastWasSpace = true;
-									}
-								} else {
-									buffer[bufIdx++] = CHAR_SPACE;
-									lastWasSpace = false;
-								}
-								continue;
-							}
-						}
-					}
-				} else if (c1 >= 1632 && c1 <= 1641) {
-					let tempIdx = nextIdx + 1;
-					let matched = false;
-					if (tempIdx < len) {
-						const c2 = text.charCodeAt(tempIdx);
-						if (c2 === 41) {
-							matched = true;
-							tempIdx++;
-						} else if (c2 === CHAR_SPACE) {
-							tempIdx++;
-							if (tempIdx < len) {
-								const c3 = text.charCodeAt(tempIdx);
-								if (c3 >= 1536 && c3 <= 1791) {
-									tempIdx++;
-									if (tempIdx < len && text.charCodeAt(tempIdx) === 41) {
-										matched = true;
-										tempIdx++;
-									}
-								}
-							}
-						}
-					}
-					if (matched) {
-						i = tempIdx - 1;
-						if (collapseWS) {
-							if (!lastWasSpace && bufIdx > 0) {
-								buffer[bufIdx++] = CHAR_SPACE;
-								lastWasSpace = true;
-							}
-						} else {
-							buffer[bufIdx++] = CHAR_SPACE;
-							lastWasSpace = false;
-						}
-						continue;
-					}
-				}
-			}
-		}
-		if (lettersSpacesOnly || lettersOnly) {
-			if (!isArabicLetter(code)) {
-				if (lettersOnly) continue;
-				if (collapseWS) {
-					if (!lastWasSpace && bufIdx > 0) {
-						buffer[bufIdx++] = CHAR_SPACE;
-						lastWasSpace = true;
-					}
-				} else {
-					buffer[bufIdx++] = CHAR_SPACE;
-					lastWasSpace = false;
-				}
-				continue;
-			}
-			let outCode$1 = code;
-			if (normAlif) {
-				if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode$1 = CHAR_ALIF;
-			}
-			if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode$1 = CHAR_YA;
-			if (taToHa && code === CHAR_TA_MARBUTAH) outCode$1 = CHAR_HA;
-			buffer[bufIdx++] = outCode$1;
-			lastWasSpace = false;
+		if (tatweelMode !== false && processTatweel(code, ctx, tatweelMode)) continue;
+		if (stripNoise && processNoise(code, ctx, options)) {
+			i = ctx.i;
 			continue;
 		}
-		let outCode = code;
-		if (normAlif) {
-			if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode = CHAR_ALIF;
+		if (removeFootnotes && processFootnote(code, ctx, options)) {
+			i = ctx.i;
+			continue;
 		}
-		if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode = CHAR_YA;
-		if (taToHa && code === CHAR_TA_MARBUTAH) outCode = CHAR_HA;
-		buffer[bufIdx++] = outCode;
-		lastWasSpace = false;
+		if (processLetterFilter(code, ctx, options)) continue;
+		ctx.buffer[ctx.bufIdx++] = normalizeCode(code, normAlif, maqToYa, taToHa);
+		ctx.lastWasSpace = false;
 	}
-	if (doTrim && lastWasSpace && bufIdx > 0) bufIdx--;
-	if (bufIdx === 0) return "";
-	const resultView = buffer.subarray(0, bufIdx);
-	return decoder.decode(resultView);
+	if (doTrim && ctx.lastWasSpace && ctx.bufIdx > 0) ctx.bufIdx--;
+	if (ctx.bufIdx === 0) return "";
+	return decoder.decode(ctx.buffer.subarray(0, ctx.bufIdx));
 };
 /**
 * Resolves options from a preset or custom options object.
@@ -449,7 +478,58 @@ function sanitizeArabic(input, optionsOrPreset = "search") {
 	if (!input) return "";
 	return applySanitization(input, resolveOptions(optionsOrPreset));
 }
+const sanitizeQuranBase = createArabicSanitizer({
+	base: "none",
+	collapseWhitespace: true,
+	lettersAndSpacesOnly: true,
+	nfc: true,
+	replaceAlifMaqsurah: false,
+	replaceTaMarbutahWithHa: false,
+	stripDiacritics: true,
+	stripFootnotes: true,
+	stripTatweel: "all",
+	stripZeroWidth: true,
+	trim: true
+});
+const normalizeQuranOrthography = (input) => {
+	if (!input) return "";
+	let output = "";
+	let lastBaseCode = 0;
+	for (let index = 0; index < input.length; index += 1) {
+		const code = input.charCodeAt(index);
+		if (code === CHAR_ALIF_WASLA) {
+			output += String.fromCharCode(CHAR_ALIF);
+			lastBaseCode = CHAR_ALIF;
+			continue;
+		}
+		if (code === CHAR_DAGGER_ALIF) {
+			if (lastBaseCode !== 0 && lastBaseCode !== 1584 && lastBaseCode !== CHAR_HA && lastBaseCode !== CHAR_ALIF && lastBaseCode !== CHAR_WAW && lastBaseCode !== CHAR_YA && lastBaseCode !== CHAR_ALIF_MAQSURAH) {
+				output += String.fromCharCode(CHAR_ALIF);
+				lastBaseCode = CHAR_ALIF;
+			}
+			continue;
+		}
+		output += input[index];
+		if (!isDiacritic(code) && !isZeroWidth(code)) lastBaseCode = code;
+	}
+	return output;
+};
+/**
+* Produces a conservative Qur'an-specific search surface.
+*
+* This helper is intentionally narrower than the generic `search` preset:
+* it preserves standard hamza forms and alif maqsurah while normalizing
+* Qur'anic orthography that would otherwise damage lexical identity in FTS.
+*
+* Current behavior:
+* - maps alif wasla (`ٱ`) to bare alif (`ا`)
+* - expands dagger alif (`ٰ`) only in contexts where the imla'i form needs an alif
+* - strips tashkeel, tatweel, footnotes, zero-width chars, and non-letter noise
+* - keeps only Arabic letters and spaces
+*/
+const sanitizeQuranForSearch = (input) => {
+	return sanitizeQuranBase(normalizeQuranOrthography(input)).replace(/آ/gu, "ا").replace(/ىء/gu, "يء");
+};
 //#endregion
 //#region src/utils/levenshthein.ts
 /**
@@ -556,7 +636,6 @@ const boundedLevenshtein = (a, b, maxDist) => {
 	}
 	return prev[b.length] <= maxDist ? prev[b.length] : big;
 };
 //#endregion
 //#region src/utils/similarity.ts
 const ALIGNMENT_SCORES = {
@@ -731,7 +810,6 @@ const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold)
 	}
 	return backtrackAlignment(matrix, tokensA, tokensB);
 };
 //#endregion
 //#region src/alignment.ts
 /**
@@ -805,7 +883,6 @@ const processAlignmentTarget = (targetLine, segmentLines, segmentIndex) => {
 		segmentsConsumed: 2
 	};
 };
 //#endregion
 //#region src/balance.ts
 /**
@@ -832,8 +909,8 @@ const checkQuoteBalance = (str) => {
 		quoteCount++;
 		lastQuoteIndex = i;
 	}
-	const isBalanced$1 = quoteCount % 2 === 0;
-	if (!isBalanced$1 && lastQuoteIndex !== -1) errors.push({
+	const isBalanced = quoteCount % 2 === 0;
+	if (!isBalanced && lastQuoteIndex !== -1) errors.push({
 		char: "\"",
 		index: lastQuoteIndex,
 		reason: "unmatched",
@@ -841,15 +918,15 @@ const checkQuoteBalance = (str) => {
 	});
 	return {
 		errors,
-		isBalanced: isBalanced$1
+		isBalanced
 	};
 };
 /** Mapping of opening brackets to their corresponding closing brackets */
 const BRACKETS = {
-	"«": "»",
 	"(": ")",
 	"[": "]",
-	"{": "}"
+	"{": "}",
+	"«": "»"
 };
 /** Set of all opening bracket characters */
 const OPEN_BRACKETS = new Set([
@@ -1049,7 +1126,6 @@ const areBracketsBalanced = (str) => {
 const isBalanced = (str) => {
 	return checkBalance(str).isBalanced;
 };
 //#endregion
 //#region src/utils/textUtils.ts
 const INTAHA_ACTUAL = "اهـ";
@@ -1220,7 +1296,6 @@ const standardizeHijriSymbol = (text) => {
 const standardizeIntahaSymbol = (text) => {
 	return text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu, `$1${INTAHA_ACTUAL}`);
 };
 //#endregion
 //#region src/footnotes.ts
 const INVALID_FOOTNOTE = "()";
@@ -1266,9 +1341,9 @@ const numberToArabic = (num) => {
 */
 const ocrToArabic = (char) => {
 	return {
+		".": "٠",
 		"1": "١",
 		"9": "٩",
-		".": "٠",
 		O: "٥",
 		o: "٥",
 		V: "٧",
@@ -1418,7 +1493,6 @@ const correctReferences = (lines) => {
 		};
 	});
 };
 //#endregion
 //#region src/utils/ahocorasick.ts
 /**
@@ -1526,7 +1600,6 @@ const buildAhoCorasick = (patterns) => {
 	ac.build();
 	return ac;
 };
 //#endregion
 //#region src/utils/constants.ts
 const DEFAULT_POLICY = {
@@ -1539,7 +1612,6 @@ const DEFAULT_POLICY = {
 	q: 4,
 	seamLen: 512
 };
 //#endregion
 //#region src/utils/fuzzyUtils.ts
 const SEAM_GAP_CEILING = 200;
@@ -1776,7 +1848,6 @@ const findBestMatch = (windows, excerpt, acceptance) => {
 		dist: best
 	};
 };
 //#endregion
 //#region src/utils/qgram.ts
 /**
@@ -1870,7 +1941,6 @@ var QGramIndex = class {
 		return this.map.get(gram);
 	}
 };
 //#endregion
 //#region src/fuzzy.ts
 /**
@@ -2334,7 +2404,6 @@ function findMatchesAll(pages, excerpts, policy = {}) {
 	if (cfg.enableFuzzy) recordFuzzyMatches(excerptsN, pagesN, hitsByExcerpt, cfg);
 	return hitsByExcerpt.map((hits) => sortMatches(hits));
 }
 //#endregion
 //#region src/noise.ts
 /**
@@ -2562,7 +2631,6 @@ function isValidArabicContent(charStats, textLength) {
 	if (charStats.arabicCount >= 1 && textLength <= 5 && charStats.punctuationCount <= 1) return true;
 	return false;
 }
 //#endregion
 //#region src/typos.ts
 /**
@@ -2644,7 +2712,7 @@ const fixTypo = (original, correction, { highSimilarityThreshold = .8, similarit
 		typoSymbols
 	});
 };
 //#endregion
-export { BRACKETS, CLOSE_BRACKETS, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
+export { BRACKETS, CLOSE_BRACKETS, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, sanitizeQuranForSearch, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
 //# sourceMappingURL=index.js.map