npm - flappa-doormal - Versions diffs - 2.17.1 → 2.18.0 - Mend

flappa-doormal 2.17.1 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.mjs (changed)

@@ -1,141 +1,25 @@
-//#region src/utils/textUtils.ts
+//#region src/segmentation/tokens.ts
 /**
-* Normalizes line endings to Unix-style (`\n`).
-*
-* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
-* for consistent pattern matching across platforms.
+* Arabic base letters used by low-level dictionary-style regex helpers.
 *
-* @param content - Raw content with potentially mixed line endings
-* @returns Content with all line endings normalized to `\n`
+* This is intentionally broader than `{{harf}}`:
+* - includes standalone hamza `ء`
+* - stays as a raw regex fragment rather than a template token
 */
-const normalizeLineEndings = (content) => {
-	return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
-};
+const ARABIC_BASE_LETTER_CLASS = "[ء-غف-ي]";
 /**
-* Escapes regex metacharacters (parentheses and brackets) in template patterns,
-* but preserves content inside `{{...}}` token delimiters.
-*
-* This allows users to write intuitive patterns like `({{harf}}):` instead of
-* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
-* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
-*
-* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
-* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
-*
-* @example
-* escapeTemplateBrackets('({{harf}}): ')
-* // → '\\({{harf}}\\): '
-*
-* @example
-* escapeTemplateBrackets('[{{raqm}}] ')
-* // → '\\[{{raqm}}\\] '
-*
-* @example
-* escapeTemplateBrackets('{{harf}}')
-* // → '{{harf}}' (unchanged - no brackets outside tokens)
+* Arabic combining marks / annotation signs used by low-level regex helpers.
 */
-const escapeTemplateBrackets = (pattern) => {
-	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
-};
+const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
 /**
-* Character class matching all Arabic diacritics (Tashkeel/Harakat).
-*
-* Includes the following diacritical marks:
-* - U+064B: ً (fathatan - double fatha)
-* - U+064C: ٌ (dammatan - double damma)
-* - U+064D: ٍ (kasratan - double kasra)
-* - U+064E: َ (fatha - short a)
-* - U+064F: ُ (damma - short u)
-* - U+0650: ِ (kasra - short i)
-* - U+0651: ّ (shadda - gemination)
-* - U+0652: ْ (sukun - no vowel)
-*
-* @internal
+* A single Arabic base letter followed by zero or more combining marks.
 */
-const DIACRITICS_CLASS = "[ًٌٍَُِّْ]";
+const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = `${ARABIC_BASE_LETTER_CLASS}${ARABIC_MARKS_CLASS}*`;
 /**
-* Groups of equivalent Arabic characters.
-*
-* Characters within the same group are considered equivalent for matching purposes.
-* This handles common variations in Arabic text where different characters are
-* used interchangeably or have the same underlying meaning.
-*
-* Equivalence groups:
-* - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
-* - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
-* - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
-*
-* @internal
+* One or more Arabic letters, where each letter may carry combining marks.
 */
-const EQUIV_GROUPS = [
-	[
-		"ا",
-		"آ",
-		"أ",
-		"إ"
-	],
-	["ة", "ه"],
-	["ى", "ي"]
-];
-/**
-* Escapes a string for safe inclusion in a regular expression.
-*
-* Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
-*
-* @param s - Any string to escape
-* @returns String with regex metacharacters escaped
-*
-* @example
-* escapeRegex('hello.world')   // → 'hello\\.world'
-* escapeRegex('[test]')        // → '\\[test\\]'
-* escapeRegex('a+b*c?')        // → 'a\\+b\\*c\\?'
-*/
-const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-const getEquivClass = (ch) => {
-	const group = EQUIV_GROUPS.find((g) => g.includes(ch));
-	return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
-};
-const normalizeArabicLight = (str) => {
-	return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
-};
-const makeDiacriticInsensitive = (text) => {
-	const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
-	return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
-};
-const isCombiningMarkOrSelector = (char) => {
-	if (!char) return false;
-	return /\p{M}/u.test(char) || char === "︎" || char === "️";
-};
-const isJoiner = (char) => char === "‌" || char === "‍";
-/**
-* Ensures the position does not split a grapheme cluster (surrogate pairs,
-* combining marks, or zero-width joiners / variation selectors).
-*
-* This is only used as a last-resort fallback when we are forced to split
-* near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
-*/
-const adjustForUnicodeBoundary = (content, position) => {
-	let adjusted = position;
-	while (adjusted > 0) {
-		const high = content.charCodeAt(adjusted - 1);
-		const low = content.charCodeAt(adjusted);
-		if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
-			adjusted -= 1;
-			continue;
-		}
-		const nextChar = content[adjusted];
-		const prevChar = content[adjusted - 1];
-		if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
-			adjusted -= 1;
-			continue;
-		}
-		break;
-	}
-	return adjusted;
-};
-//#endregion
-//#region src/segmentation/tokens.ts
+const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = `(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN})+`;
+const ARABIC_SPACED_CODE_ATOM = `[أ-غف-ي]${ARABIC_MARKS_CLASS}*`;
 const RUMUZ_ATOM = `(?:${[
 	"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
 	"خت",
@@ -166,15 +50,25 @@ const RUMUZ_ATOM = `(?:${[
 ].join("|")})`;
 const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
 const BASE_TOKENS = {
+	/** Chapter marker (باب). */
 	bab: "باب",
+	/** Basmala (بسم الله). Also matches ﷽. */
 	basmalah: ["بسم الله", "﷽"].join("|"),
+	/** Bullet point variants: `•`, `*`, `°`. */
 	bullet: "[•*°]",
+	/** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
 	dash: "[-–—ـ]",
+	/** Section marker (فصل / مسألة). */
 	fasl: ["مسألة", "فصل"].join("|"),
+	/** Single Arabic letter (أ-ي). Does NOT include diacritics. */
 	harf: "[أ-ي]",
-	harfs: "[أ-ي](?:\\s+[أ-ي])*",
+	/** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
+	harfs: `${ARABIC_SPACED_CODE_ATOM}(?:\\s+${ARABIC_SPACED_CODE_ATOM})*`,
+	/** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
 	hr: "[-–—ـ_=]{5,}",
+	/** Book marker (كتاب). */
 	kitab: "كتاب",
+	/** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
 	naql: [
 		"حدثني",
 		"وأخبرنا",
@@ -186,33 +80,58 @@ const BASE_TOKENS = {
 		"وحدثني",
 		"وحدثنيه"
 	].join("|"),
+	/** Newline character. Useful for breakpoints that split on line boundaries. */
 	newline: "\\n",
+	/** Single ASCII digit (0-9). */
 	num: "\\d",
+	/** One or more ASCII digits (0-9)+. */
 	nums: "\\d+",
+	/** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
 	raqm: "[\\u0660-\\u0669]",
+	/** One or more Arabic-Indic digits (٠-٩)+. */
 	raqms: "[\\u0660-\\u0669]+",
+	/** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
 	rumuz: RUMUZ_BLOCK,
+	/** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
 	tarqim: "[.!?؟؛]"
 };
 /** Pre-defined token constants for use in patterns. */
 const Token = {
+	/** Chapter marker - باب */
 	BAB: "{{bab}}",
+	/** Basmala - بسم الله */
 	BASMALAH: "{{basmalah}}",
+	/** Bullet point variants */
 	BULLET: "{{bullet}}",
+	/** Dash variants (hyphen, en-dash, em-dash, tatweel) */
 	DASH: "{{dash}}",
+	/** Section marker - فصل / مسألة */
 	FASL: "{{fasl}}",
+	/** Single Arabic letter */
 	HARF: "{{harf}}",
+	/** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
 	HARFS: "{{harfs}}",
+	/** Horizontal rule / separator (repeated dashes) */
 	HR: "{{hr}}",
+	/** Book marker - كتاب */
 	KITAB: "{{kitab}}",
+	/** Hadith transmission phrases */
 	NAQL: "{{naql}}",
+	/** Newline character (for breakpoints) */
 	NEWLINE: "{{newline}}",
+	/** Single ASCII digit */
 	NUM: "{{num}}",
+	/** Composite: {{raqms}} {{dash}} (space) */
 	NUMBERED: "{{numbered}}",
+	/** One or more ASCII digits */
 	NUMS: "{{nums}}",
+	/** Single Arabic-Indic digit */
 	RAQM: "{{raqm}}",
+	/** One or more Arabic-Indic digits */
 	RAQMS: "{{raqms}}",
+	/** Source abbreviations (rijāl/takhrīj) */
 	RUMUZ: "{{rumuz}}",
+	/** Punctuation marks */
 	TARQIM: "{{tarqim}}"
 };
 /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
@@ -222,7 +141,9 @@ const withCapture = (token, name) => {
 	return `{{${match[1]}:${name}}}`;
 };
 /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
-const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
+const COMPOSITE_TOKENS = {
+/** Common hadith numbering format: Arabic-Indic digits + dash + space. */
+numbered: "{{raqms}} {{dash}} " };
 /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
 const expandCompositeTokensInTemplate = (template) => {
 	let out = template;
@@ -473,11 +394,11 @@ const templateToRegex = (template) => {
 * Useful for documentation, validation, or building user interfaces
 * that show available tokens.
 *
-* @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
+* @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
 *
 * @example
 * getAvailableTokens()
-* // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
+* // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
 */
 const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
 /**
@@ -486,13 +407,13 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
 * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
 * without any expansion or capture group wrapping.
 *
-* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
-* @returns The regex pattern string, or `undefined` if token doesn't exist
+* @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
+* @returns The regex pattern string for that known token
 *
 * @example
 * getTokenPattern('raqms')   // → '[\\u0660-\\u0669]+'
 * getTokenPattern('dash')    // → '[-–—ـ]'
-* getTokenPattern('unknown') // → undefined
+* getTokenPattern('harfs')     // → pattern for spaced isolated Arabic letter codes
 */
 const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 /**
@@ -571,7 +492,161 @@ const applyTokenMappings = (template, mappings) => {
 const stripTokenMappings = (template) => {
 	return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
 };
+//#endregion
+//#region src/utils/textUtils.ts
+/**
+* Normalizes line endings to Unix-style (`\n`).
+*
+* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+* for consistent pattern matching across platforms.
+*
+* @param content - Raw content with potentially mixed line endings
+* @returns Content with all line endings normalized to `\n`
+*/
+const normalizeLineEndings = (content) => {
+	return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+};
+/**
+* Escapes regex metacharacters (parentheses and brackets) in template patterns,
+* but preserves content inside `{{...}}` token delimiters.
+*
+* This allows users to write intuitive patterns like `({{harf}}):` instead of
+* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+*
+* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
+*
+* @example
+* escapeTemplateBrackets('({{harf}}): ')
+* // → '\\({{harf}}\\): '
+*
+* @example
+* escapeTemplateBrackets('[{{raqm}}] ')
+* // → '\\[{{raqm}}\\] '
+*
+* @example
+* escapeTemplateBrackets('{{harf}}')
+* // → '{{harf}}' (unchanged - no brackets outside tokens)
+*/
+const escapeTemplateBrackets = (pattern) => {
+	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => token || `\\${bracket}`);
+};
+/**
+* Character class matching all Arabic diacritics (Tashkeel/Harakat).
+*
+* Includes the following diacritical marks:
+* - U+0640: ـ (tatweel / kashida)
+* - U+064B: ً (fathatan - double fatha)
+* - U+064C: ٌ (dammatan - double damma)
+* - U+064D: ٍ (kasratan - double kasra)
+* - U+064E: َ (fatha - short a)
+* - U+064F: ُ (damma - short u)
+* - U+0650: ِ (kasra - short i)
+* - U+0651: ّ (shadda - gemination)
+* - U+0652: ْ (sukun - no vowel)
+*
+* @internal
+*/
+const DIACRITICS_CLASS = "[ـًٌٍَُِّْ]";
+/**
+* Groups of equivalent Arabic characters.
+*
+* Characters within the same group are considered equivalent for matching purposes.
+* This handles common variations in Arabic text where different characters are
+* used interchangeably or have the same underlying meaning.
+*
+* Equivalence groups:
+* - Alef variants: ا (bare), آ (with madda), أ (with hamza above), إ (with hamza below)
+* - Ta marbuta and Ha: ة ↔ ه (often interchangeable at word endings)
+* - Alef maqsura and Ya: ى ↔ ي (often interchangeable at word endings)
+*
+* @internal
+*/
+const EQUIV_GROUPS = [
+	[
+		"ا",
+		"آ",
+		"أ",
+		"إ"
+	],
+	["ة", "ه"],
+	["ى", "ي"]
+];
+const DIACRITICS_AND_MARKS_REGEX = new RegExp(ARABIC_MARKS_CLASS, "g");
+/**
+* Escapes a string for safe inclusion in a regular expression.
+*
+* Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+*
+* @param s - Any string to escape
+* @returns String with regex metacharacters escaped
+*
+* @example
+* escapeRegex('hello.world')   // → 'hello\\.world'
+* escapeRegex('[test]')        // → '\\[test\\]'
+* escapeRegex('a+b*c?')        // → 'a\\+b\\*c\\?'
+*/
+const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+const getEquivClass = (ch) => {
+	const group = EQUIV_GROUPS.find((g) => g.includes(ch));
+	return group ? `[${group.map(escapeRegex).join("")}]` : escapeRegex(ch);
+};
+const normalizeArabicLight = (str) => {
+	return str.normalize("NFC").replace(/[\u200C\u200D]/g, "").replace(/\s+/g, " ").trim();
+};
+/**
+* Normalizes Arabic text for exact comparisons while tolerating common variants.
+*
+* This removes Arabic diacritics, collapses whitespace, removes joiners, and
+* maps common equivalent letters to a shared canonical form:
+* - ا/آ/أ/إ -> ا
+* - ة/ه -> ه
+* - ى/ي -> ي
+*/
+const normalizeArabicForComparison = (text) => {
+	return Array.from(normalizeArabicLight(text).replace(DIACRITICS_AND_MARKS_REGEX, "")).map((ch) => {
+		if (ch === "آ" || ch === "أ" || ch === "إ") return "ا";
+		if (ch === "ة") return "ه";
+		if (ch === "ى") return "ي";
+		return ch;
+	}).join("");
+};
+const makeDiacriticInsensitive = (text) => {
+	const diacriticsMatcher = `${DIACRITICS_CLASS}*`;
+	return Array.from(normalizeArabicLight(text)).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
+};
+const isCombiningMarkOrSelector = (char) => {
+	if (!char) return false;
+	return /\p{M}/u.test(char) || char === "︎" || char === "️";
+};
+const isJoiner = (char) => char === "‌" || char === "‍";
+/**
+* Ensures the position does not split a grapheme cluster (surrogate pairs,
+* combining marks, or zero-width joiners / variation selectors).
+*
+* This is only used as a last-resort fallback when we are forced to split
+* near a hard limit (e.g. maxContentLength with no safe whitespace/punctuation).
+*/
+const adjustForUnicodeBoundary = (content, position) => {
+	let adjusted = position;
+	while (adjusted > 0) {
+		const high = content.charCodeAt(adjusted - 1);
+		const low = content.charCodeAt(adjusted);
+		if (high >= 55296 && high <= 56319 && low >= 56320 && low <= 57343) {
+			adjusted -= 1;
+			continue;
+		}
+		const nextChar = content[adjusted];
+		const prevChar = content[adjusted - 1];
+		if (isCombiningMarkOrSelector(nextChar) || isJoiner(nextChar) || isJoiner(prevChar)) {
+			adjusted -= 1;
+			continue;
+		}
+		break;
+	}
+	return adjusted;
+};
 //#endregion
 //#region src/analysis/shared.ts
 const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
@@ -632,7 +707,6 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
 };
 const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
 const isCommonDelimiter = (ch) => /[:：\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
 //#endregion
 //#region src/analysis/line-starts.ts
 const resolveOptions$1 = (options = {}) => ({
@@ -658,65 +732,141 @@ const compareBySpecificity = (a, b) => {
 	return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
 };
 const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
-/** Remove trailing whitespace placeholders */
-const trimTrailingWs = (out, mode) => {
-	const suffix = mode === "regex" ? "\\s*" : " ";
-	while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
-	return out;
-};
-/** Try to extract first word for fallback */
-const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
-/** Consume prefix matchers at current position */
-const consumePrefixes = (s, pos, out, matchers, ws) => {
-	let matched = false;
+const appendPrefix = (s, pos, out, matchers, ws) => {
 	for (const re of matchers) {
 		if (pos >= s.length) break;
 		const m = re.exec(s.slice(pos));
 		if (!m?.index && m?.[0]) {
 			out += escapeSignatureLiteral(m[0]);
 			pos += m[0].length;
-			matched = true;
 			const wsm = /^[ \t]+/u.exec(s.slice(pos));
 			if (wsm) {
 				pos += wsm[0].length;
 				out = appendWs(out, ws);
 			}
+			return {
+				matched: true,
+				out,
+				pos
+			};
 		}
 	}
 	return {
-		matched,
+		matched: false,
 		out,
 		pos
 	};
 };
-/** Try to match a token at current position and append to signature */
-const tryMatchToken = (s, pos, out, compiled) => {
+const appendToken = (s, pos, out, compiled) => {
 	const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
-	if (!best) return {
-		matched: false,
-		out,
-		pos
-	};
-	return {
+	return best ? {
 		matched: true,
 		out: `${out}{{${best.token}}}`,
 		pos: pos + best.text.length
+	} : {
+		matched: false,
+		out,
+		pos
 	};
 };
-/** Try to match a delimiter at current position */
-const tryMatchDelimiter = (s, pos, out) => {
+const appendDelimiter = (s, pos, out) => {
 	const ch = s[pos];
-	if (!ch || !isCommonDelimiter(ch)) return {
+	return ch && isCommonDelimiter(ch) ? {
+		matched: true,
+		out: `${out}${escapeSignatureLiteral(ch)}`,
+		pos: pos + 1
+	} : {
 		matched: false,
 		out,
-		pos
+		pos
+	};
+};
+const appendFallbackWord = (s, pos, out) => {
+	const word = extractFirstWord(s.slice(pos));
+	return word ? `${out}${escapeSignatureLiteral(word)}` : null;
+};
+const consumeLineStartStep = (s, pos, out, compiled, opts, matchedAny, matchedToken) => {
+	const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
+	if (ws.skipped) return {
+		done: false,
+		matchedAny,
+		matchedToken,
+		out: ws.out,
+		pos: ws.pos,
+		steps: 0
+	};
+	const tok = appendToken(s, pos, out, compiled);
+	if (tok.matched) return {
+		done: false,
+		matchedAny: true,
+		matchedToken: true,
+		out: tok.out,
+		pos: tok.pos,
+		steps: 1
+	};
+	if (matchedAny) {
+		const delim = appendDelimiter(s, pos, out);
+		if (delim.matched) return {
+			done: false,
+			matchedAny,
+			matchedToken,
+			out: delim.out,
+			pos: delim.pos,
+			steps: 0
+		};
+		if (opts.includeFirstWordFallback && !matchedToken) {
+			const fallback = appendFallbackWord(s, pos, out);
+			if (fallback) return {
+				done: true,
+				matchedAny,
+				matchedToken,
+				out: fallback,
+				pos,
+				steps: 1
+			};
+		}
+		return {
+			done: true,
+			matchedAny,
+			matchedToken,
+			out,
+			pos,
+			steps: 0
+		};
+	}
+	if (!opts.includeFirstWordFallback) return {
+		done: true,
+		matchedAny,
+		matchedToken,
+		out,
+		pos,
+		steps: 0
 	};
-	return {
-		matched: true,
-		out: out + escapeSignatureLiteral(ch),
-		pos: pos + 1
+	const fallback = appendFallbackWord(s, pos, out);
+	return fallback ? {
+		done: true,
+		matchedAny: true,
+		matchedToken,
+		out: fallback,
+		pos,
+		steps: 0
+	} : {
+		done: true,
+		matchedAny,
+		matchedToken,
+		out,
+		pos,
+		steps: 0
 	};
 };
+/** Remove trailing whitespace placeholders */
+const trimTrailingWs = (out, mode) => {
+	const suffix = mode === "regex" ? "\\s*" : " ";
+	while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+	return out;
+};
+/** Try to extract first word for fallback */
+const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
 /** Skip whitespace at position */
 const skipWhitespace$1 = (s, pos, out, ws) => {
 	const m = /^[ \t]+/u.exec(s.slice(pos));
@@ -737,47 +887,25 @@ const tokenizeLineStart = (line, tokenNames, opts) => {
 	const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
 	const compiled = compileTokenRegexes(tokenNames);
 	let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
-	const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+	const prefix = appendPrefix(s, pos, out, opts.prefixMatchers, opts.whitespace);
 	pos = prefix.pos;
 	out = prefix.out;
 	matchedAny = prefix.matched;
 	while (steps < 6 && pos < s.length) {
-		const ws = skipWhitespace$1(s, pos, out, opts.whitespace);
-		if (ws.skipped) {
-			pos = ws.pos;
-			out = ws.out;
-			continue;
-		}
-		const tok = tryMatchToken(s, pos, out, compiled);
-		if (tok.matched) {
-			pos = tok.pos;
-			out = tok.out;
-			matchedAny = matchedToken = true;
-			steps++;
-			continue;
-		}
-		if (matchedAny) {
-			const delim = tryMatchDelimiter(s, pos, out);
-			if (delim.matched) {
-				pos = delim.pos;
-				out = delim.out;
-				continue;
-			}
-		}
-		if (matchedAny) {
-			if (opts.includeFirstWordFallback && !matchedToken) {
-				const word = extractFirstWord(s.slice(pos));
-				if (word) {
-					out += escapeSignatureLiteral(word);
-					steps++;
-				}
-			}
+		const next = consumeLineStartStep(s, pos, out, compiled, opts, matchedAny, matchedToken);
+		if (next.done) {
+			if (!next.matchedAny && !next.matchedToken && next.out === out && next.pos === pos) return null;
+			if (next.steps > 0) steps += next.steps;
+			matchedAny = next.matchedAny;
+			matchedToken = next.matchedToken;
+			out = next.out;
 			break;
 		}
-		if (!opts.includeFirstWordFallback) return null;
-		const word = extractFirstWord(s.slice(pos));
-		if (!word) return null;
-		return escapeSignatureLiteral(word);
+		pos = next.pos;
+		out = next.out;
+		matchedAny = next.matchedAny;
+		matchedToken = next.matchedToken;
+		steps += next.steps;
 	}
 	return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
 };
@@ -821,7 +949,6 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
 		pattern
 	})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
 };
 //#endregion
 //#region src/analysis/repeating-sequences.ts
 const resolveOptions = (options) => {
@@ -843,6 +970,7 @@ const resolveOptions = (options) => {
 const createRawCursor = (text, normalize) => {
 	let rawPos = 0;
 	return {
+		/** Advance cursor, returning the raw text chunk consumed */
 		advance(normalizedLen) {
 			if (!normalize) {
 				const chunk = text.slice(rawPos, rawPos + normalizedLen);
@@ -947,23 +1075,27 @@ const buildExample = (page, window, contextChars) => {
 		text: page.content.slice(start, end)
 	};
 };
+const recordPattern = (page, window, opts, stats) => {
+	if (opts.requireToken && !hasTokenInWindow(window)) return;
+	const pattern = buildPattern(window, opts.whitespace);
+	let entry = stats.get(pattern);
+	if (!entry) {
+		if (stats.size >= opts.maxUniquePatterns) return;
+		entry = {
+			count: 0,
+			examples: [],
+			...computeWindowStats(window)
+		};
+		stats.set(pattern, entry);
+	}
+	entry.count++;
+	if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+};
 /** Extract N-grams from a single page */
 const extractPageNgrams = (page, items, opts, stats) => {
-	for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
-		const window = items.slice(i, i + n);
-		if (opts.requireToken && !hasTokenInWindow(window)) continue;
-		const pattern = buildPattern(window, opts.whitespace);
-		if (!stats.has(pattern)) {
-			if (stats.size >= opts.maxUniquePatterns) continue;
-			stats.set(pattern, {
-				count: 0,
-				examples: [],
-				...computeWindowStats(window)
-			});
-		}
-		const entry = stats.get(pattern);
-		entry.count++;
-		if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+	for (let i = 0; i <= items.length - opts.minElements; i++) {
+		const maxWindowSize = Math.min(opts.maxElements, items.length - i);
+		for (let n = opts.minElements; n <= maxWindowSize; n++) recordPattern(page, items.slice(i, i + n), opts, stats);
 	}
 };
 /**
@@ -985,7 +1117,6 @@ const analyzeRepeatingSequences = (pages, options) => {
 		pattern
 	}));
 };
 //#endregion
 //#region src/detection.ts
 /**
@@ -1147,7 +1278,6 @@ const analyzeTextForRule = (text) => {
 		...suggestPatternConfig(detected)
 	};
 };
 //#endregion
 //#region src/types/rules.ts
 /**
@@ -1172,7 +1302,6 @@ const PATTERN_TYPE_KEYS = [
 	"template",
 	"regex"
 ];
 //#endregion
 //#region src/optimization/optimize-rules.ts
 const MERGEABLE_KEYS = new Set([
@@ -1231,7 +1360,6 @@ const optimizeRules = (rules) => {
 		rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
 	};
 };
 //#endregion
 //#region src/preprocessing/transforms.ts
 /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
@@ -1340,170 +1468,89 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
 	}
 	return result;
 };
 //#endregion
-//#region src/segmentation/rule-regex.ts
-/**
-* Checks if a regex pattern contains standard (anonymous) capturing groups.
-*
-* Detects standard capturing groups `(...)` while excluding:
-* - Non-capturing groups `(?:...)`
-* - Lookahead assertions `(?=...)` and `(?!...)`
-* - Lookbehind assertions `(?<=...)` and `(?<!...)`
-* - Named groups `(?<name>...)` (start with `(?` so excluded here)
-*
-* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
-*/
-const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
-/**
-* Extracts named capture group names from a regex pattern.
-*
-* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
-*
-* @example
-* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
-* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
-* extractNamedCaptureNames('^\\d+') // []
-*/
-const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([^>]+)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
-/**
-* Safely compiles a regex pattern, throwing a helpful error if invalid.
-*/
-const compileRuleRegex = (pattern) => {
-	try {
-		return new RegExp(pattern, "gmu");
-	} catch (error) {
-		throw new Error(`Invalid regex pattern: ${pattern}\n  Cause: ${error instanceof Error ? error.message : String(error)}`);
+//#region src/segmentation/arabic-dictionary-rule.ts
+const uniqueNormalizedWords = (words) => {
+	const seen = /* @__PURE__ */ new Set();
+	const result = [];
+	for (const word of words) {
+		const normalized = normalizeArabicForComparison(word);
+		if (!normalized || seen.has(normalized)) continue;
+		seen.add(normalized);
+		result.push(normalized);
 	}
+	return result;
 };
-/**
-* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
-*
-* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
-*/
-const processPattern = (pattern, fuzzy, capturePrefix) => {
-	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
-	return {
-		captureNames,
-		pattern: expanded
-	};
-};
-/**
-* Processes a breakpoint pattern by expanding tokens only.
-*
-* Unlike `processPattern`, this does NOT escape brackets because breakpoints
-* are treated as raw regex patterns (like the `regex` rule type).
-* Users have full control over regex syntax including `(?:...)` groups.
-*/
-const processBreakpointPattern = (pattern) => {
-	const { pattern: expanded } = expandTokensWithCaptures(pattern);
-	return expanded;
-};
-const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
-	};
-};
-const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
-	};
+const buildStopAlternation = (stopWords) => {
+	const unique = uniqueNormalizedWords(stopWords);
+	if (unique.length === 0) return "";
+	return unique.map((word) => makeDiacriticInsensitive(word)).join("|");
 };
-const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
-	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
-	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `(?:${alternatives})$`
-	};
+const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
+	if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
+	const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
+	return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
 };
-const buildTemplateRegexSource = (template, capturePrefix) => {
-	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
-	return {
-		captureNames,
-		regex: pattern
-	};
+const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
+	const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
+	const withCapture = captureName ? `(?<${captureName}>${headwordBody})` : `(?:${headwordBody})`;
+	if (!allowParenthesized) return `${withCapture}${colon}`;
+	return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
 };
 /**
-* Builds a compiled regex and metadata from a split rule.
+* Creates a reusable split rule for Arabic dictionary entries.
 *
-* Behavior mirrors the previous implementation in `segmenter.ts`.
+* The generated rule:
+* - keeps the lemma marker in `segment.content`
+* - stores the lemma in `segment.meta[captureName]`
+* - matches root entries at true line/page starts
+* - matches mid-line subentries conservatively when they begin with `و`
+* - can optionally support parenthesized headwords like `(عنبر) :`
+* - can optionally support comma-separated headword lists like `سبد، دبس:`
+*
+* @example
+* createArabicDictionaryEntryRule({
+*   stopWords: ['وقيل', 'ويقال', 'قال'],
+*   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
+* })
+*
+* @example
+* createArabicDictionaryEntryRule({
+*   allowParenthesized: true,
+*   allowWhitespaceBeforeColon: true,
+*   allowCommaSeparated: true,
+*   stopWords: ['الليث', 'العجاج'],
+* })
 */
-const buildRuleRegex = (rule, capturePrefix) => {
-	const { lineStartsWith, lineStartsAfter, lineEndsWith, template, regex } = rule;
-	const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy([
-		...lineStartsWith ?? [],
-		...lineStartsAfter ?? [],
-		...lineEndsWith ?? []
-	]);
-	if (lineStartsAfter?.length) {
-		const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(lineStartsAfter, fuzzy, capturePrefix);
-		return {
-			captureNames,
-			regex: compileRuleRegex(lsaRegex),
-			usesCapture: true,
-			usesLineStartsAfter: true
-		};
-	}
-	let finalRegex = regex;
-	let allCaptureNames = [];
-	if (lineStartsWith?.length) {
-		const res = buildLineStartsWithRegexSource(lineStartsWith, fuzzy, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = res.captureNames;
-	}
-	if (lineEndsWith?.length) {
-		const res = buildLineEndsWithRegexSource(lineEndsWith, fuzzy, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = res.captureNames;
-	}
-	if (template) {
-		const res = buildTemplateRegexSource(template, capturePrefix);
-		finalRegex = res.regex;
-		allCaptureNames = [...allCaptureNames, ...res.captureNames];
-	}
-	if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
-	if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
+const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
+	if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
+	if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
+	if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
+	const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
+	const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
+	const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
+	const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
+	const stopAlternation = buildStopAlternation(stopWords);
+	const lemmaBody = buildHeadwordBody({
+		allowCommaSeparated,
+		colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
+		stopAlternation,
+		stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
+		unit: lemmaUnit
+	});
 	return {
-		captureNames: allCaptureNames,
-		regex: compileRuleRegex(finalRegex),
-		usesCapture: hasCapturingGroup(finalRegex),
-		usesLineStartsAfter: false
+		meta,
+		pageStartPrevWordStoplist,
+		regex: `(?:${`(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`}|${allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`})` + buildBalancedMarker({
+			allowParenthesized,
+			allowWhitespaceBeforeColon,
+			captureName,
+			headwordBody: lemmaBody
+		}),
+		samePagePrevWordStoplist,
+		split: "at"
 	};
 };
-//#endregion
-//#region src/segmentation/breakpoint-constants.ts
-/**
-* Shared constants for segmentation breakpoint processing.
-*/
-/**
-* Threshold for using offset-based fast path in boundary processing.
-*
-* Below this: accurate string-search (handles offset drift from structural rules).
-* At or above this: O(n) arithmetic (performance critical for large books).
-*
-* The value of 1000 is chosen based on typical Arabic book sizes:
-* - Sahih al-Bukhari: ~1000-3000 pages
-* - Standard hadith collections: 1000-7000 pages
-* - Large aggregated corpora: 10k-50k pages
-*
-* For segments ≥1000 pages, the performance gain from offset-based slicing
-* outweighs the minor accuracy loss from potential offset drift.
-*
-* @remarks
-* Fast path is skipped when:
-* - `maxContentLength` is set (requires character-accurate splitting)
-* - `debugMetaKey` is set (requires proper provenance tracking)
-* - Content was structurally modified by marker stripping (offsets may drift)
-*/
-const FAST_PATH_THRESHOLD = 1e3;
 const WINDOW_PREFIX_LENGTHS = [
 	80,
 	60,
@@ -1530,23 +1577,6 @@ const STOP_CHARACTERS = /[\s\n.,;!?؛،۔۝۞]/;
 * Matches outside this range are rejected unless `ignoreDeviation` is active.
 */
 const MAX_DEVIATION = 2e3;
-/**
-* Penalty score applied to non-newline anchor candidates.
-*
-* Designed to prioritize newline-aligned boundaries unless a whitespace match is
-* significantly closer (within 20 chars). Handles cases where marker stripping
-* shifts the boundary slightly.
-*/
-const NON_NEWLINE_PENALTY = 20;
-/**
-* Limit for inferring start offset from a relaxed search (characters).
-*
-* If the relaxed search finds a match more than this distance away from the
-* expected position, we assume it's a false positive (e.g. repeated content)
-* and do not use it to infer the start offset.
-*/
-const INFERENCE_PROXIMITY_LIMIT = 500;
 //#endregion
 //#region src/segmentation/match-utils.ts
 /**
@@ -1665,7 +1695,6 @@ const extractDebugIndex = (groups, prefix) => {
 		if (!Number.isNaN(idx)) return idx;
 	}
 };
 //#endregion
 //#region src/segmentation/breakpoint-utils.ts
 /**
@@ -2067,8 +2096,8 @@ const findAnchorCandidates = (content, prefix, start, end) => {
 /** Selects the best anchor candidate, prioritizing newlines then proximity to boundary */
 const selectBestAnchor = (candidates, expectedBoundary) => {
 	return candidates.reduce((best, curr) => {
-		const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : NON_NEWLINE_PENALTY);
-		return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : NON_NEWLINE_PENALTY) < bestScore ? curr : best;
+		const bestScore = Math.abs(best.pos - expectedBoundary) + (best.isNewline ? 0 : 20);
+		return Math.abs(curr.pos - expectedBoundary) + (curr.isNewline ? 0 : 20) < bestScore ? curr : best;
 	});
 };
 /**
@@ -2122,7 +2151,7 @@ const resolveBoundaryMatch = (segmentContent, pageIdx, rawBoundary, startOffsetI
 		if (relaxedPos > 0) {
 			const inferredStartOffset = rawBoundary - relaxedPos;
 			const currentExpected = Math.max(0, rawBoundary - startOffsetInFromPage);
-			if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < INFERENCE_PROXIMITY_LIMIT) {
+			if (inferredStartOffset >= 0 && Math.abs(relaxedPos - currentExpected) < 500) {
 				startOffsetInFromPage = inferredStartOffset;
 				expectedBoundary = Math.max(0, rawBoundary - startOffsetInFromPage);
 				pos = relaxedPos;
@@ -2196,7 +2225,7 @@ const buildBoundaryPositionsAccurate = (segmentContent, fromIdx, toIdx, pageCoun
 const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
 	const pageCount = toIdx - fromIdx + 1;
 	const expectedLength = (cumulativeOffsets[toIdx + 1] ?? 0) - (cumulativeOffsets[fromIdx] ?? 0);
-	if (pageCount >= FAST_PATH_THRESHOLD && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
+	if (pageCount >= 1e3 && segmentContent.length === expectedLength) return buildBoundaryPositionsFastPath(segmentContent, fromIdx, toIdx, pageCount, cumulativeOffsets, logger);
 	return buildBoundaryPositionsAccurate(segmentContent, fromIdx, toIdx, pageCount, pageIds, normalizedPages, cumulativeOffsets, logger);
 };
 /**
@@ -2428,7 +2457,6 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
 	}
 	return -1;
 };
 //#endregion
 //#region src/segmentation/debug-meta.ts
 const resolveDebugConfig = (debug) => {
@@ -2470,59 +2498,197 @@ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
 		...word !== void 0 ? { word } : {}
 	} };
 };
-const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
-	index: breakpointIndex,
-	kind: rule.pattern === "" ? "pageBoundary" : "pattern",
-	pattern: rule.pattern ?? rule.regex,
-	...wordIndex !== void 0 ? { wordIndex } : {},
-	...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
-} });
+const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
+	index: breakpointIndex,
+	kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
+	pattern: rule.pattern ?? rule.regex,
+	...wordIndex !== void 0 ? { wordIndex } : {},
+	...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
+} });
+/**
+* Helper to format the debug info into a human-readable string.
+* @param meta - The segment metadata object
+* @param options - Formatting options
+*/
+const formatRuleReason = (rule, concise) => {
+	const { index, patternType, wordIndex, word } = rule;
+	if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
+	const wordInfo = word ? ` (Matched: "${word}")` : "";
+	return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
+};
+const formatBreakpointReason = (breakpoint, concise) => {
+	const { index, kind, pattern, wordIndex, word } = breakpoint;
+	if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
+	if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
+	if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
+	return `Breakpoint #${index} (${kind}) - "${pattern}"`;
+};
+const formatContentLengthReason = (split, concise) => {
+	const { maxContentLength, splitReason } = split;
+	if (concise) return `> ${maxContentLength} (${splitReason})`;
+	return `Safety Split (${splitReason}) > ${maxContentLength}`;
+};
+/**
+* Helper to format the debug info into a human-readable string.
+* @param meta - The segment metadata object
+* @param options - Formatting options
+*/
+const getDebugReason = (meta, options) => {
+	const debug = meta?._flappa;
+	if (!debug) return "-";
+	const concise = options?.concise;
+	if (debug.rule) return formatRuleReason(debug.rule, concise);
+	if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
+	if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
+	return "Unknown";
+};
+/**
+* Convenience helper to get the formatted debug reason directly from a segment.
+* @param segment - The segment object
+* @param options - Formatting options
+*/
+const getSegmentDebugReason = (segment, options) => {
+	return getDebugReason(segment.meta, options);
+};
+//#endregion
+//#region src/segmentation/pattern-validator.ts
+const KNOWN_TOKENS = new Set(getAvailableTokens());
+const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
+const buildBareTokenRegex = () => {
+	const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
+	return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
+};
+/**
+* Validates a single pattern for common issues.
+*/
+const validatePattern = (pattern, seenPatterns) => {
+	if (!pattern.trim()) return {
+		message: "Empty pattern is not allowed",
+		type: "empty_pattern"
+	};
+	if (seenPatterns.has(pattern)) return {
+		message: `Duplicate pattern: "${pattern}"`,
+		pattern,
+		type: "duplicate"
+	};
+	seenPatterns.add(pattern);
+	TOKEN_INSIDE_BRACES.lastIndex = 0;
+	for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
+		const name = match[1];
+		if (!KNOWN_TOKENS.has(name)) return {
+			message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
+			suggestion: "Check spelling or use a known token",
+			token: name,
+			type: "unknown_token"
+		};
+	}
+	for (const match of pattern.matchAll(buildBareTokenRegex())) {
+		const [full, name] = match;
+		const idx = match.index;
+		if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
+			message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
+			suggestion: `{{${full}}}`,
+			token: name,
+			type: "missing_braces"
+		};
+	}
+};
 /**
-* Helper to format the debug info into a human-readable string.
-* @param meta - The segment metadata object
-* @param options - Formatting options
+* Validates an array of patterns, returning parallel array of issues.
 */
-const formatRuleReason = (rule, concise) => {
-	const { index, patternType, wordIndex, word } = rule;
-	if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
-	const wordInfo = word ? ` (Matched: "${word}")` : "";
-	return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
+const validatePatternArray = (patterns) => {
+	const seen = /* @__PURE__ */ new Set();
+	const issues = patterns.map((p) => validatePattern(p, seen));
+	return issues.some(Boolean) ? issues : void 0;
 };
-const formatBreakpointReason = (breakpoint, concise) => {
-	const { index, kind, pattern, wordIndex, word } = breakpoint;
-	if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
-	if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
-	if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
-	return `Breakpoint #${index} (${kind}) - "${pattern}"`;
+const applyRulePatternValidation = (result, key, patterns) => {
+	if (!patterns) return false;
+	const issues = validatePatternArray(patterns);
+	if (!issues) return false;
+	result[key] = issues;
+	return true;
 };
-const formatContentLengthReason = (split, concise) => {
-	const { maxContentLength, splitReason } = split;
-	if (concise) return `> ${maxContentLength} (${splitReason})`;
-	return `Safety Split (${splitReason}) > ${maxContentLength}`;
+const validateTemplateRule = (rule, result) => {
+	if (rule.template === void 0) return false;
+	const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
+	if (!issue) return false;
+	result.template = issue;
+	return true;
+};
+const validateRegexRule = (rule, result) => {
+	if (rule.regex === void 0) return false;
+	if (!rule.regex.trim()) {
+		result.regex = {
+			message: "Empty pattern is not allowed",
+			type: "empty_pattern"
+		};
+		return true;
+	}
+	try {
+		new RegExp(rule.regex, "u");
+		return false;
+	} catch (error) {
+		result.regex = {
+			message: error instanceof Error ? error.message : String(error),
+			pattern: rule.regex,
+			type: "invalid_regex"
+		};
+		return true;
+	}
+};
+const formatValidationIssue = (_type, issue, loc) => {
+	if (!issue) return null;
+	if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
+	if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
+	if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
+	if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
+	return `${loc}: ${issue.message || issue.type}`;
 };
 /**
-* Helper to format the debug info into a human-readable string.
-* @param meta - The segment metadata object
-* @param options - Formatting options
+* Validates split rules for common pattern issues.
+*
+* Checks for:
+* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
+* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
+* - Duplicate patterns within the same rule
+*
+* @param rules - Array of split rules to validate
+* @returns Array parallel to input with validation results (undefined if no issues)
+*
+* @example
+* const issues = validateRules([
+*   { lineStartsAfter: ['raqms:num'] },  // Missing braces
+*   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
+* ]);
+* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
+* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
 */
-const getDebugReason = (meta, options) => {
-	const debug = meta?._flappa;
-	if (!debug) return "-";
-	const concise = options?.concise;
-	if (debug.rule) return formatRuleReason(debug.rule, concise);
-	if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
-	if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
-	return "Unknown";
-};
+const validateRules = (rules) => rules.map((rule) => {
+	const result = {};
+	const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
+	const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
+	const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
+	const templateIssues = validateTemplateRule(rule, result);
+	const regexIssues = validateRegexRule(rule, result);
+	return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues ? result : void 0;
+});
 /**
-* Convenience helper to get the formatted debug reason directly from a segment.
-* @param segment - The segment object
-* @param options - Formatting options
+* Formats a validation result array into a list of human-readable error messages.
+*
+* Useful for displaying validation errors in UIs.
+*
+* @param results - The result array from `validateRules()`
+* @returns Array of formatted error strings
+*
+* @example
+* const issues = validateRules(rules);
+* const errors = formatValidationReport(issues);
+* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
 */
-const getSegmentDebugReason = (segment, options) => {
-	return getDebugReason(segment.meta, options);
-};
+const formatValidationReport = (results) => results.flatMap((result, i) => {
+	if (!result) return [];
+	return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${i + 1}, ${type}`)).filter((msg) => msg !== null));
+});
 //#endregion
 //#region src/segmentation/breakpoint-processor.ts
 const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
@@ -2650,7 +2816,7 @@ const checkFastPathAlignment = (cumulativeOffsets, fullContent, fromIdx, toIdx,
 	const expectedLength = (cumulativeOffsets[toIdx + 1] ?? fullContent.length) - (cumulativeOffsets[fromIdx] ?? 0);
 	const driftTolerance = Math.max(100, fullContent.length * .01);
 	const isAligned = Math.abs(expectedLength - fullContent.length) <= driftTolerance;
-	if (!isAligned && pageCount >= FAST_PATH_THRESHOLD) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
+	if (!isAligned && pageCount >= 1e3) logger?.warn?.("[breakpoints] Offset drift detected in fast-path candidate, falling back to slow path", {
 		actualLength: fullContent.length,
 		drift: Math.abs(expectedLength - fullContent.length),
 		expectedLength,
@@ -2791,8 +2957,7 @@ const computeWindowEndPositionForIteration = (remainingContent, cursorPos, curre
 	if (maxPages === 0) {
 		const nextPageStartPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? Number.POSITIVE_INFINITY;
 		const remainingInCurrentPage = Math.max(0, nextPageStartPos - cursorPos);
-		const capped = maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage;
-		return Math.min(capped, remainingContent.length);
+		return Math.min(maxContentLength ? Math.min(remainingInCurrentPage, maxContentLength) : remainingInCurrentPage, remainingContent.length);
 	}
 	const pos = getWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, maxContentLength, logger);
 	return Math.min(pos, remainingContent.length);
@@ -2847,7 +3012,7 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
 	const pageCount = toIdx - fromIdx + 1;
 	const isAligned = checkFastPathAlignment(cumulativeOffsets, fullContent, fromIdx, toIdx, pageCount, logger);
 	const isPageBoundaryOnly = expandedBreakpoints.every((bp) => bp.regex === null && bp.excludeSet.size === 0 && bp.skipWhenRegex === null);
-	if (pageCount < FAST_PATH_THRESHOLD || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
+	if (pageCount < 1e3 || !isAligned || !isPageBoundaryOnly || maxContentLength || debugMetaKey) return null;
 	if (maxPages === 0) return processTrivialFastPath(fromIdx, toIdx, pageIds, normalizedPages, pageCount, segment.meta, debugMetaKey, logger);
 	return processOffsetFastPath(fullContent, fromIdx, toIdx, pageIds, cumulativeOffsets, maxPages, segment.meta, debugMetaKey, logger);
 };
@@ -3030,7 +3195,178 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 	logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
 	return result;
 };
+//#endregion
+//#region src/segmentation/rule-regex.ts
+/**
+* Checks if a regex pattern contains standard (anonymous) capturing groups.
+*
+* Detects standard capturing groups `(...)` while excluding:
+* - Non-capturing groups `(?:...)`
+* - Lookahead assertions `(?=...)` and `(?!...)`
+* - Lookbehind assertions `(?<=...)` and `(?<!...)`
+* - Named groups `(?<name>...)` (start with `(?` so excluded here)
+*
+* NOTE: Named capture groups are still captures, but they're tracked via `captureNames`.
+*/
+const hasCapturingGroup = (pattern) => /\((?!\?)/.test(pattern);
+/**
+* Extracts named capture group names from a regex pattern.
+*
+* Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
+*
+* @example
+* extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
+* extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
+* extractNamedCaptureNames('^\\d+') // []
+*/
+const extractNamedCaptureNames = (pattern) => [...pattern.matchAll(/\(\?<([A-Za-z_]\w*)>/g)].map((m) => m[1]).filter((n) => !n.startsWith("_r") && !n.startsWith("_w"));
+/**
+* Safely compiles a regex pattern, throwing a helpful error if invalid.
+*/
+const compileRuleRegex = (pattern) => {
+	try {
+		return new RegExp(pattern, "gmu");
+	} catch (error) {
+		throw new Error(`Invalid regex pattern: ${pattern}\n  Cause: ${error instanceof Error ? error.message : String(error)}`);
+	}
+};
+/**
+* Processes a pattern string by expanding tokens and optionally applying fuzzy matching.
+*
+* Brackets `()[]` outside `{{tokens}}` are auto-escaped.
+*/
+const processPattern = (pattern, fuzzy, capturePrefix) => {
+	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
+	return {
+		captureNames,
+		pattern: expanded
+	};
+};
+/**
+* Processes a breakpoint pattern by expanding tokens only.
+*
+* Unlike `processPattern`, this does NOT escape brackets because breakpoints
+* are treated as raw regex patterns (like the `regex` rule type).
+* Users have full control over regex syntax including `(?:...)` groups.
+*/
+const processBreakpointPattern = (pattern) => {
+	const { pattern: expanded } = expandTokensWithCaptures(pattern);
+	return expanded;
+};
+/**
+* Builds the raw regex source for a `lineStartsAfter` rule.
+*
+* Expands each pattern through `processPattern()`, combines them into an
+* alternation at the start of a line, and appends a trailing content capture.
+*
+* @param patterns - Template-like line-start markers to match
+* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
+* @param capturePrefix - Optional prefix used for internal named captures
+* @returns Regex source plus the named captures extracted from the patterns
+*/
+const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
+	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
+	return {
+		captureNames: processed.flatMap((p) => p.captureNames),
+		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})${capturePrefix ? `(?<${capturePrefix}__content>.*)` : "(.*)"}`
+	};
+};
+/**
+* Builds the raw regex source for a `lineStartsWith` rule.
+*
+* Expands each pattern through `processPattern()` and combines them into an
+* alternation anchored at the start of a line.
+*
+* @param patterns - Template-like line-start markers to match
+* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
+* @param capturePrefix - Optional prefix used for internal named captures
+* @returns Regex source plus the named captures extracted from the patterns
+*/
+const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
+	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
+	return {
+		captureNames: processed.flatMap((p) => p.captureNames),
+		regex: `^[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*(?:${alternatives})`
+	};
+};
+/**
+* Builds the raw regex source for a `lineEndsWith` rule.
+*
+* Expands each pattern through `processPattern()` and combines them into an
+* end-anchored alternation.
+*
+* @param patterns - Template-like line-end markers to match
+* @param fuzzy - Whether Arabic fuzzy matching should be applied during expansion
+* @param capturePrefix - Optional prefix used for internal named captures
+* @returns Regex source plus the named captures extracted from the patterns
+*/
+const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
+	const alternatives = processed.map((p, i) => `(?<_r${i}>${p.pattern})`).join("|");
+	return {
+		captureNames: processed.flatMap((p) => p.captureNames),
+		regex: `(?:${alternatives})$`
+	};
+};
+/**
+* Builds the raw regex source for a `template` rule.
+*
+* Expands tokens and named captures via `expandTokensWithCaptures()` after
+* applying `escapeTemplateBrackets()` to non-token brackets.
+*
+* @param template - Template string containing optional `{{token}}` markers
+* @param capturePrefix - Optional prefix used for internal named captures
+* @returns Regex source plus the named captures extracted from the template
+*/
+const buildTemplateRegexSource = (template, capturePrefix) => {
+	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
+	return {
+		captureNames,
+		regex: pattern
+	};
+};
+const getFuzzyCandidatePatterns = (rule) => [
+	..."lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) ? rule.lineStartsWith : [],
+	..."lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) ? rule.lineStartsAfter : [],
+	..."lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) ? rule.lineEndsWith : []
+];
+const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
+	if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
+	if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
+	if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
+	return null;
+};
+/**
+* Builds a compiled regex and metadata from a split rule.
+*
+* Behavior mirrors the previous implementation in `segmenter.ts`.
+*/
+const buildRuleRegex = (rule, capturePrefix) => {
+	const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(getFuzzyCandidatePatterns(rule));
+	if ("lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length > 0) {
+		const { regex: lsaRegex, captureNames } = buildLineStartsAfterRegexSource(rule.lineStartsAfter, fuzzy, capturePrefix);
+		return {
+			captureNames,
+			regex: compileRuleRegex(lsaRegex),
+			usesCapture: true,
+			usesLineStartsAfter: true
+		};
+	}
+	const ruleRegexSource = buildLineBasedRuleRegex(rule, fuzzy, capturePrefix);
+	let finalRegex = ruleRegexSource?.regex;
+	let allCaptureNames = ruleRegexSource?.captureNames ?? [];
+	if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
+	if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
+	if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
+	return {
+		captureNames: allCaptureNames,
+		regex: compileRuleRegex(finalRegex),
+		usesCapture: hasCapturingGroup(finalRegex),
+		usesLineStartsAfter: false
+	};
+};
 //#endregion
 //#region src/segmentation/fast-fuzzy-prefix.ts
 /**
@@ -3078,9 +3414,8 @@ const compileFastFuzzyTokenRule = (tokenTemplate) => {
 	const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
 	if (!m) return null;
 	const token = m[1];
-	const tokenPattern = getTokenPattern(token);
-	if (!tokenPattern) return null;
-	const compiled = compileLiteralAlternation(tokenPattern);
+	if (!(token in TOKEN_PATTERNS)) return null;
+	const compiled = compileLiteralAlternation(getTokenPattern(token));
 	return compiled ? {
 		alternatives: compiled.alternatives,
 		token
@@ -3093,11 +3428,11 @@ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
 	}
 	return null;
 };
 //#endregion
 //#region src/segmentation/segmenter-rule-utils.ts
 const tryCompileFastFuzzyRule = (rule) => {
-	if (!rule.fuzzy) return null;
+	const fuzzyCandidatePatterns = [..."lineStartsWith" in rule ? rule.lineStartsWith : [], ..."lineStartsAfter" in rule ? rule.lineStartsAfter : []];
+	if (!(rule.fuzzy ?? shouldDefaultToFuzzy(fuzzyCandidatePatterns))) return null;
 	if ("lineStartsWith" in rule && rule.lineStartsWith?.length === 1) {
 		const compiled = compileFastFuzzyTokenRule(rule.lineStartsWith[0]);
 		if (compiled) return {
@@ -3139,7 +3474,10 @@ const partitionRulesForMatching = (rules) => {
 			prefix: `r${index}_`,
 			rule
 		});
-		else standaloneRules.push(rule);
+		else standaloneRules.push({
+			index,
+			rule
+		});
 	}
 	return {
 		combinableRules,
@@ -3147,9 +3485,37 @@ const partitionRulesForMatching = (rules) => {
 		standaloneRules
 	};
 };
+const STRONG_SENTENCE_TERMINATORS = /[.!?؟؛۔…]$/u;
+const TRAILING_PAGE_WRAP_NOISE = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
+const TRAILING_WORD_DELIMITERS = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
+const ARABIC_WORD_REGEX = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
+const trimTrailingPageWrapNoise = (text) => {
+	let trimmed = text.trimEnd();
+	while (trimmed !== trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "")) trimmed = trimmed.replace(TRAILING_PAGE_WRAP_NOISE, "");
+	return trimmed;
+};
+const endsWithStrongSentenceTerminator = (pageContent) => {
+	return STRONG_SENTENCE_TERMINATORS.test(trimTrailingPageWrapNoise(pageContent));
+};
+const extractLastArabicWord = (pageContent) => {
+	return [...trimTrailingPageWrapNoise(pageContent).replace(TRAILING_WORD_DELIMITERS, "").matchAll(ARABIC_WORD_REGEX)].at(-1)?.[0] ?? "";
+};
+const shouldAllowPageStartMatch = (previousPageContent, prevWordStoplist) => {
+	if (!prevWordStoplist || endsWithStrongSentenceTerminator(previousPageContent)) return true;
+	const lastWord = extractLastArabicWord(previousPageContent);
+	return !lastWord || !prevWordStoplist.has(normalizeArabicForComparison(lastWord));
+};
+const shouldAllowSamePageMatch = (contentBeforeMatch, stoplist) => {
+	if (!stoplist) return true;
+	const lastWord = extractLastArabicWord(contentBeforeMatch);
+	return !lastWord || !stoplist.has(normalizeArabicForComparison(lastWord));
+};
 const createPageStartGuardChecker = (matchContent, pageMap) => {
 	const pageStartToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.start, i]));
 	const compiledPageStartPrev = /* @__PURE__ */ new Map();
+	const compiledPrevWordStoplists = /* @__PURE__ */ new Map();
+	const compiledSamePagePrevWordStoplists = /* @__PURE__ */ new Map();
+	const pageIdToBoundaryIndex = new Map(pageMap.boundaries.map((b, i) => [b.id, i]));
 	const getPageStartPrevRegex = (rule, ruleIndex) => {
 		if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
 		const pattern = rule.pageStartGuard;
@@ -3161,6 +3527,33 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
 		compiledPageStartPrev.set(ruleIndex, re);
 		return re;
 	};
+	const getPrevWordStoplist = (rule, ruleIndex) => {
+		if (compiledPrevWordStoplists.has(ruleIndex)) return compiledPrevWordStoplists.get(ruleIndex) ?? null;
+		const stoplist = rule.pageStartPrevWordStoplist;
+		if (!stoplist?.length) {
+			compiledPrevWordStoplists.set(ruleIndex, null);
+			return null;
+		}
+		const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
+		compiledPrevWordStoplists.set(ruleIndex, normalized);
+		return normalized;
+	};
+	const getSamePagePrevWordStoplist = (rule, ruleIndex) => {
+		if (compiledSamePagePrevWordStoplists.has(ruleIndex)) return compiledSamePagePrevWordStoplists.get(ruleIndex) ?? null;
+		const stoplist = rule.samePagePrevWordStoplist;
+		if (!stoplist?.length) {
+			compiledSamePagePrevWordStoplists.set(ruleIndex, null);
+			return null;
+		}
+		const normalized = new Set(stoplist.map((word) => normalizeArabicForComparison(word)).filter(Boolean));
+		compiledSamePagePrevWordStoplists.set(ruleIndex, normalized);
+		return normalized;
+	};
+	const getPreviousPageContent = (boundaryIndex) => {
+		if (boundaryIndex <= 0) return "";
+		const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+		return matchContent.slice(prevBoundary.start, prevBoundary.end);
+	};
 	const getPrevPageLastNonWsChar = (boundaryIndex) => {
 		if (boundaryIndex <= 0) return "";
 		const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
@@ -3170,13 +3563,24 @@ const createPageStartGuardChecker = (matchContent, pageMap) => {
 		}
 		return "";
 	};
+	const getCurrentPageContentBeforeMatch = (matchStart) => {
+		const pageId = pageMap.getId(matchStart);
+		const boundaryIndex = pageIdToBoundaryIndex.get(pageId);
+		if (boundaryIndex === void 0) return "";
+		const boundary = pageMap.boundaries[boundaryIndex];
+		return matchContent.slice(boundary.start, matchStart);
+	};
 	return (rule, ruleIndex, matchStart) => {
 		const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
-		if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
-		const prevReq = getPageStartPrevRegex(rule, ruleIndex);
-		if (!prevReq) return true;
-		const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
-		return lastChar ? prevReq.test(lastChar) : false;
+		if (boundaryIndex !== void 0 && boundaryIndex !== 0) {
+			const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+			if (prevReq) {
+				const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+				if (!lastChar || !prevReq.test(lastChar)) return false;
+			}
+			return shouldAllowPageStartMatch(getPreviousPageContent(boundaryIndex), getPrevWordStoplist(rule, ruleIndex));
+		}
+		return shouldAllowSamePageMatch(getCurrentPageContentBeforeMatch(matchStart), getSamePagePrevWordStoplist(rule, ruleIndex));
 	};
 };
 /**
@@ -3212,10 +3616,10 @@ const attemptFastFuzzyMatch = (matchContent, lineStart, { compiled, kind, rule,
 /**
 * Processes matches for all fast-fuzzy rules at a specific line start.
 */
-const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart, splitPointsByRule) => {
+const processFastFuzzyMatchesAt = (matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule) => {
 	for (const ffRule of fastFuzzyRules) {
 		if (!passesRuleConstraints$1(ffRule.rule, pageId)) continue;
-		if (isPageStart && !passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
+		if (!passesPageStartGuard(ffRule.rule, ffRule.ruleIndex, lineStart)) continue;
 		attemptFastFuzzyMatch(matchContent, lineStart, ffRule, splitPointsByRule);
 	}
 };
@@ -3230,19 +3634,17 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
 			currentBoundary = pageMap.boundaries[boundaryIdx];
 		}
 	};
-	const isPageStart = (offset) => offset === currentBoundary?.start;
 	for (let lineStart = 0; lineStart <= matchContent.length;) {
 		advanceBoundaryTo(lineStart);
 		const pageId = currentBoundary?.id ?? 0;
 		if (lineStart >= matchContent.length) break;
-		processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, isPageStart(lineStart), splitPointsByRule);
+		processFastFuzzyMatchesAt(matchContent, lineStart, pageId, fastFuzzyRules, passesPageStartGuard, splitPointsByRule);
 		const nextNl = matchContent.indexOf("\n", lineStart);
 		if (nextNl === -1) break;
 		lineStart = nextNl + 1;
 	}
 	return splitPointsByRule;
 };
 //#endregion
 //#region src/segmentation/split-point-helpers.ts
 const MAX_REGEX_ITERATIONS = 1e5;
@@ -3256,7 +3658,7 @@ const buildContentOffsets = (match, ruleInfo) => {
 	if (!ruleInfo.usesLineStartsAfter) return {};
 	const captured = match.groups?.[`${ruleInfo.prefix}__content`];
 	if (captured === void 0) return {};
-	return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
+	return { contentStartOffset: (match.groups?.[ruleInfo.prefix] ?? match[0]).length - captured.length };
 };
 const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
 const createSplitPointFromMatch = (match, rule, ruleInfo) => {
@@ -3271,7 +3673,32 @@ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
 		wordIndex
 	};
 };
+const addSplitPoint = (splitPointsByRule, originalIndex, point) => {
+	const arr = splitPointsByRule.get(originalIndex);
+	if (!arr) {
+		splitPointsByRule.set(originalIndex, [point]);
+		return;
+	}
+	arr.push(point);
+};
+/**
+* Executes a combined regex over the content for combinable rules and records
+* any resulting split points into `splitPointsByRule`.
+*
+* This function mutates `splitPointsByRule` in place and throws if the regex
+* iteration guard is exceeded.
+*
+* @param matchContent - Concatenated content being segmented
+* @param combinableRules - Rules that can be combined into a single alternation
+* @param ruleRegexes - Compiled regex metadata aligned with `combinableRules`
+* @param pageMap - Page boundary mapping utilities for the content
+* @param passesPageStartGuard - Callback that decides whether a match is allowed
+* @param splitPointsByRule - Mutable map collecting split points by rule index
+* @param logger - Optional logger for iteration diagnostics
+* @returns Nothing; results are written into `splitPointsByRule`
+*/
 const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
+	assertCombinedRuleAlignment(combinableRules, ruleRegexes);
 	const combinedSource = ruleRegexes.map((r) => r.source).join("|");
 	const combinedRegex = new RegExp(combinedSource, "gm");
 	logger?.debug?.("[segmenter] combined regex built", {
@@ -3286,19 +3713,29 @@ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, page
 			iterations,
 			position: m.index
 		});
-		const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
-		if (matchedIndex !== -1) {
-			const { rule, index: originalIndex } = combinableRules[matchedIndex];
-			if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
-				const arr = splitPointsByRule.get(originalIndex);
-				if (!arr) splitPointsByRule.set(originalIndex, [createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex])]);
-				else arr.push(createSplitPointFromMatch(m, rule, ruleRegexes[matchedIndex]));
-			}
-		}
+		processCombinedMatch(combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, m);
 		if (m[0].length === 0) combinedRegex.lastIndex++;
 		m = combinedRegex.exec(matchContent);
 	}
 };
+const assertCombinedRuleAlignment = (combinableRules, ruleRegexes) => {
+	if (combinableRules.length !== ruleRegexes.length) throw new Error(`processCombinedMatches: combinableRules/ruleRegexes length mismatch (${combinableRules.length} !== ${ruleRegexes.length})`);
+	for (let i = 0; i < combinableRules.length; i++) if (!ruleRegexes[i].source.includes(`(?<${combinableRules[i].prefix}>`)) throw new Error(`processCombinedMatches: regex alignment mismatch for prefix "${combinableRules[i].prefix}" at index ${i}`);
+};
+const processCombinedMatch = (combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, match) => {
+	const matchedIndex = combinableRules.findIndex(({ prefix }) => match.groups?.[prefix] !== void 0);
+	if (matchedIndex === -1) return;
+	const { rule, index: originalIndex } = combinableRules[matchedIndex];
+	if (!passesRuleConstraints(rule, pageMap.getId(match.index)) || !passesPageStartGuard(rule, originalIndex, match.index)) return;
+	addSplitPoint(splitPointsByRule, originalIndex, createSplitPointFromMatch(match, rule, ruleRegexes[matchedIndex]));
+};
+/**
+* Builds compiled regex metadata for each combinable rule while preserving the
+* prefix used to identify the matching branch inside a combined alternation.
+*
+* @param combinableRules - Rules eligible for combined-regex processing
+* @returns Rule regex metadata aligned with the input order
+*/
 const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
 	const built = buildRuleRegex(rule, prefix);
 	return {
@@ -3307,6 +3744,18 @@ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefi
 		source: `(?<${prefix}>${built.regex.source})`
 	};
 });
+/**
+* Processes a standalone rule by matching it independently and appending its
+* resulting split points into `splitPointsByRule`.
+*
+* @param rule - The standalone split rule to evaluate
+* @param ruleIndex - Original rule index in the caller's rules array
+* @param matchContent - Concatenated content being segmented
+* @param pageMap - Page boundary mapping utilities for the content
+* @param passesPageStartGuard - Callback that decides whether a match is allowed
+* @param splitPointsByRule - Mutable map collecting split points by rule index
+* @returns Nothing; results are written into `splitPointsByRule`
+*/
 const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
 	const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
 	const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
@@ -3341,6 +3790,15 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
 	}
 	return matches;
 };
+/**
+* Applies per-rule occurrence filtering and optional debug metadata patches to
+* the collected split points.
+*
+* @param rules - Full rule list in original order
+* @param splitPointsByRule - Split points grouped by originating rule index
+* @param debugMetaKey - Optional metadata key used for debug provenance patches
+* @returns Flattened split points after occurrence filtering and debug merging
+*/
 const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
 	const result = [];
 	rules.forEach((rule, index) => {
@@ -3358,7 +3816,6 @@ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
 	});
 	return result;
 };
 //#endregion
 //#region src/segmentation/segmenter.ts
 /**
@@ -3432,10 +3889,30 @@ const dedupeSplitPoints = (splitPoints) => {
 	const byIndex = /* @__PURE__ */ new Map();
 	for (const p of splitPoints) {
 		const existing = byIndex.get(p.index);
-		if (!existing || p.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || p.meta !== void 0 && existing.meta === void 0) byIndex.set(p.index, p);
+		if (!existing) {
+			byIndex.set(p.index, p);
+			continue;
+		}
+		byIndex.set(p.index, mergeSplitPoints(existing, p));
 	}
 	return [...byIndex.values()].sort((a, b) => a.index - b.index);
 };
+const prefersIncomingSplitPoint = (existing, incoming) => incoming.contentStartOffset !== void 0 && existing.contentStartOffset === void 0 || incoming.meta !== void 0 && existing.meta === void 0;
+const mergeRecord = (existing, incoming) => existing || incoming ? {
+	...existing ?? {},
+	...incoming ?? {}
+} : void 0;
+const mergeSplitPoints = (existing, incoming) => {
+	const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
+	const fallback = preferred === incoming ? existing : incoming;
+	return {
+		...fallback,
+		...preferred,
+		contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
+		meta: mergeRecord(existing.meta, incoming.meta),
+		namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
+	};
+};
 /**
 * If no structural rules produced segments, create a single segment spanning all pages.
 * This allows breakpoint processing to still run.
@@ -3468,7 +3945,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey,
 	});
 	const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
 	if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
-	for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+	for (const { rule, index } of standaloneRules) processStandaloneRule(rule, index, matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
 	return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
 };
 /**
@@ -3508,7 +3985,7 @@ const findBreaksInRange = (startOffset, endOffset, sortedBreaks) => {
 * @returns Content with page-break newlines converted to spaces (or left as-is for `newline`)
 */
 const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
-	if (!content || !content.includes("\n")) return content;
+	if (!content?.includes("\n")) return content;
 	if (pageJoiner === "newline") return content;
 	const breaksInRange = findBreaksInRange(startOffset, startOffset + content.length, pageBreaks);
 	if (breaksInRange.length === 0) return content;
@@ -3616,16 +4093,23 @@ const segmentPages = (pages, options) => {
 * @returns Array of segment objects
 */
 const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
+	const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
+	const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
+	const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
+	const applyMeta = (meta, namedCaptures) => meta || namedCaptures ? {
+		...meta,
+		...namedCaptures
+	} : void 0;
 	/**
 	* Creates a single segment from a content range.
 	*/
 	const createSegment = (start, end, meta, capturedContent, namedCaptures, contentStartOffset) => {
-		const actualStart = start + (contentStartOffset ?? 0);
+		const actualStart = getActualStart(start, contentStartOffset);
 		const sliced = content.slice(actualStart, end);
-		let text = capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
+		let text = trimSegmentText(sliced, capturedContent, contentStartOffset);
 		if (!text) return null;
 		if (!capturedContent) text = convertPageBreaks(text, actualStart, pageMap.pageBreaks, pageJoiner);
-		const adjustedStart = actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
+		const adjustedStart = getAdjustedStart(actualStart, sliced, contentStartOffset);
 		const from = pageMap.getId(adjustedStart);
 		const to = capturedContent ? pageMap.getId(end - 1) : pageMap.getId(adjustedStart + text.length - 1);
 		const seg = {
@@ -3633,10 +4117,8 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
 			from
 		};
 		if (to !== from) seg.to = to;
-		if (meta || namedCaptures) seg.meta = {
-			...meta,
-			...namedCaptures
-		};
+		const mergedMeta = applyMeta(meta, namedCaptures);
+		if (mergedMeta) seg.meta = mergedMeta;
 		return seg;
 	};
 	/**
@@ -3668,659 +4150,6 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
 	}
 	return [...segments, ...createSegmentsFromSplitPoints()];
 };
-//#endregion
-//#region src/recovery.ts
-const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
-const normalizeForCompare = (s, mode) => {
-	if (mode === "none") return s;
-	let out = s;
-	if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
-	out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
-	return out;
-};
-const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
-const buildFixedOptions = (options, selectedRuleIndices) => {
-	const fixedRules = (options.rules ?? []).map((r, idx) => {
-		if (!selectedRuleIndices.has(idx)) return r;
-		if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
-		const { lineStartsAfter, ...rest } = r;
-		return {
-			...rest,
-			lineStartsWith: lineStartsAfter
-		};
-	});
-	return {
-		...options,
-		rules: fixedRules
-	};
-};
-const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
-const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
-	const parts = [];
-	for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
-	const matchContent = parts.join("\n");
-	if (pageJoiner === "newline") return {
-		matchContent,
-		outputContent: matchContent
-	};
-	return {
-		matchContent,
-		outputContent: parts.join(" ")
-	};
-};
-const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
-	const rules = options.rules ?? [];
-	const compiled = [];
-	for (const idx of selectedRuleIndices) {
-		const r = rules[idx];
-		if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
-		const { lineStartsAfter, ...rest } = r;
-		const built = buildRuleRegex({
-			...rest,
-			lineStartsWith: lineStartsAfter
-		});
-		compiled.push({
-			ruleIndex: idx,
-			startsWithRegex: new RegExp(built.regex.source, "mu")
-		});
-	}
-	return compiled;
-};
-const findUniqueAnchorPos = (outputContent, segmentContent) => {
-	for (const len of [
-		80,
-		60,
-		40,
-		30,
-		20,
-		15
-	]) {
-		const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
-		if (!needle.trim()) continue;
-		const first = outputContent.indexOf(needle);
-		if (first === -1) continue;
-		if (outputContent.indexOf(needle, first + 1) === -1) return first;
-	}
-	return null;
-};
-const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
-	const line = matchContent.slice(lineStart);
-	for (const mr of compiledMistaken) {
-		mr.startsWithRegex.lastIndex = 0;
-		const m = mr.startsWithRegex.exec(line);
-		if (!m || m.index !== 0) continue;
-		const markerMatch = m[0];
-		const markerEnd = lineStart + markerMatch.length;
-		if (anchorPos < markerEnd) continue;
-		const gap = matchContent.slice(markerEnd, anchorPos);
-		const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
-		if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
-		return { prefix: recoveredPrefix };
-	}
-	return { reason: "no selected marker pattern matched at anchored line start" };
-};
-const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
-	const fromIdx = pageIdToIndex.get(segment.from);
-	const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
-	if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
-		kind: "unresolved",
-		reason: "segment page range not found in pages"
-	};
-	const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
-	if (!segment.content) return {
-		kind: "unresolved",
-		reason: "empty segment content"
-	};
-	const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
-	if (anchorPos === null) return {
-		kind: "unresolved",
-		reason: "could not uniquely anchor segment content in page range"
-	};
-	const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
-	const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
-	if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
-		kind: "unresolved",
-		reason: found.reason
-	};
-	return {
-		kind: "recovered",
-		recoveredContent: `${found.prefix}${segment.content}`,
-		recoveredPrefix: found.prefix
-	};
-};
-const resolveRuleIndicesSelector = (rules, indicesIn) => {
-	const errors = [];
-	const indices = /* @__PURE__ */ new Set();
-	for (const idx of indicesIn) {
-		if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
-			errors.push(`Selector index out of range: ${idx}`);
-			continue;
-		}
-		const rule = rules[idx];
-		if (!rule || !("lineStartsAfter" in rule)) {
-			errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
-			continue;
-		}
-		indices.add(idx);
-	}
-	return {
-		errors,
-		indices,
-		warnings: []
-	};
-};
-const resolvePredicateSelector = (rules, predicate) => {
-	const errors = [];
-	const warnings = [];
-	const indices = /* @__PURE__ */ new Set();
-	rules.forEach((r, i) => {
-		try {
-			if (!predicate(r, i)) return;
-			if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
-				indices.add(i);
-				return;
-			}
-			warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
-		} catch (e) {
-			const msg = e instanceof Error ? e.message : String(e);
-			errors.push(`Predicate threw at rule ${i}: ${msg}`);
-		}
-	});
-	if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
-	return {
-		errors,
-		indices,
-		warnings
-	};
-};
-const resolvePatternsSelector = (rules, patterns, matchMode) => {
-	const errors = [];
-	const warnings = [];
-	const indices = /* @__PURE__ */ new Set();
-	const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
-	const targets = patterns.map(normalizePattern);
-	for (let pi = 0; pi < patterns.length; pi++) {
-		const rawPattern = patterns[pi];
-		const pat = targets[pi];
-		const matched = [];
-		for (let i = 0; i < rules.length; i++) {
-			const r = rules[i];
-			if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
-			if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
-		}
-		if (matched.length === 0) {
-			errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
-			continue;
-		}
-		if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
-		matched.forEach((i) => {
-			indices.add(i);
-		});
-	}
-	return {
-		errors,
-		indices,
-		warnings
-	};
-};
-const resolveSelectorToRuleIndices = (options, selector) => {
-	const rules = options.rules ?? [];
-	if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
-	if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
-	return resolvePatternsSelector(rules, selector.patterns, selector.match);
-};
-const longestCommonSuffixLength = (a, b) => {
-	const max = Math.min(a.length, b.length);
-	let i = 0;
-	while (i < max) {
-		if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
-		i++;
-	}
-	return i;
-};
-const AMBIGUITY_SCORE_GAP = 5;
-const scoreCandidate = (orig, fixed, normalizeMode) => {
-	if (fixed.content === orig.content) return {
-		fixedIndex: -1,
-		kind: "exact",
-		score: 100
-	};
-	if (fixed.content.endsWith(orig.content)) {
-		const markerLen = fixed.content.length - orig.content.length;
-		return {
-			fixedIndex: -1,
-			kind: "exact_suffix",
-			score: 90 + Math.min(30, markerLen)
-		};
-	}
-	if (normalizeMode !== "none") {
-		const normFixed = normalizeForCompare(fixed.content, normalizeMode);
-		const normOrig = normalizeForCompare(orig.content, normalizeMode);
-		if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
-			const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
-			return {
-				fixedIndex: -1,
-				kind: "normalized_suffix",
-				score: 70 + Math.floor(overlap * 20)
-			};
-		}
-	}
-	return null;
-};
-const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
-	const warnings = [...reportBase.warnings];
-	warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
-	const details = segments.map((s, i) => {
-		const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
-		return {
-			from: s.from,
-			notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
-			originalStartPreview: preview(s.content),
-			segmentIndex: i,
-			status,
-			strategy: "none",
-			to: s.to
-		};
-	});
-	return {
-		report: {
-			...reportBase,
-			details,
-			summary: {
-				mode,
-				recovered: 0,
-				totalSegments: segments.length,
-				unchanged: segments.length,
-				unresolved: selectorErrors.length ? segments.length : 0
-			},
-			warnings
-		},
-		segments
-	};
-};
-const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
-	const recoveredAtIndex = /* @__PURE__ */ new Map();
-	const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
-	if (mode !== "best_effort_then_rerun") return {
-		recoveredAtIndex,
-		recoveredDetailAtIndex
-	};
-	const pageIdToIndex = buildPageIdToIndex(pages);
-	const pageJoiner = options.pageJoiner ?? "space";
-	const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
-	for (let i = 0; i < segments.length; i++) {
-		const orig = segments[i];
-		const r = tryBestEffortRecoverOneSegment(orig, pages, pageIdToIndex, compiledMistaken, pageJoiner);
-		if (r.kind !== "recovered") continue;
-		const seg = {
-			...orig,
-			content: r.recoveredContent
-		};
-		recoveredAtIndex.set(i, seg);
-		recoveredDetailAtIndex.set(i, {
-			from: orig.from,
-			originalStartPreview: preview(orig.content),
-			recoveredPrefixPreview: preview(r.recoveredPrefix),
-			recoveredStartPreview: preview(seg.content),
-			segmentIndex: i,
-			status: "recovered",
-			strategy: "stage1",
-			to: orig.to
-		});
-	}
-	return {
-		recoveredAtIndex,
-		recoveredDetailAtIndex
-	};
-};
-const buildFixedBuckets = (fixedSegments) => {
-	const buckets = /* @__PURE__ */ new Map();
-	for (let i = 0; i < fixedSegments.length; i++) {
-		const k = segmentRangeKey(fixedSegments[i]);
-		const arr = buckets.get(k);
-		if (!arr) buckets.set(k, [i]);
-		else arr.push(i);
-	}
-	return buckets;
-};
-const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
-	let best = null;
-	let secondBestScore = -Infinity;
-	for (const fixedIdx of candidates) {
-		if (usedFixed.has(fixedIdx)) continue;
-		const fixed = fixedSegments[fixedIdx];
-		const scored = scoreCandidate(orig, fixed, normalizeCompare);
-		if (!scored) continue;
-		const candidateScore = scored.score;
-		if (!best || candidateScore > best.score) {
-			secondBestScore = best?.score ?? -Infinity;
-			best = {
-				fixedIdx,
-				score: candidateScore
-			};
-		} else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
-	}
-	if (!best) return { kind: "none" };
-	if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
-	return {
-		fixedIdx: best.fixedIdx,
-		kind: "match"
-	};
-};
-const detailUnresolved = (orig, segmentIndex, notes) => ({
-	from: orig.from,
-	notes,
-	originalStartPreview: preview(orig.content),
-	segmentIndex,
-	status: "unresolved_alignment",
-	strategy: "rerun",
-	to: orig.to
-});
-const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
-	from: orig.from,
-	notes,
-	originalStartPreview: preview(orig.content),
-	segmentIndex,
-	status: "skipped_idempotent",
-	strategy: "rerun",
-	to: orig.to
-});
-const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
-	let recoveredPrefixPreview;
-	if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
-	return {
-		from: orig.from,
-		originalStartPreview: preview(orig.content),
-		recoveredPrefixPreview,
-		recoveredStartPreview: preview(fixed.content),
-		segmentIndex,
-		status: "recovered",
-		strategy: "rerun",
-		to: orig.to
-	};
-};
-const mergeWithRerun = (params) => {
-	const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
-	const usedFixed = /* @__PURE__ */ new Set();
-	const out = [];
-	const details = [];
-	let recovered = 0;
-	let unresolved = 0;
-	let unchanged = 0;
-	for (let i = 0; i < originalSegments.length; i++) {
-		const stage1Recovered = stage1RecoveredAtIndex.get(i);
-		if (stage1Recovered) {
-			out.push(stage1Recovered);
-			recovered++;
-			details.push(recoveredDetailAtIndex.get(i) ?? {
-				from: stage1Recovered.from,
-				originalStartPreview: preview(originalSegments[i].content),
-				recoveredStartPreview: preview(stage1Recovered.content),
-				segmentIndex: i,
-				status: "recovered",
-				strategy: "stage1",
-				to: stage1Recovered.to
-			});
-			continue;
-		}
-		const orig = originalSegments[i];
-		const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
-		if (best.kind === "none") {
-			out.push(orig);
-			unresolved++;
-			details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
-			continue;
-		}
-		if (best.kind === "ambiguous") {
-			out.push(orig);
-			unresolved++;
-			details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
-			continue;
-		}
-		usedFixed.add(best.fixedIdx);
-		const fixed = fixedSegments[best.fixedIdx];
-		if (fixed.content === orig.content) {
-			out.push(orig);
-			unchanged++;
-			details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
-			continue;
-		}
-		out.push({
-			...orig,
-			content: fixed.content
-		});
-		recovered++;
-		details.push(detailRecoveredRerun(orig, fixed, i));
-	}
-	return {
-		details,
-		segments: out,
-		summary: {
-			recovered,
-			unchanged,
-			unresolved
-		}
-	};
-};
-function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
-	const mode = opts?.mode ?? "rerun_only";
-	const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
-	const resolved = resolveSelectorToRuleIndices(options, selector);
-	const reportBase = {
-		byRun: void 0,
-		errors: resolved.errors,
-		warnings: resolved.warnings
-	};
-	if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
-	const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
-	const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
-	const merged = mergeWithRerun({
-		fixedBuckets: buildFixedBuckets(fixedSegments),
-		fixedSegments,
-		normalizeCompare,
-		originalSegments: segments,
-		recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
-		stage1RecoveredAtIndex: stage1.recoveredAtIndex
-	});
-	return {
-		report: {
-			...reportBase,
-			details: merged.details,
-			summary: {
-				mode,
-				recovered: merged.summary.recovered,
-				totalSegments: segments.length,
-				unchanged: merged.summary.unchanged,
-				unresolved: merged.summary.unresolved
-			}
-		},
-		segments: merged.segments
-	};
-}
-function recoverMistakenMarkersForRuns(runs, opts) {
-	const allSegments = [];
-	const byRun = [];
-	const details = [];
-	const warnings = [];
-	const errors = [];
-	let recovered = 0;
-	let unchanged = 0;
-	let unresolved = 0;
-	let offset = 0;
-	for (let i = 0; i < runs.length; i++) {
-		const run = runs[i];
-		const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
-		allSegments.push(...res.segments);
-		for (const d of res.report.details) details.push({
-			...d,
-			segmentIndex: d.segmentIndex + offset
-		});
-		offset += run.segments.length;
-		recovered += res.report.summary.recovered;
-		unchanged += res.report.summary.unchanged;
-		unresolved += res.report.summary.unresolved;
-		warnings.push(...res.report.warnings);
-		errors.push(...res.report.errors);
-		byRun.push({
-			recovered: res.report.summary.recovered,
-			runIndex: i,
-			totalSegments: run.segments.length,
-			unresolved: res.report.summary.unresolved
-		});
-	}
-	return {
-		report: {
-			byRun,
-			details,
-			errors,
-			summary: {
-				mode: opts?.mode ?? "rerun_only",
-				recovered,
-				totalSegments: offset,
-				unchanged,
-				unresolved
-			},
-			warnings
-		},
-		segments: allSegments
-	};
-}
-//#endregion
-//#region src/segmentation/pattern-validator.ts
-const KNOWN_TOKENS = new Set(getAvailableTokens());
-const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
-const buildBareTokenRegex = () => {
-	const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
-	return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
-};
-/**
-* Validates a single pattern for common issues.
-*/
-const validatePattern = (pattern, seenPatterns) => {
-	if (!pattern.trim()) return {
-		message: "Empty pattern is not allowed",
-		type: "empty_pattern"
-	};
-	if (seenPatterns.has(pattern)) return {
-		message: `Duplicate pattern: "${pattern}"`,
-		pattern,
-		type: "duplicate"
-	};
-	seenPatterns.add(pattern);
-	TOKEN_INSIDE_BRACES.lastIndex = 0;
-	for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
-		const name = match[1];
-		if (!KNOWN_TOKENS.has(name)) return {
-			message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
-			suggestion: "Check spelling or use a known token",
-			token: name,
-			type: "unknown_token"
-		};
-	}
-	for (const match of pattern.matchAll(buildBareTokenRegex())) {
-		const [full, name] = match;
-		const idx = match.index;
-		if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
-			message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
-			suggestion: `{{${full}}}`,
-			token: name,
-			type: "missing_braces"
-		};
-	}
-};
-/**
-* Validates an array of patterns, returning parallel array of issues.
-*/
-const validatePatternArray = (patterns) => {
-	const seen = /* @__PURE__ */ new Set();
-	const issues = patterns.map((p) => validatePattern(p, seen));
-	return issues.some(Boolean) ? issues : void 0;
-};
-/**
-* Validates split rules for common pattern issues.
-*
-* Checks for:
-* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
-* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
-* - Duplicate patterns within the same rule
-*
-* @param rules - Array of split rules to validate
-* @returns Array parallel to input with validation results (undefined if no issues)
-*
-* @example
-* const issues = validateRules([
-*   { lineStartsAfter: ['raqms:num'] },  // Missing braces
-*   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
-* ]);
-* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
-* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
-*/
-const validateRules = (rules) => rules.map((rule) => {
-	const result = {};
-	let hasIssues = false;
-	for (const key of [
-		"lineStartsWith",
-		"lineStartsAfter",
-		"lineEndsWith"
-	]) if (key in rule && rule[key]) {
-		const issues = validatePatternArray(rule[key]);
-		if (issues) {
-			result[key] = issues;
-			hasIssues = true;
-		}
-	}
-	if ("template" in rule && rule.template !== void 0) {
-		const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
-		if (issue) {
-			result.template = issue;
-			hasIssues = true;
-		}
-	}
-	return hasIssues ? result : void 0;
-});
-/**
-* Formats a validation result array into a list of human-readable error messages.
-*
-* Useful for displaying validation errors in UIs.
-*
-* @param results - The result array from `validateRules()`
-* @returns Array of formatted error strings
-*
-* @example
-* const issues = validateRules(rules);
-* const errors = formatValidationReport(issues);
-* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
-*/
-const formatValidationReport = (results) => results.flatMap((result, i) => {
-	if (!result) return [];
-	return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => {
-		if (!issue) return null;
-		const loc = `Rule ${i + 1}, ${type}`;
-		if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
-		if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
-		if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
-		return `${loc}: ${issue.message || issue.type}`;
-	})).filter((msg) => msg !== null);
-});
-//#endregion
-//#region src/validation/validation-constants.ts
-/**
-* Validation-specific constants
-*/
-/**
-* Limit for validation issue preview length (characters).
-*/
-const PREVIEW_LIMIT = 140;
-/**
-* Threshold for short segment content (characters).
-* Segments shorter than this will trigger a full-document search fallback
-* if not found in the expected window.
-*/
-const FULL_SEARCH_THRESHOLD = 500;
 //#endregion
 //#region src/validation/validate-segments.ts
 /**
@@ -4329,8 +4158,8 @@ const FULL_SEARCH_THRESHOLD = 500;
 */
 const buildPreview = (text) => {
 	const normalized = text.replace(/\s+/g, " ").trim();
-	if (normalized.length <= PREVIEW_LIMIT) return normalized;
-	return `${normalized.slice(0, PREVIEW_LIMIT)}...`;
+	if (normalized.length <= 140) return normalized;
+	return `${normalized.slice(0, 140)}...`;
 };
 /**
 * Creates a lightweight snapshot of a segment for inclusion in validation checks.
@@ -4358,19 +4187,18 @@ const normalizePages = (pages, options) => {
 */
 const buildJoinedContent = (pages, joiner) => {
 	const boundaries = [];
-	const nonEmptyPages = pages.filter((p) => p.content);
-	const joined = nonEmptyPages.map((p) => p.content).join(joiner);
+	const joined = pages.map((p) => p.content).join(joiner);
 	let offset = 0;
-	for (let i = 0; i < nonEmptyPages.length; i++) {
-		const content = nonEmptyPages[i].content;
+	for (let i = 0; i < pages.length; i++) {
+		const content = pages[i].content;
 		const start = offset;
-		const end = start + content.length - 1;
+		const end = start + content.length;
 		boundaries.push({
 			end,
-			id: nonEmptyPages[i].id,
+			id: pages[i].id,
 			start
 		});
-		offset = end + 1 + (i < nonEmptyPages.length - 1 ? joiner.length : 0);
+		offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
 	}
 	return {
 		boundaries,
@@ -4561,7 +4389,7 @@ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, search
 	const bufferSize = 1e3;
 	const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
 	if (rawMatches.length === 0) {
-		const threshold = validationOptions?.fullSearchThreshold ?? FULL_SEARCH_THRESHOLD;
+		const threshold = validationOptions?.fullSearchThreshold ?? 500;
 		if (content.length < threshold) {
 			const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
 			const validMatch = fullMatches.find((m) => {
@@ -4715,7 +4543,7 @@ const validateSegments = (pages, options, segments, validationOptions) => {
 		}
 	};
 };
 //#endregion
-export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
 //# sourceMappingURL=index.mjs.map