npm - flappa-doormal - Versions diffs - 2.6.4 → 2.7.0 - Mend

flappa-doormal 2.6.4 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.mjs CHANGED Viewed

@@ -156,1328 +156,1440 @@ const makeDiacriticInsensitive = (text) => {
 };
 //#endregion
-//#region src/segmentation/breakpoint-utils.ts
-const WINDOW_PREFIX_LENGTHS = [
-	80,
-	60,
-	40,
-	30,
-	20,
-	15
-];
-const JOINER_PREFIX_LENGTHS = [
-	80,
-	60,
-	40,
-	30,
-	20,
-	15,
-	12,
-	10,
-	8,
-	6
-];
+//#region src/segmentation/tokens.ts
 /**
-* Normalizes a breakpoint to the object form.
-* Strings are converted to { pattern: str } with no constraints.
+* Token-based template system for Arabic text pattern matching.
 *
-* @param bp - Breakpoint as string or object
-* @returns Normalized BreakpointRule object
+* This module provides a human-readable way to define regex patterns using
+* `{{token}}` placeholders that expand to their regex equivalents. It supports
+* named capture groups for extracting matched values into metadata.
+*
+* @module tokens
 *
 * @example
-* normalizeBreakpoint('\\n\\n')
-* // → { pattern: '\\n\\n' }
+* // Simple token expansion
+* expandTokens('{{raqms}} {{dash}}')
+* // → '[\\u0660-\\u0669]+ [-–—ـ]'
 *
-* normalizeBreakpoint({ pattern: '\\n', min: 10 })
-* // → { pattern: '\\n', min: 10 }
+* @example
+* // Named capture groups
+* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
 */
-const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
 /**
-* Checks if a page ID is in an excluded list (single pages or ranges).
+* Token definitions mapping human-readable token names to regex patterns.
 *
-* @param pageId - Page ID to check
-* @param excludeList - List of page IDs or [from, to] ranges to exclude
-* @returns True if page is excluded
+* Tokens are used in template strings with double-brace syntax:
+* - `{{token}}` - Expands to the token's pattern inline (no capture group is added)
+* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
 *
-* @example
-* isPageExcluded(5, [1, 5, 10])
-* // → true
+* @remarks
+* These patterns are designed for Arabic text matching. For diacritic-insensitive
+* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+* which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
-* isPageExcluded(5, [[3, 7]])
-* // → true
+* @example
+* // Using tokens in a split rule
+* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
 *
-* isPageExcluded(5, [[10, 20]])
-* // → false
+* @example
+* // Using tokens with named captures
+* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
 */
-const isPageExcluded = (pageId, excludeList) => {
-	if (!excludeList || excludeList.length === 0) return false;
-	for (const item of excludeList) if (typeof item === "number") {
-		if (pageId === item) return true;
-	} else {
-		const [from, to] = item;
-		if (pageId >= from && pageId <= to) return true;
-	}
-	return false;
-};
 /**
-* Checks if a page ID is within a breakpoint's min/max range and not excluded.
+* Escapes regex metacharacters (parentheses and brackets) in template patterns,
+* but preserves content inside `{{...}}` token delimiters.
 *
-* @param pageId - Page ID to check
-* @param rule - Breakpoint rule with optional min/max/exclude constraints
-* @returns True if page is within valid range
+* This allows users to write intuitive patterns like `({{harf}}):` instead of
+* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+*
+* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
 *
 * @example
-* isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
-* // → true
+* escapeTemplateBrackets('({{harf}}): ')
+* // → '\\({{harf}}\\): '
 *
-* isInBreakpointRange(5, { pattern: '\\n', min: 10 })
-* // → false (below min)
+* @example
+* escapeTemplateBrackets('[{{raqm}}] ')
+* // → '\\[{{raqm}}\\] '
+*
+* @example
+* escapeTemplateBrackets('{{harf}}')
+* // → '{{harf}}' (unchanged - no brackets outside tokens)
 */
-const isInBreakpointRange = (pageId, rule) => {
-	if (rule.min !== void 0 && pageId < rule.min) return false;
-	if (rule.max !== void 0 && pageId > rule.max) return false;
-	return !isPageExcluded(pageId, rule.exclude);
+const escapeTemplateBrackets = (pattern) => {
+	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
+		if (token) return token;
+		return `\\${bracket}`;
+	});
+};
+const RUMUZ_ATOM = `(?:${[
+	"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
+	"خت",
+	"خغ",
+	"بخ",
+	"عخ",
+	"مق",
+	"مت",
+	"عس",
+	"سي",
+	"سن",
+	"كن",
+	"مد",
+	"قد",
+	"خد",
+	"فد",
+	"دل",
+	"كد",
+	"غد",
+	"صد",
+	"دت",
+	"دس",
+	"تم",
+	"فق",
+	"دق",
+	"[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
+	"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
+].join("|")})`;
+const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
+const BASE_TOKENS = {
+	bab: "باب",
+	basmalah: ["بسم الله", "﷽"].join("|"),
+	bullet: "[•*°]",
+	dash: "[-–—ـ]",
+	fasl: ["مسألة", "فصل"].join("|"),
+	harf: "[أ-ي]",
+	harfs: "[أ-ي](?:\\s+[أ-ي])*",
+	kitab: "كتاب",
+	naql: [
+		"حدثني",
+		"وأخبرنا",
+		"حدثنا",
+		"سمعت",
+		"أنبأنا",
+		"وحدثنا",
+		"أخبرنا",
+		"وحدثني",
+		"وحدثنيه"
+	].join("|"),
+	raqm: "[\\u0660-\\u0669]",
+	raqms: "[\\u0660-\\u0669]+",
+	rumuz: RUMUZ_BLOCK,
+	tarqim: "[.!?؟؛]"
 };
 /**
-* Builds an exclude set from a PageRange array for O(1) lookups.
+* Composite token definitions using template syntax.
 *
-* @param excludeList - List of page IDs or [from, to] ranges
-* @returns Set of all excluded page IDs
+* These tokens reference base tokens using `{{token}}` syntax and are
+* automatically expanded to their final regex patterns at module load time.
 *
-* @remarks
-* This expands ranges into explicit page IDs for fast membership checks. For typical
-* book-scale inputs (thousands of pages), this is small and keeps downstream logic
-* simple and fast. If you expect extremely large ranges (e.g., millions of pages),
-* consider avoiding broad excludes or introducing a range-based membership structure.
+* This provides better abstraction - if base tokens change, composites
+* automatically update on the next build.
 *
-* @example
-* buildExcludeSet([1, 5, [10, 12]])
-* // → Set { 1, 5, 10, 11, 12 }
+* @internal
 */
-const buildExcludeSet = (excludeList) => {
-	const excludeSet = /* @__PURE__ */ new Set();
-	for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
-	else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
-	return excludeSet;
-};
+const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
 /**
-* Creates a segment with optional to and meta fields.
-* Returns null if content is empty after trimming.
+* Expands any *composite* tokens (like `{{numbered}}`) into their underlying template form
+* (like `{{raqms}} {{dash}} `).
 *
-* @param content - Segment content
-* @param fromPageId - Starting page ID
-* @param toPageId - Optional ending page ID (omitted if same as from)
-* @param meta - Optional metadata to attach
-* @returns Segment object or null if empty
+* This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
+* and turn it into an editable template where you can add named captures, e.g.:
 *
-* @example
-* createSegment('Hello world', 1, 3, { chapter: 1 })
-* // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
+* - `{{numbered}}` → `{{raqms}} {{dash}} `
+* - then: `{{raqms:num}} {{dash}} ` to capture the number
 *
-* createSegment('   ', 1, undefined, undefined)
-* // → null (empty content)
+* Notes:
+* - This only expands the plain `{{token}}` form (not `{{token:name}}`).
+* - Expansion is repeated until the template stops changing (at most 10 passes) to support nested composites.
 */
-const createSegment = (content, fromPageId, toPageId, meta) => {
-	const trimmed = content.trim();
-	if (!trimmed) return null;
-	const seg = {
-		content: trimmed,
-		from: fromPageId
-	};
-	if (toPageId !== void 0 && toPageId !== fromPageId) seg.to = toPageId;
-	if (meta) seg.meta = meta;
-	return seg;
+const expandCompositeTokensInTemplate = (template) => {
+	let out = template;
+	for (let i = 0; i < 10; i++) {
+		const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => {
+			return COMPOSITE_TOKENS[tokenName] ?? m;
+		});
+		if (next === out) break;
+		out = next;
+	}
+	return out;
 };
 /**
-* Expands breakpoint patterns and pre-computes exclude sets.
+* Expands base tokens in a template string.
+* Used internally to pre-expand composite tokens.
 *
-* @param breakpoints - Array of breakpoint patterns or rules
-* @param processPattern - Function to expand tokens in patterns
-* @returns Array of expanded breakpoints with compiled regexes
+* @param template - Template string with `{{token}}` placeholders
+* @returns Expanded pattern with base tokens replaced
+* @internal
+*/
+const expandBaseTokens = (template) => {
+	return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
+		return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
+	});
+};
+/**
+* Token definitions mapping human-readable token names to regex patterns.
+*
+* Tokens are used in template strings with double-brace syntax:
+* - `{{token}}` - Expands to the token's pattern inline (no capture group is added)
+* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
 *
 * @remarks
-* This function compiles regex patterns dynamically. This can be a ReDoS vector
-* if patterns come from untrusted sources. In typical usage, breakpoint rules
-* are application configuration, not user input.
-*/
-const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
-	const rule = normalizeBreakpoint(bp);
-	const excludeSet = buildExcludeSet(rule.exclude);
-	const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
-		const expandedSkip = processPattern$1(rule.skipWhen);
-		try {
-			return new RegExp(expandedSkip, "mu");
-		} catch (error) {
-			const message = error instanceof Error ? error.message : String(error);
-			throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n  Cause: ${message}`);
-		}
-	})() : null;
-	if (rule.pattern === "") return {
-		excludeSet,
-		regex: null,
-		rule,
-		skipWhenRegex
-	};
-	const expanded = processPattern$1(rule.pattern);
-	try {
-		return {
-			excludeSet,
-			regex: new RegExp(expanded, "gmu"),
-			rule,
-			skipWhenRegex
-		};
-	} catch (error) {
-		const message = error instanceof Error ? error.message : String(error);
-		throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n  Cause: ${message}`);
-	}
-});
-/**
-* Applies a configured joiner at detected page boundaries within a multi-page content chunk.
+* These patterns are designed for Arabic text matching. For diacritic-insensitive
+* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+* which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
-* This is used for breakpoint-generated segments which don't have access to the original
-* `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
-* prefix after the previous boundary, then replace ONLY the single newline immediately before
-* that page start.
+* @example
+* // Using tokens in a split rule
+* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
 *
-* This avoids converting real in-page newlines, while still normalizing page joins consistently.
-*/
-const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
-	if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
-	let updated = content;
-	let searchFrom = 0;
-	for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
-		const pageData = normalizedPages.get(pageIds[pi]);
-		if (!pageData) continue;
-		const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
-		if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
-		if (found > 0) searchFrom = found;
-	}
-	return updated;
-};
-/**
-* Finds the position of a page prefix in content, trying multiple prefix lengths.
+* @example
+* // Using tokens with named captures
+* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+*
+* @example
+* // Using the numbered convenience token
+* { lineStartsAfter: ['{{numbered}}'], split: 'at' }
 */
-const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
-	for (const len of JOINER_PREFIX_LENGTHS) {
-		const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
-		if (!prefix) continue;
-		const pos = content.indexOf(prefix, searchFrom);
-		if (pos > 0) return pos;
-	}
-	return -1;
+const TOKEN_PATTERNS = {
+	...BASE_TOKENS,
+	...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
 };
 /**
-* Estimates how far into the current page `remainingContent` begins.
+* Regex pattern for matching tokens with optional named capture syntax.
 *
-* During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
-* When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
-* expected boundary positions. This helper computes an approximate starting offset by matching
-* a short prefix of `remainingContent` inside the current page content.
+* Matches:
+* - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
+* - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
+* - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
+*
+* @internal
 */
-const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
-	const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
-	if (!currentPageData) return 0;
-	const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
-	const needle = remStart.slice(0, Math.min(30, remStart.length));
-	if (!needle) return 0;
-	const idx = currentPageData.content.indexOf(needle);
-	return idx > 0 ? idx : 0;
-};
+const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
 /**
-* Attempts to find the start position of a target page within remainingContent,
-* anchored near an expected boundary position to reduce collisions.
+* Regex pattern for simple token matching (no capture syntax).
 *
-* This is used to define breakpoint windows in terms of actual content being split, rather than
-* raw per-page offsets which can desync when structural rules strip markers.
+* Matches only `{{token}}` format where token is one or more word characters.
+* Used by `containsTokens()` for quick detection.
+*
+* @internal
 */
-const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
-	const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
-	if (!targetPageData) return -1;
-	const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
-	const searchStart = Math.max(0, approx - 1e4);
-	const searchEnd = Math.min(remainingContent.length, approx + 2e3);
-	const targetTrimmed = targetPageData.content.trimStart();
-	for (const len of WINDOW_PREFIX_LENGTHS) {
-		const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
-		if (!prefix) continue;
-		let pos = remainingContent.indexOf(prefix, searchStart);
-		while (pos !== -1 && pos <= searchEnd) {
-			if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
-			pos = remainingContent.indexOf(prefix, pos + 1);
-		}
-		const last = remainingContent.lastIndexOf(prefix, approx);
-		if (last > 0) return last;
-	}
-	return -1;
-};
+const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
 /**
-* Builds a boundary position map for pages within the given range.
-*
-* This function computes page boundaries once per segment and enables
-* O(log n) page lookups via binary search with `findPageIndexForPosition`.
+* Checks if a query string contains template tokens.
 *
-* Boundaries are derived from segmentContent (post-structural-rules).
-* When the segment starts mid-page, an offset correction is applied to
-* keep boundary estimates aligned with the segment's actual content space.
+* Performs a quick test for `{{token}}` patterns without actually
+* expanding them. Useful for determining whether to apply token
+* expansion to a string.
 *
-* @param segmentContent - Full segment content (already processed by structural rules)
-* @param fromIdx - Starting page index
-* @param toIdx - Ending page index
-* @param pageIds - Array of all page IDs
-* @param normalizedPages - Map of page ID to normalized content
-* @param cumulativeOffsets - Cumulative character offsets (for estimates)
-* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
-*          with a sentinel boundary at segmentContent.length as the last element
+* @param query - String to check for tokens
+* @returns `true` if the string contains at least one `{{token}}` pattern
 *
 * @example
-* // For a 3-page segment:
-* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
-* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
+* containsTokens('{{raqms}} {{dash}}') // → true
+* containsTokens('plain text')          // → false
+* containsTokens('[٠-٩]+ - ')           // → false (raw regex, no tokens)
 */
-const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
-	const boundaryPositions = [0];
-	const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
-	for (let i = fromIdx + 1; i <= toIdx; i++) {
-		const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
-		const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
-		const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
-		if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
-		else {
-			const estimate = Math.max(prevBoundary + 1, expectedBoundary);
-			boundaryPositions.push(Math.min(estimate, segmentContent.length));
-		}
+const containsTokens = (query) => {
+	SIMPLE_TOKEN_REGEX.lastIndex = 0;
+	return SIMPLE_TOKEN_REGEX.test(query);
+};
+const splitTemplateIntoSegments = (query) => {
+	const segments = [];
+	let lastIndex = 0;
+	TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
+	let match;
+	while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
+		if (match.index > lastIndex) segments.push({
+			type: "text",
+			value: query.slice(lastIndex, match.index)
+		});
+		segments.push({
+			type: "token",
+			value: match[0]
+		});
+		lastIndex = match.index + match[0].length;
 	}
-	boundaryPositions.push(segmentContent.length);
-	return boundaryPositions;
+	if (lastIndex < query.length) segments.push({
+		type: "text",
+		value: query.slice(lastIndex)
+	});
+	return segments;
+};
+const maybeApplyFuzzyToText = (text, fuzzyTransform) => {
+	if (fuzzyTransform && /[\u0600-\u06FF]/u.test(text)) return fuzzyTransform(text);
+	return text;
+};
+const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => {
+	if (!fuzzyTransform) return tokenPattern;
+	return tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
+};
+const parseTokenLiteral = (literal) => {
+	TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
+	const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
+	if (!tokenMatch) return null;
+	const [, tokenName, captureName] = tokenMatch;
+	return {
+		captureName,
+		tokenName
+	};
+};
+const createCaptureRegistry = (capturePrefix) => {
+	const captureNames = [];
+	const captureNameCounts = /* @__PURE__ */ new Map();
+	const register = (baseName) => {
+		const count = captureNameCounts.get(baseName) ?? 0;
+		captureNameCounts.set(baseName, count + 1);
+		const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
+		const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+		captureNames.push(prefixedName);
+		return prefixedName;
+	};
+	return {
+		captureNames,
+		register
+	};
+};
+const expandTokenLiteral = (literal, opts) => {
+	const parsed = parseTokenLiteral(literal);
+	if (!parsed) return literal;
+	const { tokenName, captureName } = parsed;
+	if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
+	let tokenPattern = TOKEN_PATTERNS[tokenName];
+	if (!tokenPattern) return literal;
+	tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
+	if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
+	return tokenPattern;
 };
 /**
-* Binary search to find which page a position falls within.
-* Uses "largest i where boundaryPositions[i] <= position" semantics.
+* Expands template tokens with support for named captures.
 *
-* @param position - Character position in segmentContent
-* @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
-* @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
-* @returns Page index in pageIds array
+* This is the primary token expansion function that handles all token syntax:
+* - `{{token}}` → Expands to the token's pattern (no capture group)
+* - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
+* - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
 *
-* @example
-* // With boundaries [0, 20, 40, 60] and fromIdx=0:
-* findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
-* findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
-* findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
+* Unknown tokens are left as-is in the output, allowing for partial templates.
+*
+* @param query - The template string containing tokens
+* @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+*                         Applied to both token patterns and plain Arabic text between tokens.
+*                         Typically `makeDiacriticInsensitive` from the fuzzy module.
+* @returns Object with expanded pattern, capture names, and capture flag
+*
+* @example
+* // Simple token expansion
+* expandTokensWithCaptures('{{raqms}} {{dash}}')
+* // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
+*
+* @example
+* // Named capture
+* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+*
+* @example
+* // Capture-only token
+* expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
+*
+* @example
+* // With fuzzy transform
+* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
 */
-const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
-	if (boundaryPositions.length <= 1) return fromIdx;
-	let left = 0;
-	let right = boundaryPositions.length - 2;
-	while (left < right) {
-		const mid = Math.ceil((left + right) / 2);
-		if (boundaryPositions[mid] <= position) left = mid;
-		else right = mid - 1;
-	}
-	return fromIdx + left;
+const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
+	const segments = splitTemplateIntoSegments(query);
+	const registry = createCaptureRegistry(capturePrefix);
+	const processedParts = segments.map((segment) => {
+		if (segment.type === "text") return maybeApplyFuzzyToText(segment.value, fuzzyTransform);
+		return expandTokenLiteral(segment.value, {
+			capturePrefix,
+			fuzzyTransform,
+			registerCapture: registry.register
+		});
+	});
+	return {
+		captureNames: registry.captureNames,
+		hasCaptures: registry.captureNames.length > 0,
+		pattern: processedParts.join("")
+	};
 };
 /**
-* Finds the end position of a breakpoint window inside `remainingContent`.
+* Expands template tokens in a query string to their regex equivalents.
 *
-* The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
-* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
-* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
+* This is the simple version without capture support. It returns only the
+* expanded pattern string, not capture metadata.
+*
+* Unknown tokens are left as-is, allowing for partial templates.
+*
+* @param query - Template string containing `{{token}}` placeholders
+* @returns Expanded regex pattern string
+*
+* @example
+* expandTokens('، {{raqms}}')     // → '، [\\u0660-\\u0669]+'
+* expandTokens('{{raqm}}*')       // → '[\\u0660-\\u0669]*'
+* expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+* expandTokens('{{unknown}}')     // → '{{unknown}}' (left as-is)
+*
+* @see expandTokensWithCaptures for full capture group support
 */
-const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
-	if (windowEndIdx >= toIdx) return remainingContent.length;
-	const desiredNextIdx = windowEndIdx + 1;
-	const minNextIdx = currentFromIdx + 1;
-	const maxNextIdx = Math.min(desiredNextIdx, toIdx);
-	const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
-	for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
-		const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
-		const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
-		if (pos > 0) return pos;
-	}
-	return remainingContent.length;
-};
+const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
 /**
-* Finds exclusion-based break position using raw cumulative offsets.
+* Converts a template string to a compiled RegExp.
 *
-* This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
-* Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
+* Expands all tokens and attempts to compile the result as a RegExp
+* with Unicode flag. Returns `null` if the resulting pattern is invalid.
+*
+* @remarks
+* This function dynamically compiles regular expressions from template strings.
+* If templates may come from untrusted sources, be aware of potential ReDoS
+* (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+* Consider validating pattern complexity or applying execution timeouts when
+* running user-submitted patterns.
+*
+* @param template - Template string containing `{{token}}` placeholders
+* @returns Compiled RegExp with 'u' flag, or `null` if invalid
+*
+* @example
+* templateToRegex('، {{raqms}}')  // → /، [٠-٩]+/u
+* templateToRegex('{{raqms}}+')   // → null ('[٠-٩]++' is not a valid JavaScript regex: nothing to repeat)
+* templateToRegex('(((')          // → null (invalid regex)
 */
-const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
-	const startingPageId = pageIds[currentFromIdx];
-	if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
-	for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
-		const pageId = pageIds[pageIdx];
-		if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
+const templateToRegex = (template) => {
+	const expanded = expandTokens(template);
+	try {
+		return new RegExp(expanded, "u");
+	} catch {
+		return null;
 	}
-	return -1;
 };
 /**
-* Checks if any page in a range is excluded by the given exclude set.
+* Lists all available token names defined in `TOKEN_PATTERNS`.
 *
-* @param excludeSet - Set of excluded page IDs
-* @param pageIds - Array of page IDs
-* @param fromIdx - Start index (inclusive)
-* @param toIdx - End index (inclusive)
-* @returns True if any page in range is excluded
+* Useful for documentation, validation, or building user interfaces
+* that show available tokens.
+*
+* @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
+*
+* @example
+* getAvailableTokens()
+* // → ['bab', 'basmalah', 'bullet', 'dash', 'fasl', 'harf', 'harfs', 'kitab', 'naql', 'raqm', 'raqms', 'rumuz', 'tarqim', 'numbered']
 */
-const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
-	if (excludeSet.size === 0) return false;
-	for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
-	return false;
-};
+const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
 /**
-* Finds the position of the next page content within remaining content.
-* Returns -1 if not found.
+* Gets the regex pattern for a specific token name.
 *
-* @param remainingContent - Content to search in
-* @param nextPageData - Normalized data for the next page
-* @returns Position of next page content, or -1 if not found
+* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+* without any expansion or capture group wrapping.
+*
+* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+* @returns The regex pattern string, or `undefined` if token doesn't exist
+*
+* @example
+* getTokenPattern('raqms')   // → '[\\u0660-\\u0669]+'
+* getTokenPattern('dash')    // → '[-–—ـ]'
+* getTokenPattern('unknown') // → undefined
 */
-const findNextPagePosition = (remainingContent, nextPageData) => {
-	const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
-	if (searchPrefix.length === 0) return -1;
-	const pos = remainingContent.indexOf(searchPrefix);
-	return pos > 0 ? pos : -1;
+const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
+/**
+* Global regex that spots a fuzzy-default token placeholder in a pattern string.
+* Accepts both the plain `{{token}}` form and the named `{{token:name}}` form.
+*/
+const FUZZY_TOKEN_REGEX = (() => {
+	const fuzzyNames = ["bab", "basmalah", "fasl", "kitab", "naql"];
+	return new RegExp(`\\{\\{(?:${fuzzyNames.join("|")})(?::\\w+)?\\}\\}`, "g");
+})();
+/**
+* Tells whether a pattern (or any pattern in an array) uses a token that should
+* default to fuzzy matching.
+*
+* The fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
+*
+* @param patterns - A single pattern string or an array of pattern strings
+* @returns `true` when at least one pattern contains a fuzzy-default token
+*
+* @example
+* shouldDefaultToFuzzy('{{bab}} الإيمان')     // true
+* shouldDefaultToFuzzy('{{raqms}} {{dash}}')  // false
+* shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
+*/
+const shouldDefaultToFuzzy = (patterns) => {
+	const candidates = Array.isArray(patterns) ? patterns : [patterns];
+	for (const candidate of candidates) {
+		// The regex carries the `g` flag, so rewind it before every test.
+		FUZZY_TOKEN_REGEX.lastIndex = 0;
+		if (FUZZY_TOKEN_REGEX.test(candidate)) return true;
+	}
+	return false;
 };
+//#endregion
+//#region src/segmentation/pattern-validator.ts
 /**
-* Finds matches within a window and returns the selected position based on preference.
+* Pattern validation utilities for detecting common mistakes in rule patterns.
 *
-* @param windowContent - Content to search
-* @param regex - Regex to match
-* @param prefer - 'longer' for last match, 'shorter' for first match
-* @returns Break position after the selected match, or -1 if no matches
+* These utilities help catch typos and issues early, before rules are used
+* for segmentation.
 */
-const findPatternBreakPosition = (windowContent, regex, prefer) => {
-	let first;
-	let last;
-	for (const m of windowContent.matchAll(regex)) {
-		const match = {
-			index: m.index,
-			length: m[0].length
-		};
-		if (!first) first = match;
-		last = match;
-	}
-	if (!first) return -1;
-	const selected = prefer === "longer" ? last : first;
-	return selected.index + selected.length;
+const KNOWN_TOKENS = new Set(getAvailableTokens());
+const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
+/**
+* Builds a fresh global regex that finds known token names (optionally followed
+* by `:name`) that are not directly preceded by `{{` and not directly followed by `}}`.
+* Longer names are placed first in the alternation.
+*/
+const buildBareTokenRegex = () => {
+	const longestFirst = Array.from(KNOWN_TOKENS).sort((left, right) => right.length - left.length);
+	const alternation = longestFirst.join("|");
+	return new RegExp(`(?<!\\{\\{)(${alternation})(?::\\w+)?(?!\\}\\})`, "g");
 };
 /**
-* Handles page boundary breakpoint (empty pattern).
-* Returns break position or -1 if no valid position found.
+* Validates a single pattern for common issues.
 */
-const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
-	const nextPageIdx = windowEndIdx + 1;
-	if (nextPageIdx <= toIdx) {
-		const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
-		if (nextPageData) {
-			const pos = findNextPagePosition(remainingContent, nextPageData);
-			if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
-		}
+const validatePattern = (pattern, seenPatterns) => {
+	// Check 1: exact duplicate of a pattern already recorded in `seenPatterns`.
+	if (seenPatterns.has(pattern)) return {
+		message: `Duplicate pattern: "${pattern}"`,
+		type: "duplicate"
+	};
+	// Side effect: every non-duplicate pattern is recorded in the caller's set.
+	seenPatterns.add(pattern);
+	// Check 2: each `{{name}}` / `{{name:capture}}` placeholder must use a known token name.
+	const tokensInBraces = [...pattern.matchAll(TOKEN_INSIDE_BRACES)];
+	for (const match of tokensInBraces) {
+		const tokenName = match[1];
+		if (!KNOWN_TOKENS.has(tokenName)) return {
+			message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
+			suggestion: `Check spelling or use a known token`,
+			type: "unknown_token"
+		};
+	}
+	// Check 3: known token names that appear without surrounding braces.
+	const bareTokenRegex = buildBareTokenRegex();
+	const bareMatches = [...pattern.matchAll(bareTokenRegex)];
+	for (const match of bareMatches) {
+		const tokenName = match[1];
+		const fullMatch = match[0];
+		const matchIndex = match.index;
+		// Up to two characters on each side of the match.
+		const before = pattern.slice(Math.max(0, matchIndex - 2), matchIndex);
+		const after = pattern.slice(matchIndex + fullMatch.length, matchIndex + fullMatch.length + 2);
+		// Reported only when neither side is a brace pair.
+		if (before !== "{{" && after !== "}}") return {
+			message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
+			suggestion: `{{${fullMatch}}}`,
+			type: "missing_braces"
+		};
 	}
-	return Math.min(windowEndPosition, remainingContent.length);
+	// Falling through returns `undefined`: no check fired for this pattern.
 };
 /**
-* Tries to find a break position within the current window using breakpoint patterns.
-* Returns the break position or -1 if no suitable break was found.
-*
-* @param remainingContent - Content remaining to be segmented
-* @param currentFromIdx - Current starting page index
-* @param toIdx - Ending page index
-* @param windowEndIdx - Maximum window end index
-* @param ctx - Breakpoint context with page data and patterns
-* @returns Break position in the content, or -1 if no break found
+* Validates an array of patterns, returning a parallel array of issues, or `undefined` when no pattern has an issue.
 */
-const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
-	const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
-	for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
-		if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
-		if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
-		if (skipWhenRegex?.test(remainingContent)) continue;
-		if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
-		const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
-		if (breakPos > 0) return breakPos;
-	}
-	return -1;
+const validatePatternArray = (patterns) => {
+	// One set shared across the array, so a pattern repeated within it is reported as a duplicate.
+	const alreadySeen = new Set();
+	const perPatternIssues = patterns.map((pattern) => validatePattern(pattern, alreadySeen));
+	const anyIssue = perPatternIssues.some((issue) => issue !== void 0);
+	return anyIssue ? perPatternIssues : void 0;
 };
-//#endregion
-//#region src/segmentation/breakpoint-processor.ts
 /**
-* Breakpoint post-processing engine extracted from segmenter.ts.
+* Validates split rules for common pattern issues.
 *
-* This module is intentionally split into small helpers to reduce cognitive complexity
-* and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
-*/
-const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
-const buildNormalizedPagesMap = (pages, normalizedContent) => {
-	const normalizedPages = /* @__PURE__ */ new Map();
-	for (let i = 0; i < pages.length; i++) {
-		const content = normalizedContent[i];
-		normalizedPages.set(pages[i].id, {
-			content,
-			index: i,
-			length: content.length
-		});
-	}
-	return normalizedPages;
+* Checks for:
+* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
+* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
+* - Duplicate patterns within the same pattern array (e.g., the same entry twice in `lineStartsWith`)
+*
+* @param rules - Array of split rules to validate
+* @returns Array parallel to input with validation results (undefined if no issues)
+*
+* @example
+* const issues = validateRules([
+*   { lineStartsAfter: ['raqms:num'] },  // Missing braces
+*   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
+* ]);
+* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
+* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
+*/
+const validateRules = (rules) => {
+	// Array-valued pattern fields, checked in this order so result keys keep the same order.
+	const arrayFields = ["lineStartsWith", "lineStartsAfter", "lineEndsWith"];
+	return rules.map((rule) => {
+		const report = {};
+		for (const field of arrayFields) {
+			if (!(field in rule) || !rule[field]) continue;
+			const fieldIssues = validatePatternArray(rule[field]);
+			if (fieldIssues) report[field] = fieldIssues;
+		}
+		if ("template" in rule && rule.template) {
+			// A template is a single pattern, so it gets its own fresh seen-set.
+			const templateIssue = validatePattern(rule.template, new Set());
+			if (templateIssue) report.template = templateIssue;
+		}
+		// Rules without any issue map to `undefined`, keeping the output parallel to the input.
+		return Object.keys(report).length > 0 ? report : void 0;
+	});
 };
-const buildCumulativeOffsets = (pageIds, normalizedPages) => {
-	const cumulativeOffsets = [0];
-	let totalOffset = 0;
-	for (let i = 0; i < pageIds.length; i++) {
-		const pageData = normalizedPages.get(pageIds[i]);
-		totalOffset += pageData ? pageData.length : 0;
-		if (i < pageIds.length - 1) totalOffset += 1;
-		cumulativeOffsets.push(totalOffset);
+//#endregion
+//#region src/segmentation/replace.ts
+// Flags returned when a replace rule supplies no flags of its own.
+const DEFAULT_REPLACE_FLAGS = "gu";
+/**
+* Validates a replace-rule flag string and returns it in canonical form.
+*
+* @param flags - Flag characters supplied by the rule; a falsy value selects the default
+* @returns The flags in the fixed order `g i m s y u`, always including `g` and `u`
+* @throws Error when `flags` contains a character outside `g i m s u y`
+*/
+const normalizeReplaceFlags = (flags) => {
+	if (!flags) return DEFAULT_REPLACE_FLAGS;
+	const allowed = new Set([
+		"g",
+		"i",
+		"m",
+		"s",
+		"u",
+		"y"
+	]);
+	const set = /* @__PURE__ */ new Set();
+	for (const ch of flags) {
+		if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
+		set.add(ch);
 	}
-	return cumulativeOffsets;
+	// `g` and `u` are enforced whatever the caller supplied.
+	set.add("g");
+	set.add("u");
+	// Emit in a fixed order; the set has already collapsed repeated characters.
+	return [
+		"g",
+		"i",
+		"m",
+		"s",
+		"y",
+		"u"
+	].filter((c) => set.has(c)).join("");
 };
-const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
-const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
-	const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
-	let windowEndIdx = currentFromIdx;
-	for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
-	else break;
-	return windowEndIdx;
+/**
+* Compiles replace rules into `{ pageIdSet, re, replacement }` records, in input order.
+* Rules whose `pageIds` is an empty array are left out.
+*/
+const compileReplaceRules = (rules) => {
+	const compiledRules = [];
+	for (const { flags, pageIds, regex, replacement } of rules) {
+		// `pageIds: []` scopes the rule to no page at all, so it is dropped before compiling.
+		if (pageIds && pageIds.length === 0) continue;
+		const re = new RegExp(regex, normalizeReplaceFlags(flags));
+		const pageIdSet = pageIds ? new Set(pageIds) : void 0;
+		compiledRules.push({
+			pageIdSet,
+			re,
+			replacement
+		});
+	}
+	return compiledRules;
 };
-const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
-const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
 /**
-* Computes the actual start and end page indices for a piece using
-* precomputed boundary positions and binary search.
+* Applies ordered regex replacements to page content (per page).
 *
-* @param pieceStartPos - Start position of the piece in the full segment content
-* @param pieceEndPos - End position (exclusive) of the piece
-* @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
-* @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
-* @param toIdx - Maximum page index
-* @returns Object with actualStartIdx and actualEndIdx
+* - Replacement rules are applied in array order.
+* - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
+* - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
+*
+* This function is intentionally **pure**:
+* it returns a new pages array only when changes are needed, otherwise it returns the original pages.
 */
-const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
-	const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
-	const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
-	return {
-		actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
-		actualStartIdx
-	};
-};
-const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
-	let nextFromIdx = actualEndIdx;
-	if (remainingContent && actualEndIdx + 1 <= toIdx) {
-		const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
-		if (nextPageData) {
-			const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
-			const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
-			if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
+const applyReplacements = (pages, rules) => {
+	// Nothing to apply: hand back the caller's array untouched.
+	if (!rules || rules.length === 0 || pages.length === 0) return pages;
+	const compiled = compileReplaceRules(rules);
+	if (compiled.length === 0) return pages;
+	// Tracks whether any page changed, so the original array is returned when none did
+	// (the documented contract: a new array only when changes are needed).
+	let anyPageChanged = false;
+	const nextPages = pages.map((p) => {
+		let content = p.content;
+		for (const rule of compiled) {
+			// Page-scoped rules only apply to the pages they list.
+			if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
+			content = content.replace(rule.re, rule.replacement);
 		}
-	}
-	return nextFromIdx;
+		// Unchanged pages are passed through by reference; changed pages are shallow-copied.
+		if (content === p.content) return p;
+		anyPageChanged = true;
+		return {
+			...p,
+			content
+		};
+	});
+	return anyPageChanged ? nextPages : pages;
 };
-const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
+//#endregion
+//#region src/segmentation/breakpoint-utils.ts
+// Candidate page-prefix lengths, longest first (iterated by findPageStartNearExpectedBoundary).
+const WINDOW_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15];
+// Candidate page-prefix lengths, longest first (iterated by findPrefixPositionInContent).
+const JOINER_PREFIX_LENGTHS = [80, 60, 40, 30, 20, 15, 12, 10, 8, 6];
 /**
-* Finds the break offset within a window, trying exclusions first, then patterns.
+* Normalizes a breakpoint to the object form.
+* Strings are converted to { pattern: str } with no constraints.
 *
-* @returns Break offset relative to remainingContent, or windowEndPosition as fallback
+* @param bp - Breakpoint as string or object
+* @returns Normalized BreakpointRule object
+*
+* @example
+* normalizeBreakpoint('\\n\\n')
+* // → { pattern: '\\n\\n' }
+*
+* normalizeBreakpoint({ pattern: '\\n', min: 10 })
+* // → { pattern: '\\n', min: 10 }
 */
-const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
-	if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
-		const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
-		if (exclusionBreak > 0) return exclusionBreak;
+const normalizeBreakpoint = (bp) => {
+	// A bare string is shorthand for a rule that only has a pattern.
+	if (typeof bp === "string") return { pattern: bp };
+	// Anything else is returned as-is (same reference).
+	return bp;
+};
+/**
+* Checks if a page ID is in an excluded list (single pages or ranges).
+*
+* @param pageId - Page ID to check
+* @param excludeList - List of page IDs or [from, to] ranges to exclude
+* @returns True if page is excluded
+*
+* @example
+* isPageExcluded(5, [1, 5, 10])
+* // → true
+*
+* isPageExcluded(5, [[3, 7]])
+* // → true
+*
+* isPageExcluded(5, [[10, 20]])
+* // → false
+*/
+const isPageExcluded = (pageId, excludeList) => {
+	// A missing or empty list excludes nothing.
+	if (!excludeList || excludeList.length === 0) return false;
+	// Each entry is either a single page ID (a number) or a [from, to] pair.
+	for (const item of excludeList) if (typeof item === "number") {
+		if (pageId === item) return true;
+	} else {
+		const [from, to] = item;
+		// Range bounds are inclusive on both ends.
+		if (pageId >= from && pageId <= to) return true;
 	}
-	const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
-		expandedBreakpoints,
-		normalizedPages,
-		pageIds,
-		prefer
-	});
-	return patternBreak > 0 ? patternBreak : windowEndPosition;
+	return false;
 };
 /**
-* Advances cursor position past any leading whitespace.
+* Checks if a page ID is within a breakpoint's min/max range and not excluded.
+*
+* @param pageId - Page ID to check
+* @param rule - Breakpoint rule with optional min/max/exclude constraints
+* @returns True if page is within valid range
+*
+* @example
+* isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
+* // → true
+*
+* isInBreakpointRange(5, { pattern: '\\n', min: 10 })
+* // → false (below min)
 */
-const skipWhitespace = (content, startPos) => {
-	let pos = startPos;
-	while (pos < content.length && /\s/.test(content[pos])) pos++;
-	return pos;
+const isInBreakpointRange = (pageId, rule) => {
+	// A bound only applies when it is defined on the rule.
+	const belowMin = rule.min !== void 0 && pageId < rule.min;
+	if (belowMin) return false;
+	const aboveMax = rule.max !== void 0 && pageId > rule.max;
+	if (aboveMax) return false;
+	const excluded = isPageExcluded(pageId, rule.exclude);
+	return !excluded;
 };
 /**
-* Processes an oversized segment by iterating through the content and
-* breaking it into smaller pieces that fit within maxPages constraints.
+* Builds an exclude set from a PageRange array for O(1) lookups.
 *
-* Uses precomputed boundary positions for O(log n) page attribution lookups.
+* @param excludeList - List of page IDs or [from, to] ranges
+* @returns Set of all excluded page IDs
+*
+* @remarks
+* This expands ranges into explicit page IDs for fast membership checks. For typical
+* book-scale inputs (thousands of pages), this is small and keeps downstream logic
+* simple and fast. If you expect extremely large ranges (e.g., millions of pages),
+* consider avoiding broad excludes or introducing a range-based membership structure.
+*
+* @example
+* buildExcludeSet([1, 5, [10, 12]])
+* // → Set { 1, 5, 10, 11, 12 }
 */
-const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
-	const result = [];
-	const fullContent = segment.content;
-	let cursorPos = 0;
-	let currentFromIdx = fromIdx;
-	let isFirstPiece = true;
-	const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
-	logger?.debug?.("[breakpoints] boundaryPositions built", {
-		boundaryPositions,
-		fromIdx,
-		fullContentLength: fullContent.length,
-		toIdx
-	});
-	const maxIterations = 1e4;
-	for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
-		const remainingContent = fullContent.slice(cursorPos);
-		if (!remainingContent.trim()) break;
-		const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
-		const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
-		if (remainingSpan <= maxPages && !remainingHasExclusions) {
-			const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
-			if (finalSeg) result.push(finalSeg);
-			break;
-		}
-		const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
-		const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
-		logger?.debug?.(`[breakpoints] iteration=${i}`, {
-			currentFromIdx,
-			cursorPos,
-			windowEndIdx
-		});
-		const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
-		const breakPos = cursorPos + breakOffset;
-		const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
-		const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
-		logger?.trace?.("[breakpoints] piece", {
-			actualEndIdx,
-			actualStartIdx,
-			pieceLength: pieceContent.length
-		});
-		if (pieceContent) {
-			const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
-			if (pieceSeg) result.push(pieceSeg);
-		}
-		cursorPos = skipWhitespace(fullContent, breakPos);
-		currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
-		isFirstPiece = false;
-	}
-	logger?.debug?.("[breakpoints] done", { resultCount: result.length });
-	return result;
+const buildExcludeSet = (excludeList) => {
+	const excludedIds = new Set();
+	for (const entry of excludeList || []) {
+		if (typeof entry === "number") {
+			excludedIds.add(entry);
+			continue;
+		}
+		// Range entry: add every ID from entry[0] through entry[1], inclusive.
+		for (let id = entry[0]; id <= entry[1]; id++) excludedIds.add(id);
+	}
+	return excludedIds;
 };
 /**
-* Applies breakpoints to oversized segments.
+* Creates a segment with optional to and meta fields.
+* Returns null if content is empty after trimming.
 *
-* Note: This is an internal engine used by `segmentPages()`.
+* @param content - Segment content
+* @param fromPageId - Starting page ID
+* @param toPageId - Optional ending page ID (omitted if same as from)
+* @param meta - Optional metadata to attach
+* @returns Segment object or null if empty
+*
+* @example
+* createSegment('Hello world', 1, 3, { chapter: 1 })
+* // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
+*
+* createSegment('   ', 1, undefined, undefined)
+* // → null (empty content)
 */
-const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
-	const pageIds = pages.map((p) => p.id);
-	const pageIdToIndex = buildPageIdToIndexMap(pageIds);
-	const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
-	const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
-	const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
-	const result = [];
-	logger?.info?.("Starting breakpoint processing", {
-		maxPages,
-		segmentCount: segments.length
-	});
-	logger?.debug?.("[breakpoints] inputSegments", {
-		segmentCount: segments.length,
-		segments: segments.map((s) => ({
-			contentLength: s.content.length,
-			from: s.from,
-			to: s.to
-		}))
-	});
-	for (const segment of segments) {
-		const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
-		const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
-		const segmentSpan = (segment.to ?? segment.from) - segment.from;
-		const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
-		if (segmentSpan <= maxPages && !hasExclusions) {
-			result.push(segment);
-			continue;
-		}
-		const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
-		result.push(...broken.map((s) => {
-			const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
-			const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
-			if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
-				...s,
-				content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
-			};
-			return s;
-		}));
-	}
-	logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
-	return result;
+const createSegment = (content, fromPageId, toPageId, meta) => {
+	const text = content.trim();
+	// Whitespace-only content produces no segment.
+	if (text.length === 0) return null;
+	// `to` is only recorded when it names a different page than `from`.
+	const spansPages = toPageId !== void 0 && toPageId !== fromPageId;
+	return {
+		content: text,
+		from: fromPageId,
+		...spansPages ? { to: toPageId } : {},
+		...meta ? { meta } : {}
+	};
 };
-//#endregion
-//#region src/segmentation/match-utils.ts
 /**
-* Utility functions for regex matching and result processing.
+* Expands breakpoint patterns and pre-computes exclude sets.
 *
-* These functions were extracted from `segmenter.ts` to reduce complexity
-* and enable independent testing. They handle match filtering, capture
-* extraction, and occurrence-based selection.
+* @param breakpoints - Array of breakpoint patterns or rules
+* @param processPattern - Function to expand tokens in patterns
+* @returns Array of expanded breakpoints with compiled regexes
 *
-* @module match-utils
+* @remarks
+* This function compiles regex patterns dynamically. This can be a ReDoS vector
+* if patterns come from untrusted sources. In typical usage, breakpoint rules
+* are application configuration, not user input.
 */
+const expandBreakpoints = (breakpoints, processPattern$1) => {
+	// Compiles an already-expanded source; a failure is rethrown with a message built by `describeFailure`.
+	const compileOrThrow = (expandedSource, flags, describeFailure) => {
+		try {
+			return new RegExp(expandedSource, flags);
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error);
+			throw new Error(describeFailure(message));
+		}
+	};
+	return breakpoints.map((bp) => {
+		const rule = normalizeBreakpoint(bp);
+		const excludeSet = buildExcludeSet(rule.exclude);
+		let skipWhenRegex = null;
+		if (rule.skipWhen !== void 0) {
+			// Token expansion runs outside the try/catch, so its own errors propagate unchanged.
+			const expandedSkip = processPattern$1(rule.skipWhen);
+			skipWhenRegex = compileOrThrow(expandedSkip, "mu", (message) => `Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n  Cause: ${message}`);
+		}
+		// An empty pattern gets no regex at all (`regex: null`) and is never expanded.
+		let regex = null;
+		if (rule.pattern !== "") {
+			const expanded = processPattern$1(rule.pattern);
+			regex = compileOrThrow(expanded, "gmu", (message) => `Invalid breakpoint regex: ${rule.pattern}\n  Cause: ${message}`);
+		}
+		return {
+			excludeSet,
+			regex,
+			rule,
+			skipWhenRegex
+		};
+	});
+};
 /**
-* Extracts named capture groups from a regex match.
-*
-* Only includes groups that are in the `captureNames` list and have
-* defined values. This filters out positional captures and ensures
-* only explicitly requested named captures are returned.
-*
-* @param groups - The `match.groups` object from `RegExp.exec()`
-* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
-* @returns Object with capture name → value pairs, or `undefined` if none found
-*
-* @example
-* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
-* extractNamedCaptures(match.groups, ['num'])
-* // → { num: '٦٦٩٦' }
+* Applies a configured joiner at detected page boundaries within a multi-page content chunk.
 *
-* @example
-* // No matching captures
-* extractNamedCaptures({}, ['num'])
-* // → undefined
+* This is used for breakpoint-generated segments which don't have access to the original
+* `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
+* prefix after the previous boundary, then replace ONLY the single newline immediately before
+* that page start.
 *
-* @example
-* // Undefined groups
-* extractNamedCaptures(undefined, ['num'])
-* // → undefined
+* This avoids converting real in-page newlines, while still normalizing page joins consistently.
 */
-const extractNamedCaptures = (groups, captureNames) => {
-	if (!groups || captureNames.length === 0) return;
-	const namedCaptures = {};
-	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
-	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
+	const nothingToJoin = joiner === "newline" || fromIdx >= toIdx || !content.includes("\n");
+	if (nothingToJoin) return content;
+	let result = content;
+	let cursor = 0;
+	for (let pageIndex = fromIdx + 1; pageIndex <= toIdx; pageIndex++) {
+		const page = normalizedPages.get(pageIds[pageIndex]);
+		if (!page) continue;
+		const start = findPrefixPositionInContent(result, page.content.trimStart(), cursor);
+		// A page start is only usable when it was found past index 0.
+		if (!(start > 0)) continue;
+		// Swap the single newline right before the page start for a space (length is unchanged).
+		if (result[start - 1] === "\n") result = `${result.slice(0, start - 1)} ${result.slice(start)}`;
+		// Later pages are searched from this page's start onward.
+		cursor = start;
+	}
+	return result;
 };
 /**
-* Gets the last defined positional capture group from a match array.
-*
-* Used for `lineStartsAfter` patterns where the content capture (`.*`)
-* is always at the end of the pattern. Named captures may shift the
-* positional indices, so we iterate backward to find the actual content.
-*
-* @param match - RegExp exec result array
-* @returns The last defined capture group value, or `undefined` if none
-*
-* @example
-* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
-* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
-* getLastPositionalCapture(match)
-* // → 'content'
-*
-* @example
-* // No captures
-* getLastPositionalCapture(['full match'])
-* // → undefined
+* Finds the position of a page prefix in content, trying multiple prefix lengths.
 */
-const getLastPositionalCapture = (match) => {
-	if (match.length <= 1) return;
-	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
+	for (const maxLength of JOINER_PREFIX_LENGTHS) {
+		// `slice` clamps to the string length, so short pages simply yield their whole text.
+		const candidate = trimmedPageContent.slice(0, maxLength).trim();
+		if (candidate.length === 0) continue;
+		const hit = content.indexOf(candidate, searchFrom);
+		// A hit at index 0 is not accepted; the next prefix length is tried instead.
+		if (hit > 0) return hit;
+	}
+	return -1;
 };
 /**
-* Filters matches to only include those within page ID constraints.
+* Estimates how far into the current page `remainingContent` begins.
 *
-* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
-* matches that occur on pages outside the allowed range or explicitly excluded.
+* During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
+* When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
+* expected boundary positions. This helper computes an approximate starting offset by matching
+* a short prefix of `remainingContent` inside the current page content.
+*/
+const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
+	const page = normalizedPages.get(pageIds[currentFromIdx]);
+	if (!page) return 0;
+	// Probe with at most the first 30 characters of the content, leading whitespace ignored.
+	const needle = remainingContent.trimStart().slice(0, 30);
+	if (needle.length === 0) return 0;
+	// A miss (-1) and a hit at the very start of the page both yield offset 0.
+	return Math.max(0, page.content.indexOf(needle));
+};
+/**
+* Attempts to find the start position of a target page within remainingContent,
+* anchored near an expected boundary position to reduce collisions.
 *
-* @param matches - Array of match results to filter
-* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
-* @param getId - Function that returns the page ID for a given offset
-* @returns Filtered array containing only matches within constraints
-*
-* @example
-* const matches = [
-*   { start: 0, end: 10 },   // Page 1
-*   { start: 100, end: 110 }, // Page 5
-*   { start: 200, end: 210 }, // Page 10
-* ];
-* filterByConstraints(matches, { min: 3, max: 8 }, getId)
-* // → [{ start: 100, end: 110 }] (only page 5 match)
+* This is used to define breakpoint windows in terms of actual content being split, rather than
+* raw per-page offsets which can desync when structural rules strip markers.
 */
-const filterByConstraints = (matches, rule, getId) => {
-	return matches.filter((m) => {
-		const id = getId(m.start);
-		if (rule.min !== void 0 && id < rule.min) return false;
-		if (rule.max !== void 0 && id > rule.max) return false;
-		if (isPageExcluded(id, rule.exclude)) return false;
-		return true;
-	});
+const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+	// Note: `_currentFromIdx` is not read anywhere in this function.
+	const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
+	if (!targetPageData) return -1;
+	// Clamp the expected boundary into [0, remainingContent.length].
+	const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
+	// Search window: up to 10,000 characters before and 2,000 characters after the clamped boundary.
+	const searchStart = Math.max(0, approx - 1e4);
+	const searchEnd = Math.min(remainingContent.length, approx + 2e3);
+	const targetTrimmed = targetPageData.content.trimStart();
+	// Try each prefix length in WINDOW_PREFIX_LENGTHS order.
+	for (const len of WINDOW_PREFIX_LENGTHS) {
+		const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
+		if (!prefix) continue;
+		// First choice: an occurrence starting within the window that is preceded by whitespace.
+		let pos = remainingContent.indexOf(prefix, searchStart);
+		while (pos !== -1 && pos <= searchEnd) {
+			if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
+			pos = remainingContent.indexOf(prefix, pos + 1);
+		}
+		// Fallback for this prefix length: the last occurrence starting at or before `approx`;
+		// an occurrence at index 0 is not accepted.
+		const last = remainingContent.lastIndexOf(prefix, approx);
+		if (last > 0) return last;
+	}
+	return -1;
 };
 /**
-* Checks if any rule in the list allows the given page ID.
+* Builds a boundary position map for pages within the given range.
 *
-* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
-* Rules without constraints allow all page IDs.
+* This function computes page boundaries once per segment and enables
+* O(log n) page lookups via binary search with `findPageIndexForPosition`.
 *
-* This is used to determine whether to create a segment for content
-* that appears before any split points (the "first segment").
+* Boundaries are derived from segmentContent (post-structural-rules).
+* When the segment starts mid-page, an offset correction is applied to
+* keep boundary estimates aligned with the segment's actual content space.
 *
-* @param rules - Array of rules with optional `min` and `max` constraints
-* @param pageId - Page ID to check
-* @returns `true` if at least one rule allows the page ID
+* @param segmentContent - Full segment content (already processed by structural rules)
+* @param fromIdx - Starting page index
+* @param toIdx - Ending page index
+* @param pageIds - Array of all page IDs
+* @param normalizedPages - Map of page ID to normalized content
+* @param cumulativeOffsets - Cumulative character offsets (for estimates)
+* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
+*          with a sentinel boundary at segmentContent.length as the last element
 *
 * @example
-* const rules = [
-*   { min: 5, max: 10 },  // Allows pages 5-10
-*   { min: 20 },          // Allows pages 20+
-* ];
+* // For a 3-page segment:
+* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
+* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
+*/
+const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+	// The first page of the segment always starts at position 0.
+	const boundaryPositions = [0];
+	// Offset into the first page at which this segment begins (helper defined elsewhere);
+	// it is subtracted from the raw offset difference below.
+	const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
+	for (let i = fromIdx + 1; i <= toIdx; i++) {
+		// Estimated start of page i inside segmentContent, never negative. Falls back to
+		// segmentContent.length when either cumulative offset is missing.
+		const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
+		const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
+		const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
+		// Accept the content match only if it lies strictly after the previous boundary
+		// and within 2,000 chars of the estimate.
+		if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
+		else {
+			// Otherwise use the estimate, forced past the previous boundary and capped at the content length.
+			const estimate = Math.max(prevBoundary + 1, expectedBoundary);
+			boundaryPositions.push(Math.min(estimate, segmentContent.length));
+		}
+	}
+	// Sentinel entry marking the end of the content.
+	boundaryPositions.push(segmentContent.length);
+	return boundaryPositions;
+};
+/**
+* Binary search to find which page a position falls within.
+* Uses "largest i where boundaryPositions[i] <= position" semantics.
 *
-* anyRuleAllowsId(rules, 7)   // → true (first rule allows)
-* anyRuleAllowsId(rules, 3)   // → false (no rule allows)
-* anyRuleAllowsId(rules, 25)  // → true (second rule allows)
+* @param position - Character position in segmentContent
+* @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
+* @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
+* @returns Page index in pageIds array
 *
 * @example
-* // Rules without constraints allow everything
-* anyRuleAllowsId([{}], 999) // → true
+* // With boundaries [0, 20, 40, 60] and fromIdx=0:
+* findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
+* findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
+* findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
 */
-const anyRuleAllowsId = (rules, pageId) => {
-	return rules.some((r) => {
-		const minOk = r.min === void 0 || pageId >= r.min;
-		const maxOk = r.max === void 0 || pageId <= r.max;
-		return minOk && maxOk;
-	});
-};
-//#endregion
-//#region src/segmentation/replace.ts
-const DEFAULT_REPLACE_FLAGS = "gu";
-const normalizeReplaceFlags = (flags) => {
-	if (!flags) return DEFAULT_REPLACE_FLAGS;
-	const allowed = new Set([
-		"g",
-		"i",
-		"m",
-		"s",
-		"u",
-		"y"
-	]);
-	const set = /* @__PURE__ */ new Set();
-	for (const ch of flags) {
-		if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
-		set.add(ch);
+const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
+	// With zero or one entry there is nothing to search; attribute to the base page.
+	if (boundaryPositions.length <= 1) return fromIdx;
+	let left = 0;
+	// `length - 2` leaves the final entry out of the search; buildBoundaryPositions
+	// documents that entry as the end-of-content sentinel.
+	let right = boundaryPositions.length - 2;
+	while (left < right) {
+		// Round the midpoint up so that `left = mid` always shrinks the range.
+		const mid = Math.ceil((left + right) / 2);
+		if (boundaryPositions[mid] <= position) left = mid;
+		else right = mid - 1;
 	}
-	set.add("g");
-	set.add("u");
-	return [
-		"g",
-		"i",
-		"m",
-		"s",
-		"y",
-		"u"
-	].filter((c) => set.has(c)).join("");
+	// `left` is an offset into boundaryPositions; translate it back to a pageIds index.
+	return fromIdx + left;
 };
-const compileReplaceRules = (rules) => {
-	const compiled = [];
-	for (const r of rules) {
-		if (r.pageIds && r.pageIds.length === 0) continue;
-		const flags = normalizeReplaceFlags(r.flags);
-		const re = new RegExp(r.regex, flags);
-		compiled.push({
-			pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
-			re,
-			replacement: r.replacement
-		});
+/**
+* Finds the end position of a breakpoint window inside `remainingContent`.
+*
+* The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
+* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
+* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
+*/
+const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+	// The window already reaches the last page: everything that remains is inside it.
+	if (windowEndIdx >= toIdx) return remainingContent.length;
+	const desiredNextIdx = windowEndIdx + 1;
+	const minNextIdx = currentFromIdx + 1;
+	const maxNextIdx = Math.min(desiredNextIdx, toIdx);
+	const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
+	// Try the page right after the window first, then progressively earlier pages,
+	// and return the first page start that can be located in the content.
+	for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
+		// Same estimate as in buildBoundaryPositions: raw offset difference minus the
+		// start offset, or the content length when an offset is missing.
+		const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
+		const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
+		if (pos > 0) return pos;
 	}
-	return compiled;
+	// No page start could be located: treat the whole remaining content as the window.
+	return remainingContent.length;
 };
 /**
-* Applies ordered regex replacements to page content (per page).
-*
-* - Replacement rules are applied in array order.
-* - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
-* - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
+* Finds exclusion-based break position using raw cumulative offsets.
 *
-* This function is intentionally **pure**:
-* it returns a new pages array only when changes are needed, otherwise it returns the original pages.
+* This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
+* Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
 */
-const applyReplacements = (pages, rules) => {
-	if (!rules || rules.length === 0 || pages.length === 0) return pages;
-	const compiled = compileReplaceRules(rules);
-	if (compiled.length === 0) return pages;
-	return pages.map((p) => {
-		let content = p.content;
-		for (const rule of compiled) {
-			if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
-			content = content.replace(rule.re, rule.replacement);
-		}
-		if (content === p.content) return p;
-		return {
-			...p,
-			content
-		};
-	});
+const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
+	const startingPageId = pageIds[currentFromIdx];
+	// The starting page itself is excluded by some breakpoint and is not the last page:
+	// break after it, using the raw offset distance to the next page.
+	// NOTE(review): these offsets are measured from the start of a page; if the remaining
+	// content begins mid-page the returned position may overshoot — confirm with callers.
+	if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
+	// Otherwise break right before the first excluded page inside the window.
+	for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
+		const pageId = pageIds[pageIdx];
+		if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
+	}
+	// No excluded page affects this window.
+	return -1;
 };
-//#endregion
-//#region src/segmentation/tokens.ts
 /**
-* Token-based template system for Arabic text pattern matching.
-*
-* This module provides a human-readable way to define regex patterns using
-* `{{token}}` placeholders that expand to their regex equivalents. It supports
-* named capture groups for extracting matched values into metadata.
-*
-* @module tokens
+* Checks if any page in a range is excluded by the given exclude set.
 *
-* @example
-* // Simple token expansion
-* expandTokens('{{raqms}} {{dash}}')
-* // → '[\\u0660-\\u0669]+ [-–—ـ]'
+* @param excludeSet - Set of excluded page IDs
+* @param pageIds - Array of page IDs
+* @param fromIdx - Start index (inclusive)
+* @param toIdx - End index (inclusive)
+* @returns True if any page in range is excluded
+*/
+const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
+	// Fast path: an empty exclude set can never match.
+	if (excludeSet.size === 0) return false;
+	// Both bounds are inclusive; stop at the first excluded page.
+	for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
+	return false;
+};
+/**
+* Finds the position of the next page content within remaining content.
+* Returns -1 if not found (a match at position 0 is also reported as -1).
 *
-* @example
-* // Named capture groups
-* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
-* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+* @param remainingContent - Content to search in
+* @param nextPageData - Normalized data for the next page
+* @returns Position of next page content, or -1 if not found
 */
+const findNextPagePosition = (remainingContent, nextPageData) => {
+	// Use at most the first 30 characters of the trimmed page content as the search key.
+	// `nextPageData.length` is the untrimmed length; `slice` simply stops at the end of
+	// the string if that is larger than the trimmed content.
+	const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
+	if (searchPrefix.length === 0) return -1;
+	const pos = remainingContent.indexOf(searchPrefix);
+	// A match at position 0 is reported as -1, the same as "not found".
+	return pos > 0 ? pos : -1;
+};
 /**
-* Token definitions mapping human-readable token names to regex patterns.
+* Finds matches within a window and returns the selected position based on preference.
 *
-* Tokens are used in template strings with double-brace syntax:
-* - `{{token}}` - Expands to the pattern (non-capturing in context)
-* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
-* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
-*
-* @remarks
-* These patterns are designed for Arabic text matching. For diacritic-insensitive
-* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
-* which applies `makeDiacriticInsensitive()` to the expanded patterns.
-*
-* @example
-* // Using tokens in a split rule
-* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
-*
-* @example
-* // Using tokens with named captures
-* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+* @param windowContent - Content to search
+* @param regex - Regex to match
+* @param prefer - 'longer' for last match, 'shorter' for first match
+* @returns Break position after the selected match, or -1 if no matches
 */
+const findPatternBreakPosition = (windowContent, regex, prefer) => {
+	let first;
+	let last;
+	// `String.prototype.matchAll` throws a TypeError for a regex without the `g` flag.
+	// NOTE(review): presumably the breakpoint regexes are compiled with `g` — confirm in expandBreakpoints.
+	for (const m of windowContent.matchAll(regex)) {
+		const match = {
+			index: m.index,
+			length: m[0].length
+		};
+		// Remember the earliest match once and keep overwriting the latest one.
+		if (!first) first = match;
+		last = match;
+	}
+	if (!first) return -1;
+	// "longer" selects the last match; any other `prefer` value selects the first.
+	const selected = prefer === "longer" ? last : first;
+	// The break is placed immediately after the selected match.
+	return selected.index + selected.length;
+};
 /**
-* Escapes regex metacharacters (parentheses and brackets) in template patterns,
-* but preserves content inside `{{...}}` token delimiters.
-*
-* This allows users to write intuitive patterns like `({{harf}}):` instead of
-* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
-* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
-*
-* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
-* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
-*
-* @example
-* escapeTemplateBrackets('({{harf}}): ')
-* // → '\\({{harf}}\\): '
-*
-* @example
-* escapeTemplateBrackets('[{{raqm}}] ')
-* // → '\\[{{raqm}}\\] '
-*
-* @example
-* escapeTemplateBrackets('{{harf}}')
-* // → '{{harf}}' (unchanged - no brackets outside tokens)
+* Handles page boundary breakpoint (empty pattern).
+* Returns break position or -1 if no valid position found.
 */
-const escapeTemplateBrackets = (pattern) => {
-	return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
-		if (token) return token;
-		return `\\${bracket}`;
-	});
-};
-const RUMUZ_ATOM = `(?:${[
-	"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
-	"خت",
-	"خغ",
-	"بخ",
-	"عخ",
-	"مق",
-	"مت",
-	"عس",
-	"سي",
-	"سن",
-	"كن",
-	"مد",
-	"قد",
-	"خد",
-	"فد",
-	"دل",
-	"كد",
-	"غد",
-	"صد",
-	"دت",
-	"دس",
-	"تم",
-	"فق",
-	"دق",
-	"[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
-	"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
-].join("|")})`;
-const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
-const BASE_TOKENS = {
-	bab: "باب",
-	basmalah: ["بسم الله", "﷽"].join("|"),
-	bullet: "[•*°]",
-	dash: "[-–—ـ]",
-	fasl: ["مسألة", "فصل"].join("|"),
-	harf: "[أ-ي]",
-	harfs: "[أ-ي](?:\\s+[أ-ي])*",
-	kitab: "كتاب",
-	naql: [
-		"حدثني",
-		"وأخبرنا",
-		"حدثنا",
-		"سمعت",
-		"أنبأنا",
-		"وحدثنا",
-		"أخبرنا",
-		"وحدثني",
-		"وحدثنيه"
-	].join("|"),
-	raqm: "[\\u0660-\\u0669]",
-	raqms: "[\\u0660-\\u0669]+",
-	rumuz: RUMUZ_BLOCK,
-	tarqim: "[.!?؟؛]"
+const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
+	const nextPageIdx = windowEndIdx + 1;
+	// If a page follows the window, try to locate where its content starts.
+	if (nextPageIdx <= toIdx) {
+		const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
+		if (nextPageData) {
+			const pos = findNextPagePosition(remainingContent, nextPageData);
+			// Never break later than the window end or past the end of the content.
+			if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
+		}
+	}
+	// No following page, or its start was not found: break at the window end.
+	return Math.min(windowEndPosition, remainingContent.length);
 };
 /**
-* Composite token definitions using template syntax.
-*
-* These tokens reference base tokens using `{{token}}` syntax and are
-* automatically expanded to their final regex patterns at module load time.
-*
-* This provides better abstraction - if base tokens change, composites
-* automatically update on the next build.
+* Tries to find a break position within the current window using breakpoint patterns.
+* Returns the break position or -1 if no suitable break was found.
 *
-* @internal
+* @param remainingContent - Content remaining to be segmented
+* @param currentFromIdx - Current starting page index
+* @param toIdx - Ending page index
+* @param windowEndIdx - Maximum window end index
+* @param ctx - Breakpoint context with page data and patterns
+* @returns Break position in the content, or -1 if no break found
 */
-const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
+const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
+	const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
+	// Breakpoints are tried in array order; the first one that yields a position wins.
+	for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
+		// Skip rules that do not apply to the starting page (helper defined elsewhere).
+		if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
+		// Skip rules that exclude any page inside the current window.
+		if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
+		// NOTE(review): `test` advances `lastIndex` on a regex with the `g` or `y` flag —
+		// confirm skipWhenRegex is compiled without those flags.
+		if (skipWhenRegex?.test(remainingContent)) continue;
+		// A null regex means a page-boundary breakpoint; its result is returned as-is.
+		if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
+		// Pattern breakpoints are only matched against the content inside the window.
+		const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
+		if (breakPos > 0) return breakPos;
+	}
+	return -1;
+};
+//#endregion
+//#region src/segmentation/breakpoint-processor.ts
 /**
-* Expands any *composite* tokens (like `{{numbered}}`) into their underlying template form
-* (like `{{raqms}} {{dash}} `).
-*
-* This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
-* and turn it into an editable template where you can add named captures, e.g.:
-*
-* - `{{numbered}}` → `{{raqms}} {{dash}} `
-* - then: `{{raqms:num}} {{dash}} ` to capture the number
+* Breakpoint post-processing engine extracted from segmenter.ts.
 *
-* Notes:
-* - This only expands the plain `{{token}}` form (not `{{token:name}}`).
-* - Expansion is repeated a few times to support nested composites.
+* This module is intentionally split into small helpers to reduce cognitive complexity
+* and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
 */
-const expandCompositeTokensInTemplate = (template) => {
-	let out = template;
-	for (let i = 0; i < 10; i++) {
-		const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => {
-			return COMPOSITE_TOKENS[tokenName] ?? m;
+/** Builds a Map from page ID to its index in `pageIds` (for a repeated ID the later index wins). */
+const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
+/** Builds a Map from page ID to `{ content, index, length }`, pairing `pages[i]` with `normalizedContent[i]`. */
+const buildNormalizedPagesMap = (pages, normalizedContent) => {
+	const normalizedPages = /* @__PURE__ */ new Map();
+	for (let i = 0; i < pages.length; i++) {
+		// The two arrays are read by the same index, so they are expected to be parallel.
+		const content = normalizedContent[i];
+		normalizedPages.set(pages[i].id, {
+			content,
+			index: i,
+			length: content.length
 		});
-		if (next === out) break;
-		out = next;
 	}
-	return out;
+	return normalizedPages;
 };
-/**
-* Expands base tokens in a template string.
-* Used internally to pre-expand composite tokens.
-*
-* @param template - Template string with `{{token}}` placeholders
-* @returns Expanded pattern with base tokens replaced
-* @internal
-*/
-const expandBaseTokens = (template) => {
-	return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
-		return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
-	});
+/**
+* Builds running character offsets per page: entry `i` is the offset at which page `i`
+* starts, and one extra trailing entry holds the total length.
+*/
+const buildCumulativeOffsets = (pageIds, normalizedPages) => {
+	const cumulativeOffsets = [0];
+	let totalOffset = 0;
+	for (let i = 0; i < pageIds.length; i++) {
+		const pageData = normalizedPages.get(pageIds[i]);
+		// Pages missing from the map contribute zero length.
+		totalOffset += pageData ? pageData.length : 0;
+		// Count one extra character between consecutive pages (none after the last page).
+		// NOTE(review): presumably this is the single-character page joiner — confirm.
+		if (i < pageIds.length - 1) totalOffset += 1;
+		cumulativeOffsets.push(totalOffset);
+	}
+	return cumulativeOffsets;
+};
+/** Returns true when at least one breakpoint's `excludeSet` contains a page in `[fromIdx, toIdx]` (inclusive). */
+const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
+/**
+* Returns the index of the last page, scanning forward from `currentFromIdx` up to `toIdx`,
+* whose ID does not exceed `pageIds[currentFromIdx] + maxPages`.
+*/
+const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
+	// The window is defined by page ID arithmetic, not by index distance.
+	const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
+	let windowEndIdx = currentFromIdx;
+	// Stops at the first page whose ID exceeds the limit.
+	// NOTE(review): this assumes page IDs are numeric and ascending — confirm.
+	for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
+	else break;
+	return windowEndIdx;
 };
+/** Returns the page-ID distance between the last page (`toIdx`) and the current page (`currentFromIdx`). */
+const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
+/**
+* Creates the last segment from all remaining content via `createSegment` (defined elsewhere).
+* The end page is passed only when it differs from the start page, and `meta` only when `includeMeta` is truthy.
+*/
+const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
 /**
-* Token definitions mapping human-readable token names to regex patterns.
-*
-* Tokens are used in template strings with double-brace syntax:
-* - `{{token}}` - Expands to the pattern (non-capturing in context)
-* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
-* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
-*
-* @remarks
-* These patterns are designed for Arabic text matching. For diacritic-insensitive
-* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
-* which applies `makeDiacriticInsensitive()` to the expanded patterns.
-*
-* @example
-* // Using tokens in a split rule
-* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
-*
-* @example
-* // Using tokens with named captures
-* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+* Computes the actual start and end page indices for a piece using
+* precomputed boundary positions and binary search.
 *
-* @example
-* // Using the numbered convenience token
-* { lineStartsAfter: ['{{numbered}}'], split: 'at' }
+* @param pieceStartPos - Start position of the piece in the full segment content
+* @param pieceEndPos - End position (exclusive) of the piece
+* @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
+* @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
+* @param toIdx - Maximum page index
+* @returns Object with actualStartIdx and actualEndIdx
 */
-const TOKEN_PATTERNS = {
-	...BASE_TOKENS,
-	...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
+const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
+	const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
+	// `pieceEndPos` is exclusive, so look up the last character inside the piece,
+	// but never a position before the piece start (covers empty pieces).
+	const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
+	return {
+		// Clamp so the end page never goes beyond the segment's last page.
+		actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
+		actualStartIdx
+	};
+};
+/**
+* Decides which page index the next piece starts on: `actualEndIdx + 1` when the remaining
+* content appears to begin at the start of that page, otherwise `actualEndIdx`.
+*/
+const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
+	let nextFromIdx = actualEndIdx;
+	// Only consider advancing when content remains and a following page exists.
+	if (remainingContent && actualEndIdx + 1 <= toIdx) {
+		const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
+		if (nextPageData) {
+			// Compare up to 30 leading characters in both directions.
+			// NOTE(review): `nextPrefix` is taken from untrimmed page content, while the
+			// remaining content is trimmed at the start — confirm the asymmetry is intended.
+			const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
+			const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
+			if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
+		}
+	}
+	return nextFromIdx;
 };
+/**
+* Creates a segment for one piece via `createSegment` (defined elsewhere).
+* The end page is passed only when it lies after the start page, and `meta` only when `includeMeta` is truthy.
+*/
+const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
 /**
-* Regex pattern for matching tokens with optional named capture syntax.
-*
-* Matches:
-* - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
-* - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
-* - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
+* Finds the break offset within a window, trying exclusions first, then patterns.
 *
-* @internal
+* @returns Break offset relative to remainingContent, or windowEndPosition as fallback
 */
-const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
+const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
+	// Exclusions take priority: an excluded page inside the window forces a break there.
+	if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
+		const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+		if (exclusionBreak > 0) return exclusionBreak;
+	}
+	// Otherwise look for a breakpoint pattern match inside the window.
+	const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+		expandedBreakpoints,
+		normalizedPages,
+		pageIds,
+		prefer
+	});
+	// No usable break found: fall back to the end of the window.
+	return patternBreak > 0 ? patternBreak : windowEndPosition;
+};
 /**
-* Regex pattern for simple token matching (no capture syntax).
-*
-* Matches only `{{token}}` format where token is one or more word characters.
-* Used by `containsTokens()` for quick detection.
-*
-* @internal
+* Advances cursor position past any leading whitespace.
 */
-const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
+const skipWhitespace = (content, startPos) => {
+	let pos = startPos;
+	// Advance while the current character matches `\s`, stopping at the end of the content.
+	while (pos < content.length && /\s/.test(content[pos])) pos++;
+	// Returns the index of the first non-whitespace character, or content.length if none is left.
+	return pos;
+};
 /**
-* Checks if a query string contains template tokens.
-*
-* Performs a quick test for `{{token}}` patterns without actually
-* expanding them. Useful for determining whether to apply token
-* expansion to a string.
-*
-* @param query - String to check for tokens
-* @returns `true` if the string contains at least one `{{token}}` pattern
+* Processes an oversized segment by iterating through the content and
+* breaking it into smaller pieces that fit within maxPages constraints.
 *
-* @example
-* containsTokens('{{raqms}} {{dash}}') // → true
-* containsTokens('plain text')          // → false
-* containsTokens('[٠-٩]+ - ')           // → false (raw regex, no tokens)
+* Uses precomputed boundary positions for O(log n) page attribution lookups.
 */
-const containsTokens = (query) => {
-	SIMPLE_TOKEN_REGEX.lastIndex = 0;
-	return SIMPLE_TOKEN_REGEX.test(query);
-};
-const splitTemplateIntoSegments = (query) => {
-	const segments = [];
-	let lastIndex = 0;
-	TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
-	let match;
-	while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
-		if (match.index > lastIndex) segments.push({
-			type: "text",
-			value: query.slice(lastIndex, match.index)
+const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+	const result = [];
+	const fullContent = segment.content;
+	// `cursorPos` is the read position in fullContent; `currentFromIdx` is the page it is attributed to.
+	let cursorPos = 0;
+	let currentFromIdx = fromIdx;
+	// Only the first emitted piece carries the segment's `meta`.
+	let isFirstPiece = true;
+	// Computed once per segment and reused for every piece's page attribution.
+	const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+	logger?.debug?.("[breakpoints] boundaryPositions built", {
+		boundaryPositions,
+		fromIdx,
+		fullContentLength: fullContent.length,
+		toIdx
+	});
+	// Hard cap on the number of iterations, in addition to the cursor and page conditions.
+	const maxIterations = 1e4;
+	for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
+		const remainingContent = fullContent.slice(cursorPos);
+		// Nothing but whitespace left: stop without emitting a piece.
+		if (!remainingContent.trim()) break;
+		const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+		const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
+		// The rest fits within maxPages and touches no excluded page: emit it whole and finish.
+		if (remainingSpan <= maxPages && !remainingHasExclusions) {
+			const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
+			if (finalSeg) result.push(finalSeg);
+			break;
+		}
+		// Determine the page window and where that window ends inside remainingContent.
+		const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+		const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+		logger?.debug?.(`[breakpoints] iteration=${i}`, {
+			currentFromIdx,
+			cursorPos,
+			windowEndIdx
 		});
-		segments.push({
-			type: "token",
-			value: match[0]
+		// `breakOffset` is relative to remainingContent; `breakPos` is absolute in fullContent.
+		const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+		const breakPos = cursorPos + breakOffset;
+		const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
+		// Page attribution uses absolute positions, so the base index is the segment's `fromIdx`.
+		const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
+		logger?.trace?.("[breakpoints] piece", {
+			actualEndIdx,
+			actualStartIdx,
+			pieceLength: pieceContent.length
 		});
-		lastIndex = match.index + match[0].length;
+		// Pieces that are empty after trimming are not emitted.
+		if (pieceContent) {
+			const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
+			if (pieceSeg) result.push(pieceSeg);
+		}
+		// Move past the break and any whitespace, then work out which page the next piece starts on.
+		cursorPos = skipWhitespace(fullContent, breakPos);
+		currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
+		isFirstPiece = false;
 	}
-	if (lastIndex < query.length) segments.push({
-		type: "text",
-		value: query.slice(lastIndex)
-	});
-	return segments;
-};
-const maybeApplyFuzzyToText = (text, fuzzyTransform) => {
-	if (fuzzyTransform && /[\u0600-\u06FF]/u.test(text)) return fuzzyTransform(text);
-	return text;
-};
-const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => {
-	if (!fuzzyTransform) return tokenPattern;
-	return tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
-};
-const parseTokenLiteral = (literal) => {
-	TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
-	const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
-	if (!tokenMatch) return null;
-	const [, tokenName, captureName] = tokenMatch;
-	return {
-		captureName,
-		tokenName
-	};
-};
-const createCaptureRegistry = (capturePrefix) => {
-	const captureNames = [];
-	const captureNameCounts = /* @__PURE__ */ new Map();
-	const register = (baseName) => {
-		const count = captureNameCounts.get(baseName) ?? 0;
-		captureNameCounts.set(baseName, count + 1);
-		const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
-		const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
-		captureNames.push(prefixedName);
-		return prefixedName;
-	};
-	return {
-		captureNames,
-		register
-	};
+	logger?.debug?.("[breakpoints] done", { resultCount: result.length });
+	return result;
 };
-const expandTokenLiteral = (literal, opts) => {
-	const parsed = parseTokenLiteral(literal);
-	if (!parsed) return literal;
-	const { tokenName, captureName } = parsed;
-	if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
-	let tokenPattern = TOKEN_PATTERNS[tokenName];
-	if (!tokenPattern) return literal;
-	tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
-	if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
-	return tokenPattern;
+/**
+* Applies breakpoints to oversized segments.
+*
+* Note: This is an internal engine used by `segmentPages()`.
+*/
+const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+	// Lookup structures shared by every segment processed below.
+	const pageIds = pages.map((p) => p.id);
+	const pageIdToIndex = buildPageIdToIndexMap(pageIds);
+	const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
+	const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
+	// `expandBreakpoints` is defined elsewhere; its entries are read here as
+	// `{ rule, regex, excludeSet, skipWhenRegex }`.
+	const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
+	const result = [];
+	logger?.info?.("Starting breakpoint processing", {
+		maxPages,
+		segmentCount: segments.length
+	});
+	logger?.debug?.("[breakpoints] inputSegments", {
+		segmentCount: segments.length,
+		segments: segments.map((s) => ({
+			contentLength: s.content.length,
+			from: s.from,
+			to: s.to
+		}))
+	});
+	for (const segment of segments) {
+		// NOTE(review): an unknown `segment.from` yields fromIdx = -1, which is then used
+		// to index `pageIds` further down — confirm callers never pass unknown page IDs.
+		const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
+		const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
+		// The span is measured in page IDs (single-page segments have span 0).
+		const segmentSpan = (segment.to ?? segment.from) - segment.from;
+		const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
+		// Segments within the limit that touch no excluded page pass through unchanged.
+		if (segmentSpan <= maxPages && !hasExclusions) {
+			result.push(segment);
+			continue;
+		}
+		const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+		// Pieces that still span more than one page get the page joiner applied
+		// (`applyPageJoinerBetweenPages` is defined elsewhere); others are kept as-is.
+		result.push(...broken.map((s) => {
+			const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
+			const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
+			if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
+				...s,
+				content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
+			};
+			return s;
+		}));
+	}
+	logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
+	return result;
 };
+//#endregion
+//#region src/segmentation/match-utils.ts
 /**
-* Expands template tokens with support for named captures.
+* Utility functions for regex matching and result processing.
 *
-* This is the primary token expansion function that handles all token syntax:
-* - `{{token}}` → Expands to the token's pattern (no capture group)
-* - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
-* - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
+* These functions were extracted from `segmenter.ts` to reduce complexity
+* and enable independent testing. They handle match filtering, capture
+* extraction, and occurrence-based selection.
 *
-* Unknown tokens are left as-is in the output, allowing for partial templates.
+* @module match-utils
+*/
+/**
+* Extracts named capture groups from a regex match.
 *
-* @param query - The template string containing tokens
-* @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
-*                         Applied to both token patterns and plain Arabic text between tokens.
-*                         Typically `makeDiacriticInsensitive` from the fuzzy module.
-* @returns Object with expanded pattern, capture names, and capture flag
+* Only includes groups that are in the `captureNames` list and have
+* defined values. This filters out positional captures and ensures
+* only explicitly requested named captures are returned.
 *
-* @example
-* // Simple token expansion
-* expandTokensWithCaptures('{{raqms}} {{dash}}')
-* // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
+* @param groups - The `match.groups` object from `RegExp.exec()`
+* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+* @returns Object with capture name → value pairs, or `undefined` if none found
 *
 * @example
-* // Named capture
-* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
-* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+* extractNamedCaptures(match.groups, ['num'])
+* // → { num: '٦٦٩٦' }
 *
 * @example
-* // Capture-only token
-* expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
-* // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
+* // No matching captures
+* extractNamedCaptures({}, ['num'])
+* // → undefined
 *
 * @example
-* // With fuzzy transform
-* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
-* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
+* // Undefined groups
+* extractNamedCaptures(undefined, ['num'])
+* // → undefined
 */
-const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
-	const segments = splitTemplateIntoSegments(query);
-	const registry = createCaptureRegistry(capturePrefix);
-	const processedParts = segments.map((segment) => {
-		if (segment.type === "text") return maybeApplyFuzzyToText(segment.value, fuzzyTransform);
-		return expandTokenLiteral(segment.value, {
-			capturePrefix,
-			fuzzyTransform,
-			registerCapture: registry.register
-		});
-	});
-	return {
-		captureNames: registry.captureNames,
-		hasCaptures: registry.captureNames.length > 0,
-		pattern: processedParts.join("")
-	};
+const extractNamedCaptures = (groups, captureNames) => {
+	if (!groups || captureNames.length === 0) return;
+	const namedCaptures = {};
+	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
 };
 /**
-* Expands template tokens in a query string to their regex equivalents.
-*
-* This is the simple version without capture support. It returns only the
-* expanded pattern string, not capture metadata.
+* Gets the last defined positional capture group from a match array.
 *
-* Unknown tokens are left as-is, allowing for partial templates.
+* Used for `lineStartsAfter` patterns where the content capture (`.*`)
+* is always at the end of the pattern. Named captures may shift the
+* positional indices, so we iterate backward to find the actual content.
 *
-* @param query - Template string containing `{{token}}` placeholders
-* @returns Expanded regex pattern string
+* @param match - RegExp exec result array
+* @returns The last defined capture group value, or `undefined` if none
 *
 * @example
-* expandTokens('، {{raqms}}')     // → '، [\\u0660-\\u0669]+'
-* expandTokens('{{raqm}}*')       // → '[\\u0660-\\u0669]*'
-* expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
-* expandTokens('{{unknown}}')     // → '{{unknown}}' (left as-is)
-*
-* @see expandTokensWithCaptures for full capture group support
-*/
-const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
-/**
-* Converts a template string to a compiled RegExp.
-*
-* Expands all tokens and attempts to compile the result as a RegExp
-* with Unicode flag. Returns `null` if the resulting pattern is invalid.
-*
-* @remarks
-* This function dynamically compiles regular expressions from template strings.
-* If templates may come from untrusted sources, be aware of potential ReDoS
-* (Regular Expression Denial of Service) risks due to catastrophic backtracking.
-* Consider validating pattern complexity or applying execution timeouts when
-* running user-submitted patterns.
-*
-* @param template - Template string containing `{{token}}` placeholders
-* @returns Compiled RegExp with 'u' flag, or `null` if invalid
+* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+* getLastPositionalCapture(match)
+* // → 'content'
 *
 * @example
-* templateToRegex('، {{raqms}}')  // → /، [٠-٩]+/u
-* templateToRegex('{{raqms}}+')   // → /[٠-٩]++/u (might be invalid in some engines)
-* templateToRegex('(((')          // → null (invalid regex)
+* // No captures
+* getLastPositionalCapture(['full match'])
+* // → undefined
 */
-const templateToRegex = (template) => {
-	const expanded = expandTokens(template);
-	try {
-		return new RegExp(expanded, "u");
-	} catch {
-		return null;
-	}
+const getLastPositionalCapture = (match) => {
+	if (match.length <= 1) return;
+	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
 };
 /**
-* Lists all available token names defined in `TOKEN_PATTERNS`.
+* Filters matches to only include those within page ID constraints.
 *
-* Useful for documentation, validation, or building user interfaces
-* that show available tokens.
+* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+* matches that occur on pages outside the allowed range or explicitly excluded.
 *
-* @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
+* @param matches - Array of match results to filter
+* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+* @param getId - Function that returns the page ID for a given offset
+* @returns Filtered array containing only matches within constraints
 *
 * @example
-* getAvailableTokens()
-* // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
+* const matches = [
+*   { start: 0, end: 10 },   // Page 1
+*   { start: 100, end: 110 }, // Page 5
+*   { start: 200, end: 210 }, // Page 10
+* ];
+* filterByConstraints(matches, { min: 3, max: 8 }, getId)
+* // → [{ start: 100, end: 110 }] (only page 5 match)
 */
-const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
+const filterByConstraints = (matches, rule, getId) => {
+	return matches.filter((m) => {
+		const id = getId(m.start);
+		if (rule.min !== void 0 && id < rule.min) return false;
+		if (rule.max !== void 0 && id > rule.max) return false;
+		if (isPageExcluded(id, rule.exclude)) return false;
+		return true;
+	});
+};
 /**
-* Gets the regex pattern for a specific token name.
+* Checks if any rule in the list allows the given page ID.
 *
-* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
-* without any expansion or capture group wrapping.
+* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+* Rules without constraints allow all page IDs.
 *
-* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
-* @returns The regex pattern string, or `undefined` if token doesn't exist
+* This is used to determine whether to create a segment for content
+* that appears before any split points (the "first segment").
 *
-* @example
-* getTokenPattern('raqms')   // → '[\\u0660-\\u0669]+'
-* getTokenPattern('dash')    // → '[-–—ـ]'
-* getTokenPattern('unknown') // → undefined
-*/
-const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
-/**
-* Regex to detect fuzzy-default tokens in a pattern string.
-* Matches {{token}} or {{token:name}} syntax.
-*/
-const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
-	"bab",
-	"basmalah",
-	"fasl",
-	"kitab",
-	"naql"
-].join("|")})(?::\\w+)?\\}\\}`, "g");
-/**
-* Checks if a pattern (or array of patterns) contains tokens that should
-* default to fuzzy matching.
+* @param rules - Array of rules with optional `min` and `max` constraints
+* @param pageId - Page ID to check
+* @returns `true` if at least one rule allows the page ID
 *
-* Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
+* @example
+* const rules = [
+*   { min: 5, max: 10 },  // Allows pages 5-10
+*   { min: 20 },          // Allows pages 20+
+* ];
 *
-* @param patterns - Single pattern string or array of pattern strings
-* @returns `true` if any pattern contains a fuzzy-default token
+* anyRuleAllowsId(rules, 7)   // → true (first rule allows)
+* anyRuleAllowsId(rules, 3)   // → false (no rule allows)
+* anyRuleAllowsId(rules, 25)  // → true (second rule allows)
 *
 * @example
-* shouldDefaultToFuzzy('{{bab}} الإيمان')     // true
-* shouldDefaultToFuzzy('{{raqms}} {{dash}}')  // false
-* shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
+* // Rules without constraints allow everything
+* anyRuleAllowsId([{}], 999) // → true
 */
-const shouldDefaultToFuzzy = (patterns) => {
-	return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
-		FUZZY_TOKEN_REGEX.lastIndex = 0;
-		return FUZZY_TOKEN_REGEX.test(p);
+const anyRuleAllowsId = (rules, pageId) => {
+	return rules.some((r) => {
+		const minOk = r.min === void 0 || pageId >= r.min;
+		const maxOk = r.max === void 0 || pageId <= r.max;
+		return minOk && maxOk;
 	});
 };
@@ -1873,6 +1985,7 @@ const normalizeLineEndings = (content) => {
 *
 * @module segmenter
 */
+const MAX_REGEX_ITERATIONS = 1e5;
 /**
 * Builds a concatenated content string and page mapping from input pages.
 *
@@ -1980,9 +2093,18 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 	if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
 	return [initialSeg];
 };
-const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
+const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+	logger?.debug?.("[segmenter] collecting split points from rules", {
+		contentLength: matchContent.length,
+		ruleCount: rules.length
+	});
 	const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
 	const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+	logger?.debug?.("[segmenter] rules partitioned", {
+		combinableCount: combinableRules.length,
+		fastFuzzyCount: fastFuzzyRules.length,
+		standaloneCount: standaloneRules.length
+	});
 	const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
 	if (combinableRules.length > 0) {
 		const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
@@ -1995,9 +2117,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 		});
 		const combinedSource = ruleRegexes.map((r) => r.source).join("|");
 		const combinedRegex = new RegExp(combinedSource, "gm");
+		logger?.debug?.("[segmenter] combined regex built", {
+			combinableRuleCount: combinableRules.length,
+			combinedSourceLength: combinedSource.length
+		});
 		combinedRegex.lastIndex = 0;
 		let m = combinedRegex.exec(matchContent);
+		let iterationCount = 0;
 		while (m !== null) {
+			iterationCount++;
+			if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
+			if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
+				iterationCount,
+				lastIndex: combinedRegex.lastIndex,
+				matchLength: m[0].length,
+				matchPosition: m.index
+			});
 			const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
 			if (matchedRuleIndex !== -1) {
 				const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
@@ -2018,8 +2153,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 				const start = m.index;
 				const end = m.index + m[0].length;
 				const pageId = pageMap.getId(start);
-				if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
-					if (!passesPageStartGuard(rule, originalIndex, start)) continue;
+				if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
 					const sp = {
 						capturedContent: void 0,
 						contentStartOffset,
@@ -2205,7 +2339,7 @@ const segmentPages = (pages, options) => {
 		pageIds: pageMap.pageIds,
 		totalContentLength: matchContent.length
 	});
-	const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
+	const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
 	const unique = dedupeSplitPoints(splitPoints);
 	logger?.debug?.("[segmenter] split points collected", {
 		rawSplitPoints: splitPoints.length,
@@ -2697,5 +2831,5 @@ const analyzeTextForRule = (text) => {
 };
 //#endregion
-export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map