flappa-doormal 2.7.0 → 2.9.0

This diff compares the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
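The most visible additions in this range are an opt-in `debug` option on `segmentPages()` (each emitted segment's `meta` gains a namespaced object, default key "_flappa", recording the rule and/or breakpoint that produced it), a new `analyzeRepeatingSequences()` analysis helper, and a new `src/recovery.ts` module for recovering segments produced by mistaken `lineStartsAfter` rules. A minimal usage sketch of the debug option, with pages and rule patterns invented for illustration:

    import { segmentPages } from "flappa-doormal";

    // Hypothetical input; the debug shapes below are read off this diff.
    const pages = [{ id: 1, content: "CHAPTER one ..." }, { id: 2, content: "CHAPTER two ..." }];
    const segments = segmentPages(pages, {
      rules: [{ lineStartsWith: ["CHAPTER"] }],
      debug: true, // or { metaKey: "_flappa", include: ["rule", "breakpoint"] }
    });
    // A segment produced by rule 0 would then carry:
    // segments[0].meta._flappa => { rule: { index: 0, patternType: "lineStartsWith" } }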
package/dist/index.mjs CHANGED
@@ -645,6 +645,10 @@ const buildBareTokenRegex = () => {
   * Validates a single pattern for common issues.
   */
  const validatePattern = (pattern, seenPatterns) => {
+ if (!pattern.trim()) return {
+ message: "Empty pattern is not allowed",
+ type: "empty_pattern"
+ };
  if (seenPatterns.has(pattern)) return {
  message: `Duplicate pattern: "${pattern}"`,
  type: "duplicate"
@@ -727,7 +731,7 @@ const validateRules = (rules) => {
  hasIssues = true;
  }
  }
- if ("template" in rule && rule.template) {
+ if ("template" in rule && rule.template !== void 0) {
  const seenPatterns = /* @__PURE__ */ new Set();
  const issue = validatePattern(rule.template, seenPatterns);
  if (issue) {
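Behavioral note: `validatePattern` now rejects empty or whitespace-only patterns up front, and the `template` guard switches from a truthiness test to `!== void 0`, so an empty-string template is validated (and flagged) instead of being silently skipped. A hedged sketch (the full return shape of `validateRules` is not shown in this excerpt):

    import { validateRules } from "flappa-doormal";

    // An empty template previously passed the old `rule.template` truthiness
    // check unexamined; it now surfaces an issue of type "empty_pattern".
    validateRules([{ template: "" }]);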
@@ -1245,16 +1249,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
  */
  const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
  const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
- for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
+ for (let i = 0; i < expandedBreakpoints.length; i++) {
+ const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
  if (skipWhenRegex?.test(remainingContent)) continue;
- if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
+ if (regex === null) return {
+ breakpointIndex: i,
+ breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+ rule
+ };
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
- if (breakPos > 0) return breakPos;
+ if (breakPos > 0) return {
+ breakpointIndex: i,
+ breakPos,
+ rule
+ };
  }
- return -1;
+ return null;
+ };
+
+ //#endregion
+ //#region src/segmentation/debug-meta.ts
+ const resolveDebugConfig = (debug) => {
+ if (!debug) return null;
+ if (debug === true) return {
+ includeBreakpoint: true,
+ includeRule: true,
+ metaKey: "_flappa"
+ };
+ if (typeof debug !== "object") return null;
+ const metaKey = debug.metaKey;
+ const include = debug.include;
+ const includeRule = Array.isArray(include) ? include.includes("rule") : true;
+ return {
+ includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
+ includeRule,
+ metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
+ };
+ };
+ const getRulePatternType = (rule) => {
+ if ("lineStartsWith" in rule) return "lineStartsWith";
+ if ("lineStartsAfter" in rule) return "lineStartsAfter";
+ if ("lineEndsWith" in rule) return "lineEndsWith";
+ if ("template" in rule) return "template";
+ return "regex";
+ };
+ const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
+ const mergeDebugIntoMeta = (meta, metaKey, patch) => {
+ const out = meta ? { ...meta } : {};
+ const existing = out[metaKey];
+ out[metaKey] = {
+ ...isPlainObject(existing) ? existing : {},
+ ...patch
+ };
+ return out;
  };
+ const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
+ index: ruleIndex,
+ patternType: getRulePatternType(rule)
+ } });
+ const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
+ index: breakpointIndex,
+ kind: rule.pattern === "" ? "pageBoundary" : "pattern",
+ pattern: rule.pattern
+ } });

  //#endregion
  //#region src/segmentation/breakpoint-processor.ts
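The new debug-meta region is self-contained, so its behavior can be read directly off the added lines; a worked sketch:

    // resolveDebugConfig normalizes the user-facing `debug` option:
    resolveDebugConfig(true);
    // => { includeBreakpoint: true, includeRule: true, metaKey: "_flappa" }
    resolveDebugConfig({ metaKey: "dbg", include: ["rule"] });
    // => { includeBreakpoint: false, includeRule: true, metaKey: "dbg" }

    // mergeDebugIntoMeta shallow-copies the segment meta and merges a patch
    // under the namespace key, preserving anything already stored there:
    mergeDebugIntoMeta({ title: "intro" }, "_flappa", buildRuleDebugPatch(2, { template: "{{bab}}" }));
    // => { title: "intro", _flappa: { rule: { index: 2, patternType: "template" } } }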
@@ -1338,20 +1397,25 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
  const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
  if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
  const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
- if (exclusionBreak > 0) return exclusionBreak;
+ if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
  }
- const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+ const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
  expandedBreakpoints,
  normalizedPages,
  pageIds,
  prefer
  });
- return patternBreak > 0 ? patternBreak : windowEndPosition;
+ if (patternMatch && patternMatch.breakPos > 0) return {
+ breakOffset: patternMatch.breakPos,
+ breakpointIndex: patternMatch.breakpointIndex,
+ breakpointRule: patternMatch.rule
+ };
+ return { breakOffset: windowEndPosition };
  };
  /**
  * Advances cursor position past any leading whitespace.
  */
- const skipWhitespace = (content, startPos) => {
+ const skipWhitespace$1 = (content, startPos) => {
  let pos = startPos;
  while (pos < content.length && /\s/.test(content[pos])) pos++;
  return pos;
@@ -1362,12 +1426,13 @@ const skipWhitespace = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
  const result = [];
  const fullContent = segment.content;
  let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
+ let lastBreakpoint = null;
  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
  logger?.debug?.("[breakpoints] boundaryPositions built", {
  boundaryPositions,
@@ -1382,7 +1447,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
  if (finalSeg) result.push(finalSeg);
  break;
  }
@@ -1393,8 +1460,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  cursorPos,
  windowEndIdx
  });
- const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
- const breakPos = cursorPos + breakOffset;
+ const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+ if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
+ breakpointIndex: found.breakpointIndex,
+ rule: found.breakpointRule
+ };
+ const breakPos = cursorPos + found.breakOffset;
  const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
  const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
  logger?.trace?.("[breakpoints] piece", {
@@ -1403,10 +1474,11 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  pieceLength: pieceContent.length
  });
  if (pieceContent) {
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
  if (pieceSeg) result.push(pieceSeg);
  }
- cursorPos = skipWhitespace(fullContent, breakPos);
+ cursorPos = skipWhitespace$1(fullContent, breakPos);
  currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
  isFirstPiece = false;
  }
@@ -1418,7 +1490,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
  const pageIds = pages.map((p) => p.id);
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1518,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  result.push(segment);
  continue;
  }
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
  result.push(...broken.map((s) => {
  const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
  const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -1959,6 +2031,129 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
  return splitPointsByRule;
  };

+ //#endregion
+ //#region src/segmentation/split-point-helpers.ts
+ /**
+ * Helper module for collectSplitPointsFromRules to reduce complexity.
+ * Handles combined regex matching and split point creation.
+ */
+ const MAX_REGEX_ITERATIONS = 1e5;
+ const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
+ const result = {};
+ if (!groups) return result;
+ for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
+ return result;
+ };
+ const buildContentOffsets = (match, ruleInfo) => {
+ if (!ruleInfo.usesLineStartsAfter) return {};
+ const captured = match.groups?.[`${ruleInfo.prefix}__content`];
+ if (captured === void 0) return {};
+ return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
+ };
+ const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
+ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
+ const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
+ const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
+ return {
+ capturedContent: void 0,
+ contentStartOffset,
+ index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
+ meta: rule.meta,
+ namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+ };
+ };
+ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
+ const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+ const combinedRegex = new RegExp(combinedSource, "gm");
+ logger?.debug?.("[segmenter] combined regex built", {
+ combinableRuleCount: combinableRules.length,
+ combinedSourceLength: combinedSource.length
+ });
+ let m = combinedRegex.exec(matchContent);
+ let iterations = 0;
+ while (m !== null) {
+ iterations++;
+ if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
+ if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
+ iterations,
+ position: m.index
+ });
+ const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+ if (matchedIndex !== -1) {
+ const { rule, index: originalIndex } = combinableRules[matchedIndex];
+ const ruleInfo = ruleRegexes[matchedIndex];
+ if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
+ const sp = createSplitPointFromMatch(m, rule, ruleInfo);
+ if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+ splitPointsByRule.get(originalIndex).push(sp);
+ }
+ }
+ if (m[0].length === 0) combinedRegex.lastIndex++;
+ m = combinedRegex.exec(matchContent);
+ }
+ };
+ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
+ const built = buildRuleRegex(rule, prefix);
+ return {
+ ...built,
+ prefix,
+ source: `(?<${prefix}>${built.regex.source})`
+ };
+ });
+ const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
+ const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
+ const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
+ const isLSA = usesLineStartsAfter && m.captured !== void 0;
+ const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
+ return {
+ capturedContent: isLSA ? void 0 : m.captured,
+ contentStartOffset: isLSA ? markerLen : void 0,
+ index: (rule.split ?? "at") === "at" ? m.start : m.end,
+ meta: rule.meta,
+ namedCaptures: m.namedCaptures
+ };
+ });
+ if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+ splitPointsByRule.get(ruleIndex).push(...points);
+ };
+ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
+ const matches = [];
+ let m = regex.exec(content);
+ while (m !== null) {
+ matches.push({
+ captured: usesCapture ? getLastPositionalCapture(m) : void 0,
+ end: m.index + m[0].length,
+ namedCaptures: extractNamedCaptures(m.groups, captureNames),
+ start: m.index
+ });
+ if (m[0].length === 0) regex.lastIndex++;
+ m = regex.exec(content);
+ }
+ return matches;
+ };
+ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
+ const result = [];
+ rules.forEach((rule, index) => {
+ const points = splitPointsByRule.get(index);
+ if (!points?.length) return;
+ const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
+ if (!debugMetaKey) {
+ result.push(...filtered.map((p) => ({
+ ...p,
+ ruleIndex: index
+ })));
+ return;
+ }
+ const debugPatch = buildRuleDebugPatch(index, rule);
+ result.push(...filtered.map((p) => ({
+ ...p,
+ meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
+ ruleIndex: index
+ })));
+ });
+ return result;
+ };
+
  //#endregion
  //#region src/segmentation/textUtils.ts
  /**
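The extraction into `split-point-helpers.ts` keeps the original technique: each combinable rule's regex is wrapped in a uniquely prefixed named group, the sources are OR-ed into one `gm` regex, and the matching rule is identified by which named group is defined in the match; zero-length matches bump `lastIndex` to avoid stalls, with a hard cap of `MAX_REGEX_ITERATIONS` (1e5). A stripped-down illustration of the pattern (group names invented):

    const sources = ["(?<r0>^CHAPTER)", "(?<r1>^SECTION)"];
    const combined = new RegExp(sources.join("|"), "gm");
    let m;
    while ((m = combined.exec("CHAPTER one\nSECTION two")) !== null) {
      const which = m.groups.r0 !== undefined ? 0 : 1;
      console.log(which, m.index); // logs "0 0", then "1 12"
      if (m[0].length === 0) combined.lastIndex++; // zero-length-match guard
    }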
@@ -1985,7 +2180,6 @@ const normalizeLineEndings = (content) => {
  *
  * @module segmenter
  */
- const MAX_REGEX_ITERATIONS = 1e5;
  /**
  * Builds a concatenated content string and page mapping from input pages.
  *
@@ -2082,7 +2276,7 @@ const dedupeSplitPoints = (splitPoints) => {
  const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
  if (segments.length > 0 || pages.length === 0) return segments;
  const firstPage = pages[0];
- const lastPage = pages[pages.length - 1];
+ const lastPage = pages.at(-1);
  const joinChar = pageJoiner === "newline" ? "\n" : " ";
  const allContent = normalizedContent.join(joinChar).trim();
  if (!allContent) return segments;
@@ -2093,7 +2287,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
  return [initialSeg];
  };
- const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
  logger?.debug?.("[segmenter] collecting split points from rules", {
  contentLength: matchContent.length,
  ruleCount: rules.length
@@ -2106,124 +2300,9 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
  standaloneCount: standaloneRules.length
  });
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
- if (combinableRules.length > 0) {
- const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
- const built = buildRuleRegex(rule, prefix);
- return {
- prefix,
- source: `(?<${prefix}>${built.regex.source})`,
- ...built
- };
- });
- const combinedSource = ruleRegexes.map((r) => r.source).join("|");
- const combinedRegex = new RegExp(combinedSource, "gm");
- logger?.debug?.("[segmenter] combined regex built", {
- combinableRuleCount: combinableRules.length,
- combinedSourceLength: combinedSource.length
- });
- combinedRegex.lastIndex = 0;
- let m = combinedRegex.exec(matchContent);
- let iterationCount = 0;
- while (m !== null) {
- iterationCount++;
- if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
- if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
- iterationCount,
- lastIndex: combinedRegex.lastIndex,
- matchLength: m[0].length,
- matchPosition: m.index
- });
- const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
- if (matchedRuleIndex !== -1) {
- const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
- const ruleInfo = ruleRegexes[matchedRuleIndex];
- const namedCaptures = {};
- if (m.groups) {
- for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
- const cleanName = prefixedName.slice(prefix.length);
- namedCaptures[cleanName] = m.groups[prefixedName];
- }
- }
- let capturedContent;
- let contentStartOffset;
- if (ruleInfo.usesLineStartsAfter) {
- capturedContent = m.groups?.[`${prefix}__content`];
- if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
- }
- const start = m.index;
- const end = m.index + m[0].length;
- const pageId = pageMap.getId(start);
- if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
- const sp = {
- capturedContent: void 0,
- contentStartOffset,
- index: (rule.split ?? "at") === "at" ? start : end,
- meta: rule.meta,
- namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
- };
- if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
- splitPointsByRule.get(originalIndex).push(sp);
- }
- }
- if (m[0].length === 0) combinedRegex.lastIndex++;
- m = combinedRegex.exec(matchContent);
- }
- }
- const collectSplitPointsFromRule = (rule, ruleIndex) => {
- const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
- const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
- const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
- return {
- capturedContent: isLineStartsAfter ? void 0 : m.captured,
- contentStartOffset: isLineStartsAfter ? markerLength : void 0,
- index: (rule.split ?? "at") === "at" ? m.start : m.end,
- meta: rule.meta,
- namedCaptures: m.namedCaptures
- };
- });
- if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
- splitPointsByRule.get(ruleIndex).push(...points);
- };
- standaloneRules.forEach((rule) => {
- collectSplitPointsFromRule(rule, rules.indexOf(rule));
- });
- const finalSplitPoints = [];
- rules.forEach((rule, index) => {
- const points = splitPointsByRule.get(index);
- if (!points || points.length === 0) return;
- let filtered = points;
- if (rule.occurrence === "first") filtered = [points[0]];
- else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
- finalSplitPoints.push(...filtered);
- });
- return finalSplitPoints;
- };
- /**
- * Executes a regex against content and extracts match results with capture information.
- *
- * @param content - Full content string to search
- * @param regex - Compiled regex with 'g' flag
- * @param usesCapture - Whether to extract captured content
- * @param captureNames - Names of expected named capture groups
- * @returns Array of match results with positions and captures
- */
- const findMatches = (content, regex, usesCapture, captureNames) => {
- const matches = [];
- regex.lastIndex = 0;
- let m = regex.exec(content);
- while (m !== null) {
- const result = {
- end: m.index + m[0].length,
- start: m.index
- };
- result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
- if (usesCapture) result.captured = getLastPositionalCapture(m);
- matches.push(result);
- if (m[0].length === 0) regex.lastIndex++;
- m = regex.exec(content);
- }
- return matches;
+ if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
+ for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+ return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
  };
  /**
  * Finds page breaks within a given offset range using binary search.
@@ -2326,6 +2405,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
  const segmentPages = (pages, options) => {
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+ const debug = resolveDebugConfig(options.debug);
+ const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
  logger?.info?.("[segmenter] starting segmentation", {
  breakpointCount: breakpoints.length,
  maxPages,
@@ -2339,7 +2420,7 @@ const segmentPages = (pages, options) => {
  pageIds: pageMap.pageIds,
  totalContentLength: matchContent.length
  });
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
  const unique = dedupeSplitPoints(splitPoints);
  logger?.debug?.("[segmenter] split points collected", {
  rawSplitPoints: splitPoints.length,
@@ -2358,7 +2439,7 @@ const segmentPages = (pages, options) => {
  if (maxPages >= 0 && breakpoints.length) {
  logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
  const patternProcessor = (p) => processPattern(p, false).pattern;
- const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
  logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
  return result;
  }
@@ -2410,7 +2491,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  const result = [];
  for (let i = 0; i < splitPoints.length; i++) {
  const sp = splitPoints[i];
- const end = i < splitPoints.length - 1 ? splitPoints[i + 1].index : content.length;
+ const end = splitPoints[i + 1]?.index ?? content.length;
  const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
  if (s) result.push(s);
  }
@@ -2434,29 +2515,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };

  //#endregion
- //#region src/analysis.ts
- const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
- const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
- const computeSpecificity = (pattern) => {
- const tokenCount = countTokenMarkers(pattern);
- return {
- literalLen: stripWhitespacePlaceholders(pattern).length,
- tokenCount
- };
- };
- const DEFAULT_OPTIONS = {
- includeFirstWordFallback: true,
- lineFilter: void 0,
- maxExamples: 1,
- minCount: 3,
- minLineLength: 6,
- normalizeArabicDiacritics: true,
- prefixChars: 60,
- prefixMatchers: [/^#+/u],
- sortBy: "specificity",
- topK: 40,
- whitespace: "regex"
- };
+ //#region src/analysis/shared.ts
  const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
  const TOKEN_PRIORITY_ORDER$1 = [
  "basmalah",
@@ -2497,30 +2556,7 @@ const appendWs = (out, mode) => {
  if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
  return out.endsWith("\\s*") ? out : `${out}\\s*`;
  };
- const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
- let matchedAny = false;
- let currentPos = pos;
- let currentOut = out;
- for (const re of prefixMatchers) {
- if (currentPos >= s.length) break;
- const m = re.exec(s.slice(currentPos));
- if (!m || m.index !== 0 || !m[0]) continue;
- currentOut += escapeSignatureLiteral(m[0]);
- currentPos += m[0].length;
- matchedAny = true;
- const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
- if (wsAfter) {
- currentPos += wsAfter[0].length;
- currentOut = appendWs(currentOut, whitespace);
- }
- }
- return {
- matchedAny,
- out: currentOut,
- pos: currentPos
- };
- };
- const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
  let best = null;
  for (const { token, re } of compiled) {
  re.lastIndex = pos;
@@ -2534,132 +2570,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
  if (best?.token === "rumuz") {
  const end = pos + best.text.length;
  const next = end < s.length ? s[end] : "";
- if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+ if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
  }
  return best;
  };
- const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers, whitespace) => {
+ const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
+ const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+
+ //#endregion
+ //#region src/analysis/line-starts.ts
+ const resolveOptions$1 = (options = {}) => ({
+ includeFirstWordFallback: options.includeFirstWordFallback ?? true,
+ lineFilter: options.lineFilter,
+ maxExamples: options.maxExamples ?? 1,
+ minCount: options.minCount ?? 3,
+ minLineLength: options.minLineLength ?? 6,
+ normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
+ prefixChars: options.prefixChars ?? 60,
+ prefixMatchers: options.prefixMatchers ?? [/^#+/u],
+ sortBy: options.sortBy ?? "specificity",
+ topK: options.topK ?? 40,
+ whitespace: options.whitespace ?? "regex"
+ });
+ const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+ const computeSpecificity = (pattern) => ({
+ literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
+ tokenCount: countTokenMarkers(pattern)
+ });
+ const compareBySpecificity = (a, b) => {
+ const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
+ return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
+ };
+ const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
+ /** Remove trailing whitespace placeholders */
+ const trimTrailingWs = (out, mode) => {
+ const suffix = mode === "regex" ? "\\s*" : " ";
+ while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+ return out;
+ };
+ /** Try to extract first word for fallback */
+ const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
+ /** Consume prefix matchers at current position */
+ const consumePrefixes = (s, pos, out, matchers, ws) => {
+ let matched = false;
+ for (const re of matchers) {
+ if (pos >= s.length) break;
+ const m = re.exec(s.slice(pos));
+ if (!m?.index && m?.[0]) {
+ out += escapeSignatureLiteral(m[0]);
+ pos += m[0].length;
+ matched = true;
+ const wsm = /^[ \t]+/u.exec(s.slice(pos));
+ if (wsm) {
+ pos += wsm[0].length;
+ out = appendWs(out, ws);
+ }
+ }
+ }
+ return {
+ matched,
+ out,
+ pos
+ };
+ };
+ /** Try to match a token at current position and append to signature */
+ const tryMatchToken = (s, pos, out, compiled) => {
+ const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+ if (!best) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: `${out}{{${best.token}}}`,
+ pos: pos + best.text.length
+ };
+ };
+ /** Try to match a delimiter at current position */
+ const tryMatchDelimiter = (s, pos, out) => {
+ const ch = s[pos];
+ if (!ch || !isCommonDelimiter(ch)) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: out + escapeSignatureLiteral(ch),
+ pos: pos + 1
+ };
+ };
+ /** Skip whitespace at position */
+ const skipWhitespace = (s, pos, out, ws) => {
+ const m = /^[ \t]+/u.exec(s.slice(pos));
+ if (!m) return {
+ out,
+ pos,
+ skipped: false
+ };
+ return {
+ out: appendWs(out, ws),
+ pos: pos + m[0].length,
+ skipped: true
+ };
+ };
+ const tokenizeLineStart = (line, tokenNames, opts) => {
  const trimmed = collapseWhitespace(line);
  if (!trimmed) return null;
- const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
- let pos = 0;
- let out = "";
- let matchedAny = false;
- let matchedToken = false;
+ const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
  const compiled = compileTokenRegexes(tokenNames);
- const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
- const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
- {
- const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers, whitespace);
- pos = consumed.pos;
- out = consumed.out;
- matchedAny = consumed.matchedAny;
- }
- let tokenSteps = 0;
- while (tokenSteps < 6 && pos < s.length) {
- const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
- if (wsMatch) {
- pos += wsMatch[0].length;
- out = appendWs(out, whitespace);
+ let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
+ const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+ pos = prefix.pos;
+ out = prefix.out;
+ matchedAny = prefix.matched;
+ while (steps < 6 && pos < s.length) {
+ const ws = skipWhitespace(s, pos, out, opts.whitespace);
+ if (ws.skipped) {
+ pos = ws.pos;
+ out = ws.out;
  continue;
  }
- const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
- if (best) {
- if (out && !out.endsWith("\\s*")) {}
- out += `{{${best.token}}}`;
- matchedAny = true;
- matchedToken = true;
- pos += best.text.length;
- tokenSteps++;
+ const tok = tryMatchToken(s, pos, out, compiled);
+ if (tok.matched) {
+ pos = tok.pos;
+ out = tok.out;
+ matchedAny = matchedToken = true;
+ steps++;
  continue;
  }
  if (matchedAny) {
- const ch = s[pos];
- if (ch && isCommonDelimiter(ch)) {
- out += escapeSignatureLiteral(ch);
- pos += 1;
+ const delim = tryMatchDelimiter(s, pos, out);
+ if (delim.matched) {
+ pos = delim.pos;
+ out = delim.out;
  continue;
  }
  }
  if (matchedAny) {
- if (includeFirstWordFallback && !matchedToken) {
- const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord$1) break;
- out += escapeSignatureLiteral(firstWord$1);
- tokenSteps++;
+ if (opts.includeFirstWordFallback && !matchedToken) {
+ const word$1 = extractFirstWord(s.slice(pos));
+ if (word$1) {
+ out += escapeSignatureLiteral(word$1);
+ steps++;
+ }
  }
  break;
  }
- if (!includeFirstWordFallback) return null;
- const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord) return null;
- out += escapeSignatureLiteral(firstWord);
- tokenSteps++;
- return out;
- }
- if (!matchedAny) return null;
- if (whitespace === "regex") while (out.endsWith("\\s*")) out = out.slice(0, -3);
- else while (out.endsWith(" ")) out = out.slice(0, -1);
- return out;
+ if (!opts.includeFirstWordFallback) return null;
+ const word = extractFirstWord(s.slice(pos));
+ if (!word) return null;
+ return escapeSignatureLiteral(word);
+ }
+ return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
+ };
+ const processLine = (line, pageId, tokenPriority, opts, acc) => {
+ const trimmed = collapseWhitespace(line);
+ if (trimmed.length < opts.minLineLength) return;
+ if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
+ const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
+ if (!sig) return;
+ const entry = acc.get(sig);
+ if (!entry) acc.set(sig, {
+ count: 1,
+ examples: [{
+ line: trimmed,
+ pageId
+ }]
+ });
+ else {
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push({
+ line: trimmed,
+ pageId
+ });
+ }
+ };
+ const processPage = (page, tokenPriority, opts, acc) => {
+ for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
  };
  /**
  * Analyze pages and return the most common line-start patterns (top K).
- *
- * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
- * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
  */
  const analyzeCommonLineStarts = (pages, options = {}) => {
- const o = {
- ...DEFAULT_OPTIONS,
- ...options,
- lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
- prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
- whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
- };
+ const opts = resolveOptions$1(options);
  const tokenPriority = buildTokenPriority();
- const counts = /* @__PURE__ */ new Map();
- for (const page of pages) {
- const lines = normalizeLineEndings(page.content ?? "").split("\n");
- for (const line of lines) {
- const trimmed = collapseWhitespace(line);
- if (trimmed.length < o.minLineLength) continue;
- if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
- const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers, o.whitespace);
- if (!sig) continue;
- const existing = counts.get(sig);
- if (!existing) counts.set(sig, {
- count: 1,
- examples: [{
- line: trimmed,
- pageId: page.id
- }]
- });
- else {
- existing.count++;
- if (existing.examples.length < o.maxExamples) existing.examples.push({
- line: trimmed,
- pageId: page.id
- });
+ const acc = /* @__PURE__ */ new Map();
+ for (const page of pages) processPage(page, tokenPriority, opts, acc);
+ const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
+ return [...acc.entries()].map(([pattern, v]) => ({
+ count: v.count,
+ examples: v.examples,
+ pattern
+ })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
+ };
+
+ //#endregion
+ //#region src/analysis/repeating-sequences.ts
+ const resolveOptions = (options) => {
+ const minElements = Math.max(1, options?.minElements ?? 1);
+ return {
+ contextChars: options?.contextChars ?? 50,
+ maxElements: Math.max(minElements, options?.maxElements ?? 3),
+ maxExamples: options?.maxExamples ?? 3,
+ maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
+ minCount: Math.max(1, options?.minCount ?? 3),
+ minElements,
+ normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
+ requireToken: options?.requireToken ?? true,
+ topK: Math.max(1, options?.topK ?? 20),
+ whitespace: options?.whitespace ?? "regex"
+ };
+ };
+ /** Creates a cursor that tracks position in both normalized and raw text */
+ const createRawCursor = (text, normalize) => {
+ let rawPos = 0;
+ return {
+ advance(normalizedLen) {
+ if (!normalize) {
+ const chunk = text.slice(rawPos, rawPos + normalizedLen);
+ rawPos += normalizedLen;
+ return chunk;
  }
+ const start = rawPos;
+ let matchedLen = 0;
+ while (matchedLen < normalizedLen && rawPos < text.length) {
+ if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
+ rawPos++;
+ }
+ while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
+ return text.slice(start, rawPos);
+ },
+ get pos() {
+ return rawPos;
+ }
+ };
+ };
+ /** Scans text and produces a stream of tokens and literals. */
+ const tokenizeContent = (text, normalize) => {
+ const normalized = normalize ? stripArabicDiacritics(text) : text;
+ const compiled = compileTokenRegexes(buildTokenPriority());
+ const cursor = createRawCursor(text, normalize);
+ const items = [];
+ let pos = 0;
+ while (pos < normalized.length) {
+ const ws = /^\s+/u.exec(normalized.slice(pos));
+ if (ws) {
+ pos += ws[0].length;
+ cursor.advance(ws[0].length);
+ continue;
+ }
+ const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
+ if (token) {
+ const raw = cursor.advance(token.text.length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: `{{${token.token}}}`,
+ type: "token"
+ });
+ pos += token.text.length;
+ continue;
+ }
+ if (isCommonDelimiter(normalized[pos])) {
+ const raw = cursor.advance(1);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - 1,
+ text: escapeSignatureLiteral(normalized[pos]),
+ type: "literal"
+ });
+ pos++;
+ continue;
+ }
+ const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
+ if (word) {
+ const raw = cursor.advance(word[0].length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: escapeSignatureLiteral(word[0]),
+ type: "literal"
+ });
+ pos += word[0].length;
+ continue;
  }
+ cursor.advance(1);
+ pos++;
  }
- const compareSpecificityThenCount = (a, b) => {
- const sa = computeSpecificity(a.pattern);
- const sb = computeSpecificity(b.pattern);
- if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
- if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
- if (b.count !== a.count) return b.count - a.count;
- return a.pattern.localeCompare(b.pattern);
+ return items;
+ };
+ /** Build pattern string from window items */
+ const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
+ /** Check if window contains at least one token */
+ const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
+ /** Compute token count and literal length for a window */
+ const computeWindowStats = (window) => {
+ let tokenCount = 0, literalLen = 0;
+ for (const item of window) if (item.type === "token") tokenCount++;
+ else literalLen += item.text.length;
+ return {
+ literalLen,
+ tokenCount
  };
- const compareCountThenSpecificity = (a, b) => {
- if (b.count !== a.count) return b.count - a.count;
- return compareSpecificityThenCount(a, b);
+ };
+ /** Build example from page content and window */
+ const buildExample = (page, window, contextChars) => {
+ const start = window[0].start;
+ const end = window.at(-1).end;
+ const ctxStart = Math.max(0, start - contextChars);
+ const ctxEnd = Math.min(page.content.length, end + contextChars);
+ return {
+ context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
+ pageId: page.id,
+ startIndices: window.map((w) => w.start),
+ text: page.content.slice(start, end)
  };
- return [...counts.entries()].map(([pattern, v]) => ({
- count: v.count,
- examples: v.examples,
+ };
+ /** Extract N-grams from a single page */
+ const extractPageNgrams = (page, items, opts, stats) => {
+ for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
+ const window = items.slice(i, i + n);
+ if (opts.requireToken && !hasTokenInWindow(window)) continue;
+ const pattern = buildPattern(window, opts.whitespace);
+ if (!stats.has(pattern)) {
+ if (stats.size >= opts.maxUniquePatterns) continue;
+ stats.set(pattern, {
+ count: 0,
+ examples: [],
+ ...computeWindowStats(window)
+ });
+ }
+ const entry = stats.get(pattern);
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+ }
+ };
+ /**
+ * Analyze pages for commonly repeating word sequences.
+ *
+ * Use for continuous text without line breaks. For line-based analysis,
+ * use `analyzeCommonLineStarts()` instead.
+ */
+ const analyzeRepeatingSequences = (pages, options) => {
+ const opts = resolveOptions(options);
+ const stats = /* @__PURE__ */ new Map();
+ for (const page of pages) {
+ if (!page.content) continue;
+ extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
+ }
+ return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
+ count: s.count,
+ examples: s.examples,
  pattern
- })).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+ }));
  };

  //#endregion
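The analysis code is now split into `src/analysis/shared.ts`, `line-starts.ts` (the existing `analyzeCommonLineStarts`), and a new `repeating-sequences.ts` whose `analyzeRepeatingSequences` mines token/literal n-grams from continuous text. A usage sketch with invented content, assuming the function is exported from the package entry (the updated export list falls outside this excerpt):

    import { analyzeRepeatingSequences } from "flappa-doormal";

    const results = analyzeRepeatingSequences(
      [{ id: 1, content: "..." }], // hypothetical pages
      { minElements: 1, maxElements: 3, minCount: 3, topK: 20 }, // the defaults from resolveOptions
    );
    // results: [{ pattern, count, examples: [{ pageId, text, context, startIndices }] }, ...]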
@@ -2831,5 +3099,524 @@ const analyzeTextForRule = (text) => {
2831
3099
  };
2832
3100
 
2833
3101
  //#endregion
2834
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
3102
+ //#region src/recovery.ts
3103
+ const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
3104
+ const normalizeForCompare = (s, mode) => {
3105
+ if (mode === "none") return s;
3106
+ let out = s;
3107
+ if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
3108
+ out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
3109
+ return out;
3110
+ };
3111
+ const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
3112
+ const buildFixedOptions = (options, selectedRuleIndices) => {
3113
+ const fixedRules = (options.rules ?? []).map((r, idx) => {
3114
+ if (!selectedRuleIndices.has(idx)) return r;
3115
+ if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
3116
+ const { lineStartsAfter, ...rest } = r;
3117
+ return {
3118
+ ...rest,
3119
+ lineStartsWith: lineStartsAfter
3120
+ };
3121
+ });
3122
+ return {
3123
+ ...options,
3124
+ rules: fixedRules
3125
+ };
3126
+ };
3127
+ const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
3128
+ const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
3129
+ const parts = [];
3130
+ for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
3131
+ const matchContent = parts.join("\n");
3132
+ if (pageJoiner === "newline") return {
3133
+ matchContent,
3134
+ outputContent: matchContent
3135
+ };
3136
+ return {
3137
+ matchContent,
3138
+ outputContent: parts.join(" ")
3139
+ };
3140
+ };
3141
+ const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
3142
+ const rules = options.rules ?? [];
3143
+ const compiled = [];
3144
+ for (const idx of selectedRuleIndices) {
3145
+ const r = rules[idx];
3146
+ if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
3147
+ const { lineStartsAfter, ...rest } = r;
3148
+ const built = buildRuleRegex({
3149
+ ...rest,
3150
+ lineStartsWith: lineStartsAfter
3151
+ });
3152
+ compiled.push({
3153
+ ruleIndex: idx,
3154
+ startsWithRegex: new RegExp(built.regex.source, "mu")
3155
+ });
3156
+ }
3157
+ return compiled;
3158
+ };
3159
+ const findUniqueAnchorPos = (outputContent, segmentContent) => {
3160
+ for (const len of [
3161
+ 80,
3162
+ 60,
3163
+ 40,
3164
+ 30,
3165
+ 20,
3166
+ 15
3167
+ ]) {
3168
+ const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
3169
+ if (!needle.trim()) continue;
3170
+ const first = outputContent.indexOf(needle);
3171
+ if (first === -1) continue;
3172
+ if (outputContent.indexOf(needle, first + 1) === -1) return first;
3173
+ }
3174
+ return null;
3175
+ };
3176
+ const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
3177
+ const line = matchContent.slice(lineStart);
3178
+ for (const mr of compiledMistaken) {
3179
+ mr.startsWithRegex.lastIndex = 0;
3180
+ const m = mr.startsWithRegex.exec(line);
3181
+ if (!m || m.index !== 0) continue;
3182
+ const markerMatch = m[0];
3183
+ const markerEnd = lineStart + markerMatch.length;
3184
+ if (anchorPos < markerEnd) continue;
3185
+ const gap = matchContent.slice(markerEnd, anchorPos);
3186
+ const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
3187
+ if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
3188
+ return { prefix: recoveredPrefix };
3189
+ }
3190
+ return { reason: "no selected marker pattern matched at anchored line start" };
3191
+ };
3192
+ const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
3193
+ const fromIdx = pageIdToIndex.get(segment.from);
3194
+ const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
3195
+ if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
3196
+ kind: "unresolved",
3197
+ reason: "segment page range not found in pages"
3198
+ };
3199
+ const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
3200
+ if (!segment.content) return {
3201
+ kind: "unresolved",
3202
+ reason: "empty segment content"
3203
+ };
3204
+ const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
3205
+ if (anchorPos === null) return {
3206
+ kind: "unresolved",
3207
+ reason: "could not uniquely anchor segment content in page range"
3208
+ };
3209
+ const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
3210
+ const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
3211
+ if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
3212
+ kind: "unresolved",
3213
+ reason: found.reason
3214
+ };
3215
+ return {
3216
+ kind: "recovered",
3217
+ recoveredContent: `${found.prefix}${segment.content}`,
3218
+ recoveredPrefix: found.prefix
3219
+ };
3220
+ };
3221
+ const resolveRuleIndicesSelector = (rules, indicesIn) => {
3222
+ const errors = [];
3223
+ const indices = /* @__PURE__ */ new Set();
3224
+ for (const idx of indicesIn) {
3225
+ if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
3226
+ errors.push(`Selector index out of range: ${idx}`);
3227
+ continue;
3228
+ }
3229
+ const rule = rules[idx];
3230
+ if (!rule || !("lineStartsAfter" in rule)) {
3231
+ errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
3232
+ continue;
3233
+ }
3234
+ indices.add(idx);
3235
+ }
3236
+ return {
3237
+ errors,
3238
+ indices,
3239
+ warnings: []
3240
+ };
3241
+ };
3242
+ const resolvePredicateSelector = (rules, predicate) => {
3243
+ const errors = [];
3244
+ const warnings = [];
3245
+ const indices = /* @__PURE__ */ new Set();
3246
+ rules.forEach((r, i) => {
3247
+ try {
3248
+ if (!predicate(r, i)) return;
3249
+ if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
3250
+ indices.add(i);
3251
+ return;
3252
+ }
3253
+ warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
3254
+ } catch (e) {
3255
+ const msg = e instanceof Error ? e.message : String(e);
3256
+ errors.push(`Predicate threw at rule ${i}: ${msg}`);
3257
+ }
3258
+ });
3259
+ if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
3260
+ return {
3261
+ errors,
3262
+ indices,
3263
+ warnings
3264
+ };
3265
+ };
3266
+ const resolvePatternsSelector = (rules, patterns, matchMode) => {
+   const errors = [];
+   const warnings = [];
+   const indices = /* @__PURE__ */ new Set();
+   const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
+   const targets = patterns.map(normalizePattern);
+   for (let pi = 0; pi < patterns.length; pi++) {
+     const rawPattern = patterns[pi];
+     const pat = targets[pi];
+     const matched = [];
+     for (let i = 0; i < rules.length; i++) {
+       const r = rules[i];
+       if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+       if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
+     }
+     if (matched.length === 0) {
+       errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
+       continue;
+     }
+     if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
+     matched.forEach((i) => {
+       indices.add(i);
+     });
+   }
+   return {
+     errors,
+     indices,
+     warnings
+   };
+ };
+ const resolveSelectorToRuleIndices = (options, selector) => {
+   const rules = options.rules ?? [];
+   if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
+   if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
+   return resolvePatternsSelector(rules, selector.patterns, selector.match);
+ };
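The three selector shapes dispatched above, as a usage sketch (field values are hypothetical; the shapes follow from the dispatch and the resolvers in this diff):

    const byIndex = { type: "rule_indices", indices: [0, 2] };
    const byPredicate = { type: "predicate", predicate: (rule, i) => "lineStartsAfter" in rule };
    const byPatterns = { type: "patterns", patterns: ["Chapter "], match: "normalized" };
    // Each resolves to { errors, indices: Set<number>, warnings }:
    const { indices, errors, warnings } = resolveSelectorToRuleIndices(options, byPatterns);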
+ const longestCommonSuffixLength = (a, b) => {
+   const max = Math.min(a.length, b.length);
+   let i = 0;
+   while (i < max) {
+     if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
+     i++;
+   }
+   return i;
+ };
+ const AMBIGUITY_SCORE_GAP = 5;
+ const scoreCandidate = (orig, fixed, normalizeMode) => {
+   if (fixed.content === orig.content) return {
+     fixedIndex: -1,
+     kind: "exact",
+     score: 100
+   };
+   if (fixed.content.endsWith(orig.content)) {
+     const markerLen = fixed.content.length - orig.content.length;
+     return {
+       fixedIndex: -1,
+       kind: "exact_suffix",
+       score: 90 + Math.min(30, markerLen)
+     };
+   }
+   if (normalizeMode !== "none") {
+     const normFixed = normalizeForCompare(fixed.content, normalizeMode);
+     const normOrig = normalizeForCompare(orig.content, normalizeMode);
+     if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
+       const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
+       return {
+         fixedIndex: -1,
+         kind: "normalized_suffix",
+         score: 70 + Math.floor(overlap * 20)
+       };
+     }
+   }
+   return null;
+ };
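Worked through, the scoring tiers above keep the match kinds strictly ordered: an exact content match scores 100; a fixed segment that merely ends with the original content scores 90 + min(30, markerLen), e.g. 97 for a recovered 7-character marker prefix; a suffix match found only after normalizeForCompare scores 70 + floor(overlap * 20), capping at 90, so it can never outrank an exact-suffix match.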
+ const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
+   const warnings = [...reportBase.warnings];
+   warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
+   const details = segments.map((s, i) => {
+     const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
+     return {
+       from: s.from,
+       notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
+       originalStartPreview: preview(s.content),
+       segmentIndex: i,
+       status,
+       strategy: "none",
+       to: s.to
+     };
+   });
+   return {
+     report: {
+       ...reportBase,
+       details,
+       summary: {
+         mode,
+         recovered: 0,
+         totalSegments: segments.length,
+         unchanged: segments.length,
+         unresolved: selectorErrors.length ? segments.length : 0
+       },
+       warnings
+     },
+     segments
+   };
+ };
+ const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
+   const recoveredAtIndex = /* @__PURE__ */ new Map();
+   const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
+   if (mode !== "best_effort_then_rerun") return {
+     recoveredAtIndex,
+     recoveredDetailAtIndex
+   };
+   const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
+   const pageIdToIndex = buildPageIdToIndex(processedPages);
+   const pageJoiner = options.pageJoiner ?? "space";
+   const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
+   for (let i = 0; i < segments.length; i++) {
+     const orig = segments[i];
+     const r = tryBestEffortRecoverOneSegment(orig, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
+     if (r.kind !== "recovered") continue;
+     const seg = {
+       ...orig,
+       content: r.recoveredContent
+     };
+     recoveredAtIndex.set(i, seg);
+     recoveredDetailAtIndex.set(i, {
+       from: orig.from,
+       originalStartPreview: preview(orig.content),
+       recoveredPrefixPreview: preview(r.recoveredPrefix),
+       recoveredStartPreview: preview(seg.content),
+       segmentIndex: i,
+       status: "recovered",
+       strategy: "stage1",
+       to: orig.to
+     });
+   }
+   return {
+     recoveredAtIndex,
+     recoveredDetailAtIndex
+   };
+ };
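Stage 1 runs only in "best_effort_then_rerun" mode; with the default "rerun_only" both maps stay empty and every segment falls through to the rerun alignment below. Before anchoring, the pass applies any options.replace replacements and defaults pageJoiner to "space", as the code above shows.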
+ const buildFixedBuckets = (fixedSegments) => {
+   const buckets = /* @__PURE__ */ new Map();
+   for (let i = 0; i < fixedSegments.length; i++) {
+     const k = segmentRangeKey(fixedSegments[i]);
+     const arr = buckets.get(k);
+     if (!arr) buckets.set(k, [i]);
+     else arr.push(i);
+   }
+   return buckets;
+ };
+ const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
+   let best = null;
+   let secondBestScore = -Infinity;
+   for (const fixedIdx of candidates) {
+     if (usedFixed.has(fixedIdx)) continue;
+     const fixed = fixedSegments[fixedIdx];
+     const scored = scoreCandidate(orig, fixed, normalizeCompare);
+     if (!scored) continue;
+     const candidateScore = scored.score;
+     if (!best || candidateScore > best.score) {
+       secondBestScore = best?.score ?? -Infinity;
+       best = {
+         fixedIdx,
+         score: candidateScore
+       };
+     } else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
+   }
+   if (!best) return { kind: "none" };
+   if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
+   return {
+     fixedIdx: best.fixedIdx,
+     kind: "match"
+   };
+ };
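An illustrative trace of the ambiguity guard above: with AMBIGUITY_SCORE_GAP = 5, two candidates scoring 97 and 95 differ by 2, so the function returns { kind: "ambiguous" } and the caller leaves the segment untouched rather than risk a wrong alignment. A lone scored candidate wins outright, since secondBestScore stays at -Infinity, and candidates already claimed via usedFixed are skipped, keeping the alignment one-to-one.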
+ const detailUnresolved = (orig, segmentIndex, notes) => ({
+   from: orig.from,
+   notes,
+   originalStartPreview: preview(orig.content),
+   segmentIndex,
+   status: "unresolved_alignment",
+   strategy: "rerun",
+   to: orig.to
+ });
+ const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
+   from: orig.from,
+   notes,
+   originalStartPreview: preview(orig.content),
+   segmentIndex,
+   status: "skipped_idempotent",
+   strategy: "rerun",
+   to: orig.to
+ });
+ const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
+   let recoveredPrefixPreview;
+   if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
+   return {
+     from: orig.from,
+     originalStartPreview: preview(orig.content),
+     recoveredPrefixPreview,
+     recoveredStartPreview: preview(fixed.content),
+     segmentIndex,
+     status: "recovered",
+     strategy: "rerun",
+     to: orig.to
+   };
+ };
+ const mergeWithRerun = (params) => {
+   const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
+   const usedFixed = /* @__PURE__ */ new Set();
+   const out = [];
+   const details = [];
+   let recovered = 0;
+   let unresolved = 0;
+   let unchanged = 0;
+   for (let i = 0; i < originalSegments.length; i++) {
+     const stage1Recovered = stage1RecoveredAtIndex.get(i);
+     if (stage1Recovered) {
+       out.push(stage1Recovered);
+       recovered++;
+       details.push(recoveredDetailAtIndex.get(i) ?? {
+         from: stage1Recovered.from,
+         originalStartPreview: preview(originalSegments[i].content),
+         recoveredStartPreview: preview(stage1Recovered.content),
+         segmentIndex: i,
+         status: "recovered",
+         strategy: "stage1",
+         to: stage1Recovered.to
+       });
+       continue;
+     }
+     const orig = originalSegments[i];
+     const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
+     if (best.kind === "none") {
+       out.push(orig);
+       unresolved++;
+       details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
+       continue;
+     }
+     if (best.kind === "ambiguous") {
+       out.push(orig);
+       unresolved++;
+       details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
+       continue;
+     }
+     usedFixed.add(best.fixedIdx);
+     const fixed = fixedSegments[best.fixedIdx];
+     if (fixed.content === orig.content) {
+       out.push(orig);
+       unchanged++;
+       details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
+       continue;
+     }
+     out.push({
+       ...orig,
+       content: fixed.content
+     });
+     recovered++;
+     details.push(detailRecoveredRerun(orig, fixed, i));
+   }
+   return {
+     details,
+     segments: out,
+     summary: {
+       recovered,
+       unchanged,
+       unresolved
+     }
+   };
+ };
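Per-segment precedence in the merge above: a stage-1 recovery wins outright; otherwise the segment is aligned against rerun output bucketed by its (from, to) range key and scored, where identical content is reported as skipped_idempotent, a missing or ambiguous candidate leaves the segment unchanged and counts as unresolved, and any other match adopts the rerun content and counts as recovered.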
+ function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
+   const mode = opts?.mode ?? "rerun_only";
+   const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
+   const resolved = resolveSelectorToRuleIndices(options, selector);
+   const reportBase = {
+     byRun: void 0,
+     errors: resolved.errors,
+     warnings: resolved.warnings
+   };
+   if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
+   const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
+   const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
+   const merged = mergeWithRerun({
+     fixedBuckets: buildFixedBuckets(fixedSegments),
+     fixedSegments,
+     normalizeCompare,
+     originalSegments: segments,
+     recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
+     stage1RecoveredAtIndex: stage1.recoveredAtIndex
+   });
+   return {
+     report: {
+       ...reportBase,
+       details: merged.details,
+       summary: {
+         mode,
+         recovered: merged.summary.recovered,
+         totalSegments: segments.length,
+         unchanged: merged.summary.unchanged,
+         unresolved: merged.summary.unresolved
+       }
+     },
+     segments: merged.segments
+   };
+ }
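A usage sketch of the new entry point (variable names and the example pattern are hypothetical; the signature and defaults follow the code above):

    const { segments: fixed, report } = recoverMistakenLineStartsAfterMarkers(
      pages,
      previousSegments,
      segmentationOptions,
      { type: "patterns", patterns: ["Chapter "], match: "normalized" },
      { mode: "best_effort_then_rerun", normalizeCompare: "whitespace" }
    );
    // report.summary -> { mode, recovered, totalSegments, unchanged, unresolved }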
+ function recoverMistakenMarkersForRuns(runs, opts) {
+   const allSegments = [];
+   const byRun = [];
+   const details = [];
+   const warnings = [];
+   const errors = [];
+   let recovered = 0;
+   let unchanged = 0;
+   let unresolved = 0;
+   let offset = 0;
+   for (let i = 0; i < runs.length; i++) {
+     const run = runs[i];
+     const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
+     allSegments.push(...res.segments);
+     for (const d of res.report.details) details.push({
+       ...d,
+       segmentIndex: d.segmentIndex + offset
+     });
+     offset += run.segments.length;
+     recovered += res.report.summary.recovered;
+     unchanged += res.report.summary.unchanged;
+     unresolved += res.report.summary.unresolved;
+     warnings.push(...res.report.warnings);
+     errors.push(...res.report.errors);
+     byRun.push({
+       recovered: res.report.summary.recovered,
+       runIndex: i,
+       totalSegments: run.segments.length,
+       unresolved: res.report.summary.unresolved
+     });
+   }
+   return {
+     report: {
+       byRun,
+       details,
+       errors,
+       summary: {
+         mode: opts?.mode ?? "rerun_only",
+         recovered,
+         totalSegments: offset,
+         unchanged,
+         unresolved
+       },
+       warnings
+     },
+     segments: allSegments
+   };
+ }
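The batched variant above maps the single-run entry point over each run, concatenating segments, offsetting segmentIndex in the details so they index into the combined array, and breaking the counts down per run in report.byRun. A minimal call (run fields follow the destructuring above):

    const { segments, report } = recoverMistakenMarkersForRuns(
      [{ pages, segments: previousSegments, options: segmentationOptions, selector }],
      { mode: "rerun_only" }
    );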
+
+ //#endregion
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
  //# sourceMappingURL=index.mjs.map