flappa-doormal 2.3.0 → 2.4.0
- package/AGENTS.md +54 -0
- package/README.md +257 -5
- package/dist/index.d.mts +100 -23
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +591 -82
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -5
package/dist/index.mjs
CHANGED
@@ -662,12 +662,24 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 break;
 }
 const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
-
+const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+if (remainingSpan <= maxPages && !remainingHasExclusions) {
 const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
 if (finalSeg) result.push(finalSeg);
 break;
 }
 const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx],
+remainingContentStart: remainingContent.slice(0, 50),
+remainingContentLength: remainingContent.length,
+remainingSpan,
+toIdx,
+toPageId: pageIds[toIdx],
+windowEndIdx,
+windowEndPageId: pageIds[windowEndIdx]
+});
 const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
 const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
 let breakPosition = -1;
@@ -680,16 +692,35 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 });
 if (breakPosition <= 0) breakPosition = windowEndPosition;
 const pieceContent = remainingContent.slice(0, breakPosition).trim();
+logger?.debug?.("[breakpoints] selectedBreak", {
+breakPosition,
+pieceContentEnd: pieceContent.slice(-50),
+pieceContentLength: pieceContent.length,
+windowEndPosition
+});
 const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
 if (pieceContent) {
 const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
 if (pieceSeg) result.push(pieceSeg);
 }
 remainingContent = remainingContent.slice(breakPosition).trim();
-
+logger?.debug?.("[breakpoints] afterSlice", {
+actualEndIdx,
+remainingContentLength: remainingContent.length,
+remainingContentStart: remainingContent.slice(0, 60)
+});
+if (!remainingContent) {
+logger?.debug?.("[breakpoints] done: no remaining content");
+break;
+}
 currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+logger?.debug?.("[breakpoints] nextIteration", {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx]
+});
 isFirstPiece = false;
 }
+logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
 return result;
 };
 /**
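
The tracing added throughout `processOversizedSegment` uses double optional chaining (`logger?.debug?.(...)`), so both the logger and its `debug` method are optional. A minimal sketch of a compatible logger; the shape is inferred from these call sites, not from a documented interface:

    // Hypothetical console-backed logger; any object exposing debug(message, data)
    // satisfies the call sites above, since each one guards with `logger?.debug?.()`.
    const logger = {
        debug: (message, data) => console.debug(message, data),
    };
    // Omitting `logger` (or its `debug` method) disables all of the new tracing.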
@@ -708,6 +739,14 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 maxPages,
 segmentCount: segments.length
 });
+logger?.debug?.("[breakpoints] inputSegments", {
+segmentCount: segments.length,
+segments: segments.map((s) => ({
+contentLength: s.content.length,
+from: s.from,
+to: s.to
+}))
+});
 for (const segment of segments) {
 const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
 const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
@@ -830,39 +869,6 @@ const filterByConstraints = (matches, rule, getId) => {
 });
 };
 /**
-* Filters matches based on occurrence setting (first, last, or all).
-*
-* Applies occurrence-based selection to a list of matches:
-* - `'all'` or `undefined`: Return all matches (default)
-* - `'first'`: Return only the first match
-* - `'last'`: Return only the last match
-*
-* @param matches - Array of match results to filter
-* @param occurrence - Which occurrence(s) to keep
-* @returns Filtered array based on occurrence setting
-*
-* @example
-* const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
-*
-* filterByOccurrence(matches, 'first')
-* // → [{ start: 0 }]
-*
-* filterByOccurrence(matches, 'last')
-* // → [{ start: 20 }]
-*
-* filterByOccurrence(matches, 'all')
-* // → [{ start: 0 }, { start: 10 }, { start: 20 }]
-*
-* filterByOccurrence(matches, undefined)
-* // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
-*/
-const filterByOccurrence = (matches, occurrence) => {
-if (!matches.length) return [];
-if (occurrence === "first") return [matches[0]];
-if (occurrence === "last") return [matches[matches.length - 1]];
-return matches;
-};
-/**
 * Checks if any rule in the list allows the given page ID.
 *
 * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
@@ -963,19 +969,13 @@ const anyRuleAllowsId = (rules, pageId) => {
 * // → '{{harf}}' (unchanged - no brackets outside tokens)
 */
 const escapeTemplateBrackets = (pattern) => {
-return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (
+return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
 if (token) return token;
 return `\\${bracket}`;
 });
 };
-
-
-*
-* These tokens contain raw regex patterns and do not reference other tokens.
-* For composite tokens that build on these, see `COMPOSITE_TOKENS`.
-*
-* @internal
-*/
+const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
+const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
 const BASE_TOKENS = {
 bab: "باب",
 basmalah: ["بسم الله", "﷽"].join("|"),
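
The new `RUMUZ_ATOM` alternation ends with a guarded `٤`: ARABIC-INDIC DIGIT FOUR (U+0664, inside the `\u0660-\u0669` range) doubles as a narrator siglum, so lookarounds reject it when it sits inside a longer Arabic-Indic numeral. A quick check of that lookaround in isolation, assuming an engine with lookbehind support:

    // A lone ٤ is accepted as a siglum; a ٤ embedded in a numeral is not.
    const four = /(?<![\u0660-\u0669])٤(?![\u0660-\u0669])/u;
    console.log(four.test("٤"));   // true  — standalone
    console.log(four.test("٣٤٥")); // false — digit inside ٣٤٥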
@@ -983,7 +983,7 @@ const BASE_TOKENS = {
 dash: "[-–—ـ]",
 fasl: ["مسألة", "فصل"].join("|"),
 harf: "[أ-ي]",
-harfs: "[أ-ي](
+harfs: "[أ-ي](?:\\s+[أ-ي])*",
 kitab: "كتاب",
 naql: [
 "حدثني",
@@ -996,6 +996,7 @@ const BASE_TOKENS = {
 ].join("|"),
 raqm: "[\\u0660-\\u0669]",
 raqms: "[\\u0660-\\u0669]+",
+rumuz: RUMUZ_BLOCK,
 tarqim: "[.!?؟؛]"
 };
 /**
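
With `rumuz: RUMUZ_BLOCK` registered in `BASE_TOKENS`, templates can reference `{{rumuz}}` like any other token; `RUMUZ_BLOCK` matches one siglum or a whitespace-separated chain of them. A hypothetical rule using it — the `lineStartsWith`, `fuzzy`, and `split` fields appear elsewhere in this diff, but the concrete values here are illustrative:

    // Split wherever a line opens with a chain of sigla such as "خ م د".
    const rule = { lineStartsWith: ["{{rumuz}}"], fuzzy: true, split: "at" };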
@@ -1127,7 +1128,7 @@ const containsTokens = (query) => {
 * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
 * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
 */
-const expandTokensWithCaptures = (query, fuzzyTransform) => {
+const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
 const captureNames = [];
 const captureNameCounts = /* @__PURE__ */ new Map();
 /**
@@ -1169,16 +1170,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
 const [, tokenName, captureName] = tokenMatch;
 if (!tokenName && captureName) {
 const uniqueName = getUniqueCaptureName(captureName);
-
-
+const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+captureNames.push(prefixedName);
+return `(?<${prefixedName}>.+)`;
 }
 let tokenPattern = TOKEN_PATTERNS[tokenName];
 if (!tokenPattern) return segment.value;
 if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
 if (captureName) {
 const uniqueName = getUniqueCaptureName(captureName);
-
-
+const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+captureNames.push(prefixedName);
+return `(?<${prefixedName}>${tokenPattern})`;
 }
 return tokenPattern;
 });
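
The new `capturePrefix` parameter namespaces every named group a rule produces; the prefix is stripped again (`prefixedName.slice(prefix.length)`, later in this diff) when captures are reported, so template authors still see the names they wrote. A small standalone sketch of that round trip:

    // Prefix on the way in, strip on the way out — callers never see "r2_".
    const prefix = "r2_";
    const prefixedName = `${prefix}num`;
    const re = new RegExp(`(?<${prefixedName}>[\\u0660-\\u0669]+)`, "u");
    const groups = re.exec("باب ١٢٣").groups;
    console.log({ [prefixedName.slice(prefix.length)]: groups[prefixedName] }); // { num: "١٢٣" }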
@@ -1319,39 +1322,42 @@ const compileRuleRegex = (pattern) => {
 *
 * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
 */
-const processPattern = (pattern, fuzzy) => {
-const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
+const processPattern = (pattern, fuzzy, capturePrefix) => {
+const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
 return {
 captureNames,
 pattern: expanded
 };
 };
-const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
+const captureNames = processed.flatMap((p) => p.captureNames);
+const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
+if (capturePrefix) captureNames.push(`${capturePrefix}content`);
 return {
-captureNames
-regex: `^(?:${union})
+captureNames,
+regex: `^(?:${union})${contentCapture}`
 };
 };
-const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
 return {
 captureNames: processed.flatMap((p) => p.captureNames),
 regex: `^(?:${union})`
 };
 };
-const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
 return {
 captureNames: processed.flatMap((p) => p.captureNames),
 regex: `(?:${union})$`
 };
 };
-const buildTemplateRegexSource = (template) => {
-const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+const buildTemplateRegexSource = (template, capturePrefix) => {
+const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
 return {
 captureNames,
 regex: pattern
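
For `lineStartsAfter`, the trailing rest-of-line capture also becomes a named group (`(?<…content>.*)`) when a prefix is supplied, which is what later lets the combined matcher compute `contentStartOffset`. The shape of the emitted regex, on an illustrative marker:

    // `^(?:<marker union>)(?<r0_content>.*)` — the marker is consumed and the
    // rest of the line is captured, so the split can drop the marker text.
    const re = /^(?:فصل\s*)(?<r0_content>.*)/u;
    console.log(re.exec("فصل في الطهارة").groups.r0_content); // "في الطهارة"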
@@ -1363,12 +1369,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
 *
 * Behavior mirrors the previous implementation in `segmenter.ts`.
 */
-const buildRuleRegex = (rule) => {
+const buildRuleRegex = (rule, capturePrefix) => {
 const s = { ...rule };
 const fuzzy = rule.fuzzy ?? false;
 let allCaptureNames = [];
 if (s.lineStartsAfter?.length) {
-const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
 allCaptureNames = captureNames;
 return {
 captureNames: allCaptureNames,
@@ -1378,17 +1384,17 @@ const buildRuleRegex = (rule) => {
 };
 }
 if (s.lineStartsWith?.length) {
-const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
 s.regex = regex;
 allCaptureNames = captureNames;
 }
 if (s.lineEndsWith?.length) {
-const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
 s.regex = regex;
 allCaptureNames = captureNames;
 }
 if (s.template) {
-const { regex, captureNames } = buildTemplateRegexSource(s.template);
+const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
 s.regex = regex;
 allCaptureNames = [...allCaptureNames, ...captureNames];
 }
@@ -1404,16 +1410,231 @@ const buildRuleRegex = (rule) => {
 };
 
 //#endregion
-//#region src/segmentation/
+//#region src/segmentation/fast-fuzzy-prefix.ts
+/**
+* Fast-path fuzzy prefix matching for common Arabic line-start markers.
+*
+* This exists to avoid running expensive fuzzy-expanded regex alternations over
+* a giant concatenated string. Instead, we match only at known line-start
+* offsets and perform a small deterministic comparison:
+* - Skip Arabic diacritics in the CONTENT
+* - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+*
+* This module is intentionally conservative: it only supports "literal"
+* token patterns (plain text alternation via `|`), not general regex.
+*/
+const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+const equivKey = (ch) => {
+switch (ch) {
+case "آ":
+case "أ":
+case "إ": return "ا";
+case "ه": return "ة";
+case "ي": return "ى";
+default: return ch;
+}
+};
 /**
-*
+* Match a fuzzy literal prefix at a given offset.
+*
+* - Skips diacritics in the content
+* - Applies equivalence groups on both content and literal
 *
-* @
-* @returns Plain text content
+* @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
 */
-const
-
+const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+let i = offset;
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+for (let j = 0; j < literal.length; j++) {
+const litCh = literal[j];
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+if (i >= content.length) return null;
+const cCh = content[i];
+if (equivKey(cCh) !== equivKey(litCh)) return null;
+i++;
+}
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+return i;
+};
+const isLiteralOnly = (s) => {
+return !/[\\[\]{}()^$.*+?]/.test(s);
+};
+const compileLiteralAlternation = (pattern) => {
+if (!pattern) return null;
+if (!isLiteralOnly(pattern)) return null;
+const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+if (!alternatives.length) return null;
+return { alternatives };
+};
+/**
+* Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+* Returns null if not eligible.
+*/
+const compileFastFuzzyTokenRule = (tokenTemplate) => {
+const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+if (!m) return null;
+const token = m[1];
+const tokenPattern = getTokenPattern(token);
+if (!tokenPattern) return null;
+const compiled = compileLiteralAlternation(tokenPattern);
+if (!compiled) return null;
+return {
+alternatives: compiled.alternatives,
+token
+};
+};
+/**
+* Try matching any alternative for a compiled token at a line-start offset.
+* Returns endOffset (exclusive) on match, else null.
+*/
+const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+for (const alt of compiled.alternatives) {
+const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+if (end !== null) return end;
+}
+return null;
+};
+
+//#endregion
+//#region src/segmentation/segmenter-rule-utils.ts
+const partitionRulesForMatching = (rules) => {
+const combinableRules = [];
+const standaloneRules = [];
+const fastFuzzyRules = [];
+rules.forEach((rule, index) => {
+if (rule.fuzzy && "lineStartsWith" in rule) {
+const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsWith",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+if (rule.fuzzy && "lineStartsAfter" in rule) {
+const compiled = rule.lineStartsAfter.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsAfter[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsAfter",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+let isCombinable = true;
+if ("regex" in rule && rule.regex) {
+const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+const hasBackreferences = /\\[1-9]/.test(rule.regex);
+const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+}
+if (isCombinable) combinableRules.push({
+index,
+prefix: `r${index}_`,
+rule
+});
+else standaloneRules.push(rule);
+});
+return {
+combinableRules,
+fastFuzzyRules,
+standaloneRules
+};
+};
+const createPageStartGuardChecker = (matchContent, pageMap) => {
+const pageStartToBoundaryIndex = /* @__PURE__ */ new Map();
+for (let i = 0; i < pageMap.boundaries.length; i++) pageStartToBoundaryIndex.set(pageMap.boundaries[i].start, i);
+const compiledPageStartPrev = /* @__PURE__ */ new Map();
+const getPageStartPrevRegex = (rule, ruleIndex) => {
+if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
+const pattern = rule.pageStartGuard;
+if (!pattern) {
+compiledPageStartPrev.set(ruleIndex, null);
+return null;
+}
+const expanded = processPattern(pattern, false).pattern;
+const re = new RegExp(`(?:${expanded})$`, "u");
+compiledPageStartPrev.set(ruleIndex, re);
+return re;
+};
+const getPrevPageLastNonWsChar = (boundaryIndex) => {
+if (boundaryIndex <= 0) return "";
+const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
+const ch = matchContent[i];
+if (!ch) continue;
+if (/\s/u.test(ch)) continue;
+return ch;
+}
+return "";
+};
+return (rule, ruleIndex, matchStart) => {
+const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
+if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
+const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+if (!prevReq) return true;
+const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+if (!lastChar) return false;
+return prevReq.test(lastChar);
+};
+};
+const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
+const splitPointsByRule = /* @__PURE__ */ new Map();
+if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
+let boundaryIdx = 0;
+let currentBoundary = pageMap.boundaries[boundaryIdx];
+const advanceBoundaryTo = (offset) => {
+while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+boundaryIdx++;
+currentBoundary = pageMap.boundaries[boundaryIdx];
+}
+};
+const recordSplitPoint = (ruleIndex, sp) => {
+const arr = splitPointsByRule.get(ruleIndex);
+if (!arr) {
+splitPointsByRule.set(ruleIndex, [sp]);
+return;
+}
+arr.push(sp);
+};
+const isPageStart = (offset) => offset === currentBoundary?.start;
+for (let lineStart = 0; lineStart <= matchContent.length;) {
+advanceBoundaryTo(lineStart);
+const pageId = currentBoundary?.id ?? 0;
+if (lineStart >= matchContent.length) break;
+for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+if (end === null) continue;
+const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+if (kind === "startsWith") recordSplitPoint(ruleIndex, {
+index: splitIndex,
+meta: rule.meta
+});
+else {
+const markerLength = end - lineStart;
+recordSplitPoint(ruleIndex, {
+contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+index: splitIndex,
+meta: rule.meta
+});
+}
+}
+const nextNl = matchContent.indexOf("\n", lineStart);
+if (nextNl === -1) break;
+lineStart = nextNl + 1;
+}
+return splitPointsByRule;
 };
+
+//#endregion
+//#region src/segmentation/textUtils.ts
 /**
 * Normalizes line endings to Unix-style (`\n`).
 *
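
The equivalence logic in `fast-fuzzy-prefix.ts` walks character by character, but its effect can be shown with plain string folding: diacritics (char codes 1611–1618, i.e. U+064B–U+0652) are skipped, and the hamza, taa-marbuta, and alif-maqsura groups collapse to one key. A rough standalone rendering of the same idea, for intuition only — the shipped code avoids allocating new strings:

    // Literal "كتاب" matches vocalized "كِتَاب" with no regex involved.
    const strip = (s) => s.replace(/[\u064B-\u0652]/gu, "");       // skip diacritics
    const fold = (s) => s.replace(/[آأإ]/gu, "ا").replace(/ه/gu, "ة").replace(/ي/gu, "ى");
    console.log(fold(strip("كِتَاب")) === fold("كتاب")); // true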
@@ -1423,7 +1644,9 @@ const stripHtmlTags = (html) => {
 * @param content - Raw content with potentially mixed line endings
 * @returns Content with all line endings normalized to `\n`
 */
-const normalizeLineEndings = (content) =>
+const normalizeLineEndings = (content) => {
+return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+};
 
 //#endregion
 //#region src/segmentation/segmenter.ts
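
The rewritten `normalizeLineEndings` short-circuits on the common already-normalized case before paying for the regex; `/\r\n?/g` folds both CRLF and bare CR:

    console.log(JSON.stringify("a\r\nb\rc".replace(/\r\n?/g, "\n"))); // "a\nb\nc"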
@@ -1544,9 +1767,63 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 return [initialSeg];
 };
 const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
-const
+const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
+const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
+if (combinableRules.length > 0) {
+const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
+const built = buildRuleRegex(rule, prefix);
+return {
+prefix,
+source: `(?<${prefix}>${built.regex.source})`,
+...built
+};
+});
+const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+const combinedRegex = new RegExp(combinedSource, "gm");
+combinedRegex.lastIndex = 0;
+let m = combinedRegex.exec(matchContent);
+while (m !== null) {
+const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+if (matchedRuleIndex !== -1) {
+const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
+const ruleInfo = ruleRegexes[matchedRuleIndex];
+const namedCaptures = {};
+if (m.groups) {
+for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
+const cleanName = prefixedName.slice(prefix.length);
+namedCaptures[cleanName] = m.groups[prefixedName];
+}
+}
+let capturedContent;
+let contentStartOffset;
+if (ruleInfo.usesLineStartsAfter) {
+capturedContent = m.groups?.[`${prefix}content`];
+if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
+}
+const start = m.index;
+const end = m.index + m[0].length;
+const pageId = pageMap.getId(start);
+if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+if (!passesPageStartGuard(rule, originalIndex, start)) continue;
+const sp = {
+capturedContent: void 0,
+contentStartOffset,
+index: (rule.split ?? "at") === "at" ? start : end,
+meta: rule.meta,
+namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+};
+if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+splitPointsByRule.get(originalIndex).push(sp);
+}
+}
+if (m[0].length === 0) combinedRegex.lastIndex++;
+m = combinedRegex.exec(matchContent);
+}
+}
+const collectSplitPointsFromRule = (rule, ruleIndex) => {
 const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-
+const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
 const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
 const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
 return {
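
The combined pass wraps each combinable rule's regex in a group named after its prefix (`(?<r0_>…)|(?<r1_>…)`) and joins them with `|`; whichever prefixed group is defined on a match identifies the rule. A self-contained sketch of that dispatch trick, with illustrative markers:

    // Two "rules" OR-ed into one scan; the defined group names the winner.
    const combined = /(?<r0_>باب (?<r0_title>.+))|(?<r1_>فصل (?<r1_title>.+))/u;
    const m = combined.exec("فصل في الصلاة");
    console.log(m.groups.r1_ !== undefined); // true → rule 1 matched
    console.log(m.groups.r1_title);          // "في الصلاة"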
@@ -1557,8 +1834,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 namedCaptures: m.namedCaptures
 };
 });
+if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+splitPointsByRule.get(ruleIndex).push(...points);
 };
-
+standaloneRules.forEach((rule) => {
+collectSplitPointsFromRule(rule, rules.indexOf(rule));
+});
+const finalSplitPoints = [];
+rules.forEach((rule, index) => {
+const points = splitPointsByRule.get(index);
+if (!points || points.length === 0) return;
+let filtered = points;
+if (rule.occurrence === "first") filtered = [points[0]];
+else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
+finalSplitPoints.push(...filtered);
+});
+return finalSplitPoints;
 };
 /**
 * Executes a regex against content and extracts match results with capture information.
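
Occurrence filtering — previously the standalone `filterByOccurrence` helper removed earlier in this diff — is now applied per rule only after split points from the fast-fuzzy, combined, and standalone passes have all been merged, so `first`/`last` see the complete set. The selection itself is unchanged in spirit:

    // Same semantics as the retired helper, applied to the merged points.
    const pick = (points, occurrence) =>
        occurrence === "first" ? [points[0]]
        : occurrence === "last" ? [points[points.length - 1]]
        : points;
    console.log(pick([{ index: 0 }, { index: 10 }], "last")); // [ { index: 10 } ]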
@@ -1686,12 +1977,11 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
 * });
 */
 const segmentPages = (pages, options) => {
-const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
-if (!pages.length) return [];
+const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
 const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
 let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
 segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
-if (maxPages
+if (maxPages >= 0 && breakpoints.length) {
 const patternProcessor = (p) => processPattern(p, false).pattern;
 return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
 }
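
`segmentPages` now defaults `maxPages` to 0 and `breakpoints` to an empty array, entering the breakpoint pass whenever `maxPages >= 0 && breakpoints.length` (the early `if (!pages.length) return []` guard is gone). An illustrative call — the element shape of `breakpoints` is not visible in this hunk; string patterns are assumed here because each one is fed through `processPattern`:

    // Values are illustrative; option names come from the destructuring above.
    const segments = segmentPages(pages, {
        rules: [{ lineStartsWith: ["{{bab}}"] }],
        breakpoints: ["{{tarqim}}"], // assumption: pattern strings
        maxPages: 2,
    });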
@@ -1766,7 +2056,225 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 
 //#endregion
-//#region src/
+//#region src/analysis.ts
+const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "");
+const computeSpecificity = (pattern) => {
+const tokenCount = countTokenMarkers(pattern);
+return {
+literalLen: stripWhitespacePlaceholders(pattern).length,
+tokenCount
+};
+};
+const DEFAULT_OPTIONS = {
+includeFirstWordFallback: true,
+lineFilter: void 0,
+maxExamples: 1,
+minCount: 3,
+minLineLength: 6,
+normalizeArabicDiacritics: true,
+prefixChars: 60,
+prefixMatchers: [/^#+/u],
+sortBy: "specificity",
+topK: 40
+};
+const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+const TOKEN_PRIORITY_ORDER$1 = [
+"basmalah",
+"kitab",
+"bab",
+"fasl",
+"naql",
+"rumuz",
+"numbered",
+"raqms",
+"raqm",
+"dash",
+"bullet",
+"tarqim"
+];
+const buildTokenPriority = () => {
+const allTokens = new Set(getAvailableTokens());
+return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
+};
+const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+const compileTokenRegexes = (tokenNames) => {
+const compiled = [];
+for (const token of tokenNames) {
+const pat = TOKEN_PATTERNS[token];
+if (!pat) continue;
+try {
+compiled.push({
+re: new RegExp(pat, "uy"),
+token
+});
+} catch {}
+}
+return compiled;
+};
+const appendWs = (out) => out && !out.endsWith("\\s*") ? `${out}\\s*` : out;
+const consumeLeadingPrefixes = (s, pos, out, prefixMatchers) => {
+let matchedAny = false;
+let currentPos = pos;
+let currentOut = out;
+for (const re of prefixMatchers) {
+if (currentPos >= s.length) break;
+const m = re.exec(s.slice(currentPos));
+if (!m || m.index !== 0 || !m[0]) continue;
+currentOut += escapeRegexLiteral(m[0]);
+currentPos += m[0].length;
+matchedAny = true;
+const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
+if (wsAfter) {
+currentPos += wsAfter[0].length;
+currentOut = appendWs(currentOut);
+}
+}
+return {
+matchedAny,
+out: currentOut,
+pos: currentPos
+};
+};
+const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+let best = null;
+for (const { token, re } of compiled) {
+re.lastIndex = pos;
+const m = re.exec(s);
+if (!m || m.index !== pos) continue;
+if (!best || m[0].length > best.text.length) best = {
+text: m[0],
+token
+};
+}
+if (best?.token === "rumuz") {
+const end = pos + best.text.length;
+const next = end < s.length ? s[end] : "";
+if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+}
+return best;
+};
+const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers) => {
+const trimmed = collapseWhitespace(line);
+if (!trimmed) return null;
+const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
+let pos = 0;
+let out = "";
+let matchedAny = false;
+let matchedToken = false;
+const compiled = compileTokenRegexes(tokenNames);
+const isArabicLetter = (ch) => /[\u0600-\u06FF]/u.test(ch);
+const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+{
+const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers);
+pos = consumed.pos;
+out = consumed.out;
+matchedAny = consumed.matchedAny;
+}
+for (let steps = 0; steps < 6 && pos < s.length; steps++) {
+const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
+if (wsMatch) {
+pos += wsMatch[0].length;
+out = appendWs(out);
+continue;
+}
+const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+if (best) {
+if (out && !out.endsWith("\\s*")) {}
+out += `{{${best.token}}}`;
+matchedAny = true;
+matchedToken = true;
+pos += best.text.length;
+continue;
+}
+if (matchedAny) {
+const ch = s[pos];
+if (ch && isCommonDelimiter(ch)) {
+out += escapeRegexLiteral(ch);
+pos += 1;
+continue;
+}
+}
+if (matchedAny) {
+if (includeFirstWordFallback && !matchedToken) {
+const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord$1) break;
+out += escapeRegexLiteral(firstWord$1);
+}
+break;
+}
+if (!includeFirstWordFallback) return null;
+const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord) return null;
+out += escapeRegexLiteral(firstWord);
+return out;
+}
+if (!matchedAny) return null;
+while (out.endsWith("\\s*")) out = out.slice(0, -3);
+return out;
+};
+/**
+* Analyze pages and return the most common line-start patterns (top K).
+*
+* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
+* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
+*/
+const analyzeCommonLineStarts = (pages, options = {}) => {
+const o = {
+...DEFAULT_OPTIONS,
+...options,
+lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
+prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers
+};
+const tokenPriority = buildTokenPriority();
+const counts = /* @__PURE__ */ new Map();
+for (const page of pages) {
+const lines = normalizeLineEndings(page.content ?? "").split("\n");
+for (const line of lines) {
+const trimmed = collapseWhitespace(line);
+if (trimmed.length < o.minLineLength) continue;
+if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
+const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers);
+if (!sig) continue;
+const existing = counts.get(sig);
+if (!existing) counts.set(sig, {
+count: 1,
+examples: [{
+line: trimmed,
+pageId: page.id
+}]
+});
+else {
+existing.count++;
+if (existing.examples.length < o.maxExamples) existing.examples.push({
+line: trimmed,
+pageId: page.id
+});
+}
+}
+}
+const compareSpecificityThenCount = (a, b) => {
+const sa = computeSpecificity(a.pattern);
+const sb = computeSpecificity(b.pattern);
+if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
+if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
+if (b.count !== a.count) return b.count - a.count;
+return a.pattern.localeCompare(b.pattern);
+};
+const compareCountThenSpecificity = (a, b) => {
+if (b.count !== a.count) return b.count - a.count;
+return compareSpecificityThenCount(a, b);
+};
+return [...counts.entries()].map(([pattern, v]) => ({
+count: v.count,
+examples: v.examples,
+pattern
+})).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+};
+
+//#endregion
+//#region src/detection.ts
 /**
 * Pattern detection utilities for recognizing template tokens in Arabic text.
 * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
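
`analyzeCommonLineStarts` surveys a book for recurring line-start shapes and returns them as ready-to-edit token templates. A usage sketch; the option names and result shape are read off `DEFAULT_OPTIONS` and the return statement above, while the concrete output is invented for illustration:

    const suggestions = analyzeCommonLineStarts(pages, { minCount: 5, sortBy: "count", topK: 10 });
    // e.g. [{ pattern: "{{kitab}}\\s*{{raqms}}", count: 42, examples: [{ line: "...", pageId: 3 }] }, ...]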
@@ -1785,6 +2293,7 @@ const TOKEN_PRIORITY_ORDER = [
 "bab",
 "fasl",
 "naql",
+"rumuz",
 "numbered",
 "raqms",
 "raqm",
@@ -1921,5 +2430,5 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive,
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.mjs.map
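
Net effect on the public surface: `analyzeCommonLineStarts` joins the export list (the pre-change list is truncated by the diff viewer, so only the addition is certain):

    import { analyzeCommonLineStarts, segmentPages } from "flappa-doormal";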