flappa-doormal 2.2.3 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +17 -0
- package/README.md +22 -0
- package/dist/index.d.mts +1 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +425 -219
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
@@ -348,25 +348,25 @@ const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalize
 	for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
 		const pageData = normalizedPages.get(pageIds[pi]);
 		if (!pageData) continue;
-		const trimmed = pageData.content.trimStart();
-		let found = -1;
-		for (const len of JOINER_PREFIX_LENGTHS) {
-			const prefix = trimmed.slice(0, Math.min(len, trimmed.length)).trim();
-			if (!prefix) continue;
-			const pos = updated.indexOf(prefix, searchFrom);
-			if (pos > 0) {
-				found = pos;
-				break;
-			}
-		}
-		if (found > 0) {
-			if (updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
-			searchFrom = found;
-		}
+		const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
+		if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
+		if (found > 0) searchFrom = found;
 	}
 	return updated;
 };
 /**
+ * Finds the position of a page prefix in content, trying multiple prefix lengths.
+ */
+const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
+	for (const len of JOINER_PREFIX_LENGTHS) {
+		const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
+		if (!prefix) continue;
+		const pos = content.indexOf(prefix, searchFrom);
+		if (pos > 0) return pos;
+	}
+	return -1;
+};
+/**
  * Estimates how far into the current page `remainingContent` begins.
  *
  * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
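The refactor above is a pure extraction: the inline prefix-search loop becomes `findPrefixPositionInContent`, with an early `return pos` replacing the `found`/`break` bookkeeping. A small sketch of the helper's contract — the lengths here are hypothetical; the real `JOINER_PREFIX_LENGTHS` constant is defined elsewhere in the bundle:

```js
// Hypothetical lengths; the bundle defines the real JOINER_PREFIX_LENGTHS elsewhere.
const JOINER_PREFIX_LENGTHS = [40, 20, 10];

const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
	for (const len of JOINER_PREFIX_LENGTHS) {
		const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
		if (!prefix) continue;
		const pos = content.indexOf(prefix, searchFrom);
		if (pos > 0) return pos;
	}
	return -1;
};

// Progressively shorter prefixes give the joiner several chances to anchor
// the start of the next page inside the joined content.
const joined = "first page text\nsecond page text";
console.log(findPrefixPositionInContent(joined, "second page text", 0)); // 16
```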
@@ -390,7 +390,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
  * This is used to define breakpoint windows in terms of actual content being split, rather than
  * raw per-page offsets which can desync when structural rules strip markers.
  */
-const findPageStartNearExpectedBoundary = (remainingContent, currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
 	const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
 	if (!targetPageData) return -1;
 	const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -548,6 +548,21 @@ const findPatternBreakPosition = (windowContent, regex, prefer) => {
 	return selected.index + selected.length;
 };
 /**
+ * Handles page boundary breakpoint (empty pattern).
+ * Returns break position or -1 if no valid position found.
+ */
+const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
+	const nextPageIdx = windowEndIdx + 1;
+	if (nextPageIdx <= toIdx) {
+		const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
+		if (nextPageData) {
+			const pos = findNextPagePosition(remainingContent, nextPageData);
+			if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
+		}
+	}
+	return Math.min(windowEndPosition, remainingContent.length);
+};
+/**
  * Tries to find a break position within the current window using breakpoint patterns.
  * Returns the break position or -1 if no suitable break was found.
  *
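`handlePageBoundaryBreak` is the same page-boundary fallback that previously lived inline in `findBreakPosition` (removed in the next hunk): for an empty pattern, prefer the start of the next page but never return a position past the window or the remaining content. A minimal sketch of that preference, with `findNextPagePosition` stubbed here (in the bundle it locates the next page's prefix inside `remainingContent`):

```js
// Stub standing in for the bundle's findNextPagePosition.
const findNextPagePosition = (remaining, nextPage) => remaining.indexOf(nextPage.content.slice(0, 10));

const remaining = "tail of page one\npage two body";
const windowEndPosition = 25;
const pos = findNextPagePosition(remaining, { content: "page two body" });
console.log(pos); // 17 — the break lands where page two begins
console.log(Math.min(pos, windowEndPosition, remaining.length)); // 17, capped by the window
```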
@@ -564,17 +579,7 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
 		if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
 		if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
 		if (skipWhenRegex?.test(remainingContent)) continue;
-		if (regex === null) {
-			const nextPageIdx = windowEndIdx + 1;
-			if (nextPageIdx <= toIdx) {
-				const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
-				if (nextPageData) {
-					const pos = findNextPagePosition(remainingContent, nextPageData);
-					if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
-				}
-			}
-			return Math.min(windowEndPosition, remainingContent.length);
-		}
+		if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
 		const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
 		if (breakPos > 0) return breakPos;
 	}
@@ -636,7 +641,8 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
 		const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
 		if (nextPageData) {
 			const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
-			if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
+			const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
+			if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
 		}
 	}
 	return nextFromIdx;
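The added `remainingPrefix` makes the page-advance comparison bidirectional. The removed line appears to have tested only `remainingContent.startsWith(nextPrefix)`, which fails whenever fewer than 30 characters remain after a split even though the split landed exactly on the page boundary:

```js
const nextPage = { content: "chapter heading and more text here" };
const remaining = "chapter"; // only 7 chars left after the previous split

const nextPrefix = nextPage.content.slice(0, 30);
console.log(remaining.startsWith(nextPrefix)); // false — the one-way check misses

const remainingPrefix = remaining.trimStart().slice(0, Math.min(30, remaining.length));
console.log(nextPage.content.startsWith(remainingPrefix)); // true — the page index advances
```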
@@ -726,171 +732,6 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 	return result;
 };
 
-//#endregion
-//#region src/segmentation/match-utils.ts
-/**
- * Utility functions for regex matching and result processing.
- *
- * These functions were extracted from `segmenter.ts` to reduce complexity
- * and enable independent testing. They handle match filtering, capture
- * extraction, and occurrence-based selection.
- *
- * @module match-utils
- */
-/**
- * Extracts named capture groups from a regex match.
- *
- * Only includes groups that are in the `captureNames` list and have
- * defined values. This filters out positional captures and ensures
- * only explicitly requested named captures are returned.
- *
- * @param groups - The `match.groups` object from `RegExp.exec()`
- * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
- * @returns Object with capture name → value pairs, or `undefined` if none found
- *
- * @example
- * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
- * extractNamedCaptures(match.groups, ['num'])
- * // → { num: '٦٦٩٦' }
- *
- * @example
- * // No matching captures
- * extractNamedCaptures({}, ['num'])
- * // → undefined
- *
- * @example
- * // Undefined groups
- * extractNamedCaptures(undefined, ['num'])
- * // → undefined
- */
-const extractNamedCaptures = (groups, captureNames) => {
-	if (!groups || captureNames.length === 0) return;
-	const namedCaptures = {};
-	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
-	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
-};
-/**
- * Gets the last defined positional capture group from a match array.
- *
- * Used for `lineStartsAfter` patterns where the content capture (`.*`)
- * is always at the end of the pattern. Named captures may shift the
- * positional indices, so we iterate backward to find the actual content.
- *
- * @param match - RegExp exec result array
- * @returns The last defined capture group value, or `undefined` if none
- *
- * @example
- * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
- * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
- * getLastPositionalCapture(match)
- * // → 'content'
- *
- * @example
- * // No captures
- * getLastPositionalCapture(['full match'])
- * // → undefined
- */
-const getLastPositionalCapture = (match) => {
-	if (match.length <= 1) return;
-	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
-};
-/**
- * Filters matches to only include those within page ID constraints.
- *
- * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
- * matches that occur on pages outside the allowed range or explicitly excluded.
- *
- * @param matches - Array of match results to filter
- * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
- * @param getId - Function that returns the page ID for a given offset
- * @returns Filtered array containing only matches within constraints
- *
- * @example
- * const matches = [
- *   { start: 0, end: 10 },    // Page 1
- *   { start: 100, end: 110 }, // Page 5
- *   { start: 200, end: 210 }, // Page 10
- * ];
- * filterByConstraints(matches, { min: 3, max: 8 }, getId)
- * // → [{ start: 100, end: 110 }] (only page 5 match)
- */
-const filterByConstraints = (matches, rule, getId) => {
-	return matches.filter((m) => {
-		const id = getId(m.start);
-		if (rule.min !== void 0 && id < rule.min) return false;
-		if (rule.max !== void 0 && id > rule.max) return false;
-		if (isPageExcluded(id, rule.exclude)) return false;
-		return true;
-	});
-};
-/**
- * Filters matches based on occurrence setting (first, last, or all).
- *
- * Applies occurrence-based selection to a list of matches:
- * - `'all'` or `undefined`: Return all matches (default)
- * - `'first'`: Return only the first match
- * - `'last'`: Return only the last match
- *
- * @param matches - Array of match results to filter
- * @param occurrence - Which occurrence(s) to keep
- * @returns Filtered array based on occurrence setting
- *
- * @example
- * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
- *
- * filterByOccurrence(matches, 'first')
- * // → [{ start: 0 }]
- *
- * filterByOccurrence(matches, 'last')
- * // → [{ start: 20 }]
- *
- * filterByOccurrence(matches, 'all')
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
- *
- * filterByOccurrence(matches, undefined)
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
- */
-const filterByOccurrence = (matches, occurrence) => {
-	if (!matches.length) return [];
-	if (occurrence === "first") return [matches[0]];
-	if (occurrence === "last") return [matches[matches.length - 1]];
-	return matches;
-};
-/**
- * Checks if any rule in the list allows the given page ID.
- *
- * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
- * Rules without constraints allow all page IDs.
- *
- * This is used to determine whether to create a segment for content
- * that appears before any split points (the "first segment").
- *
- * @param rules - Array of rules with optional `min` and `max` constraints
- * @param pageId - Page ID to check
- * @returns `true` if at least one rule allows the page ID
- *
- * @example
- * const rules = [
- *   { min: 5, max: 10 }, // Allows pages 5-10
- *   { min: 20 },         // Allows pages 20+
- * ];
- *
- * anyRuleAllowsId(rules, 7)  // → true (first rule allows)
- * anyRuleAllowsId(rules, 3)  // → false (no rule allows)
- * anyRuleAllowsId(rules, 25) // → true (second rule allows)
- *
- * @example
- * // Rules without constraints allow everything
- * anyRuleAllowsId([{}], 999) // → true
- */
-const anyRuleAllowsId = (rules, pageId) => {
-	return rules.some((r) => {
-		const minOk = r.min === void 0 || pageId >= r.min;
-		const maxOk = r.max === void 0 || pageId <= r.max;
-		return minOk && maxOk;
-	});
-};
-
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
@@ -977,6 +818,7 @@ const BASE_TOKENS = {
 	dash: "[-–—ـ]",
 	fasl: ["مسألة", "فصل"].join("|"),
 	harf: "[أ-ي]",
+	harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
 	kitab: "كتاب",
 	naql: [
 		"حدثني",
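The new `harfs` token extends `harf` (a single Arabic letter) to a run of letters that may contain internal whitespace but must start and end on a letter:

```js
const harfs = /^[أ-ي](?:[أ-ي\s]*[أ-ي])?$/;
console.log(harfs.test("ب"));   // true  — single letter, like harf
console.log(harfs.test("ب ت")); // true  — letters with an internal space
console.log(harfs.test("ب "));  // false — may not end on whitespace
```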
@@ -1120,7 +962,7 @@ const containsTokens = (query) => {
  * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
  * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
  */
-const expandTokensWithCaptures = (query, fuzzyTransform) => {
+const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
 	const captureNames = [];
 	const captureNameCounts = /* @__PURE__ */ new Map();
 	/**
@@ -1162,16 +1004,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
 		const [, tokenName, captureName] = tokenMatch;
 		if (!tokenName && captureName) {
 			const uniqueName = getUniqueCaptureName(captureName);
-			captureNames.push(uniqueName);
-			return `(?<${uniqueName}>.+)`;
+			const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+			captureNames.push(prefixedName);
+			return `(?<${prefixedName}>.+)`;
 		}
 		let tokenPattern = TOKEN_PATTERNS[tokenName];
 		if (!tokenPattern) return segment.value;
 		if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
 		if (captureName) {
 			const uniqueName = getUniqueCaptureName(captureName);
-			captureNames.push(uniqueName);
-			return `(?<${uniqueName}>${tokenPattern})`;
+			const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+			captureNames.push(prefixedName);
+			return `(?<${prefixedName}>${tokenPattern})`;
 		}
 		return tokenPattern;
 	});
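The new `capturePrefix` parameter exists because JavaScript rejects a regex containing two groups with the same name; prefixing each rule's captures (e.g. `r0_`, `r1_`) is what later allows many rules to be OR-ed into one combined regex:

```js
try {
	new RegExp("(?<num>\\d+)|(?<num>[٠-٩]+)");
} catch (e) {
	console.log(e.message); // duplicate capture group name — why unprefixed rules can't be combined
}
const combined = /(?<r0_num>\d+)|(?<r1_num>[٠-٩]+)/;
console.log(combined.exec("٤٢").groups.r1_num); // "٤٢" — the r1_ prefix maps back to rule 1
```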
@@ -1259,6 +1103,224 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  */
 const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
 
+//#endregion
+//#region src/segmentation/fast-fuzzy-prefix.ts
+/**
+ * Fast-path fuzzy prefix matching for common Arabic line-start markers.
+ *
+ * This exists to avoid running expensive fuzzy-expanded regex alternations over
+ * a giant concatenated string. Instead, we match only at known line-start
+ * offsets and perform a small deterministic comparison:
+ * - Skip Arabic diacritics in the CONTENT
+ * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+ *
+ * This module is intentionally conservative: it only supports "literal"
+ * token patterns (plain text alternation via `|`), not general regex.
+ */
+const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+const equivKey = (ch) => {
+	switch (ch) {
+		case "آ":
+		case "أ":
+		case "إ": return "ا";
+		case "ه": return "ة";
+		case "ي": return "ى";
+		default: return ch;
+	}
+};
+/**
+ * Match a fuzzy literal prefix at a given offset.
+ *
+ * - Skips diacritics in the content
+ * - Applies equivalence groups on both content and literal
+ *
+ * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
+ */
+const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+	let i = offset;
+	while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+	for (let j = 0; j < literal.length; j++) {
+		const litCh = literal[j];
+		while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+		if (i >= content.length) return null;
+		const cCh = content[i];
+		if (equivKey(cCh) !== equivKey(litCh)) return null;
+		i++;
+	}
+	while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+	return i;
+};
+const isLiteralOnly = (s) => {
+	return !/[\\[\]{}()^$.*+?]/.test(s);
+};
+const compileLiteralAlternation = (pattern) => {
+	if (!pattern) return null;
+	if (!isLiteralOnly(pattern)) return null;
+	const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+	if (!alternatives.length) return null;
+	return { alternatives };
+};
+/**
+ * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+ * Returns null if not eligible.
+ */
+const compileFastFuzzyTokenRule = (tokenTemplate) => {
+	const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+	if (!m) return null;
+	const token = m[1];
+	const tokenPattern = getTokenPattern(token);
+	if (!tokenPattern) return null;
+	const compiled = compileLiteralAlternation(tokenPattern);
+	if (!compiled) return null;
+	return {
+		alternatives: compiled.alternatives,
+		token
+	};
+};
+/**
+ * Try matching any alternative for a compiled token at a line-start offset.
+ * Returns endOffset (exclusive) on match, else null.
+ */
+const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+	for (const alt of compiled.alternatives) {
+		const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+		if (end !== null) return end;
+	}
+	return null;
+};
+
+//#endregion
+//#region src/segmentation/match-utils.ts
+/**
+ * Utility functions for regex matching and result processing.
+ *
+ * These functions were extracted from `segmenter.ts` to reduce complexity
+ * and enable independent testing. They handle match filtering, capture
+ * extraction, and occurrence-based selection.
+ *
+ * @module match-utils
+ */
+/**
+ * Extracts named capture groups from a regex match.
+ *
+ * Only includes groups that are in the `captureNames` list and have
+ * defined values. This filters out positional captures and ensures
+ * only explicitly requested named captures are returned.
+ *
+ * @param groups - The `match.groups` object from `RegExp.exec()`
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
+ * @returns Object with capture name → value pairs, or `undefined` if none found
+ *
+ * @example
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
+ * extractNamedCaptures(match.groups, ['num'])
+ * // → { num: '٦٦٩٦' }
+ *
+ * @example
+ * // No matching captures
+ * extractNamedCaptures({}, ['num'])
+ * // → undefined
+ *
+ * @example
+ * // Undefined groups
+ * extractNamedCaptures(undefined, ['num'])
+ * // → undefined
+ */
+const extractNamedCaptures = (groups, captureNames) => {
+	if (!groups || captureNames.length === 0) return;
+	const namedCaptures = {};
+	for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
+	return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
+};
+/**
+ * Gets the last defined positional capture group from a match array.
+ *
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
+ * is always at the end of the pattern. Named captures may shift the
+ * positional indices, so we iterate backward to find the actual content.
+ *
+ * @param match - RegExp exec result array
+ * @returns The last defined capture group value, or `undefined` if none
+ *
+ * @example
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
+ * getLastPositionalCapture(match)
+ * // → 'content'
+ *
+ * @example
+ * // No captures
+ * getLastPositionalCapture(['full match'])
+ * // → undefined
+ */
+const getLastPositionalCapture = (match) => {
+	if (match.length <= 1) return;
+	for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
+};
+/**
+ * Filters matches to only include those within page ID constraints.
+ *
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
+ * matches that occur on pages outside the allowed range or explicitly excluded.
+ *
+ * @param matches - Array of match results to filter
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
+ * @param getId - Function that returns the page ID for a given offset
+ * @returns Filtered array containing only matches within constraints
+ *
+ * @example
+ * const matches = [
+ *   { start: 0, end: 10 },    // Page 1
+ *   { start: 100, end: 110 }, // Page 5
+ *   { start: 200, end: 210 }, // Page 10
+ * ];
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
+ */
+const filterByConstraints = (matches, rule, getId) => {
+	return matches.filter((m) => {
+		const id = getId(m.start);
+		if (rule.min !== void 0 && id < rule.min) return false;
+		if (rule.max !== void 0 && id > rule.max) return false;
+		if (isPageExcluded(id, rule.exclude)) return false;
+		return true;
+	});
+};
+/**
+ * Checks if any rule in the list allows the given page ID.
+ *
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
+ * Rules without constraints allow all page IDs.
+ *
+ * This is used to determine whether to create a segment for content
+ * that appears before any split points (the "first segment").
+ *
+ * @param rules - Array of rules with optional `min` and `max` constraints
+ * @param pageId - Page ID to check
+ * @returns `true` if at least one rule allows the page ID
+ *
+ * @example
+ * const rules = [
+ *   { min: 5, max: 10 }, // Allows pages 5-10
+ *   { min: 20 },         // Allows pages 20+
+ * ];
+ *
+ * anyRuleAllowsId(rules, 7)  // → true (first rule allows)
+ * anyRuleAllowsId(rules, 3)  // → false (no rule allows)
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
+ *
+ * @example
+ * // Rules without constraints allow everything
+ * anyRuleAllowsId([{}], 999) // → true
+ */
+const anyRuleAllowsId = (rules, pageId) => {
+	return rules.some((r) => {
+		const minOk = r.min === void 0 || pageId >= r.min;
+		const maxOk = r.max === void 0 || pageId <= r.max;
+		return minOk && maxOk;
+	});
+};
+
 //#endregion
 //#region src/segmentation/rule-regex.ts
 /**
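The fast path added above replaces regex work with a plain character walk. Assuming the module's `matchFuzzyLiteralPrefixAt` is in scope, a vocalized heading matches its plain literal because diacritics (char codes 1611–1618, i.e. U+064B–U+0652) are skipped and variant letters are folded by `equivKey`:

```js
// "كِتَابُ" matches the literal "كتاب": the kasra/fatha/damma are skipped.
console.log(matchFuzzyLiteralPrefixAt("كِتَابُ الصلاة", 0, "كتاب")); // 7 — end offset past the vocalized word
// Variant folding applies to both sides: أ/إ/آ → ا, ه → ة, ي → ى.
console.log(matchFuzzyLiteralPrefixAt("إلى", 0, "الى"));             // 3
console.log(matchFuzzyLiteralPrefixAt("باب الطهارة", 0, "كتاب"));    // null — first letters differ
```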
@@ -1282,6 +1344,21 @@ const hasCapturingGroup = (pattern) => {
 	return /\((?!\?)/.test(pattern);
 };
 /**
+ * Extracts named capture group names from a regex pattern.
+ *
+ * Parses patterns like `(?<num>[0-9]+)` and returns `['num']`.
+ *
+ * @example
+ * extractNamedCaptureNames('^(?<num>[٠-٩]+)\\s+') // ['num']
+ * extractNamedCaptureNames('^(?<a>\\d+)(?<b>\\w+)') // ['a', 'b']
+ * extractNamedCaptureNames('^\\d+') // []
+ */
+const extractNamedCaptureNames = (pattern) => {
+	const names = [];
+	for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
+	return names;
+};
+/**
  * Safely compiles a regex pattern, throwing a helpful error if invalid.
  */
 const compileRuleRegex = (pattern) => {
@@ -1297,56 +1374,59 @@ const compileRuleRegex = (pattern) => {
  *
  * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
  */
-const processPattern = (pattern, fuzzy) => {
-	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
+const processPattern = (pattern, fuzzy, capturePrefix) => {
+	const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
 	return {
 		captureNames,
 		pattern: expanded
 	};
 };
-const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
+	const captureNames = processed.flatMap((p) => p.captureNames);
+	const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
+	if (capturePrefix) captureNames.push(`${capturePrefix}content`);
 	return {
-		captureNames: processed.flatMap((p) => p.captureNames),
-		regex: `^(?:${union})(.*)`
+		captureNames,
+		regex: `^(?:${union})${contentCapture}`
 	};
 };
-const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
 	return {
 		captureNames: processed.flatMap((p) => p.captureNames),
 		regex: `^(?:${union})`
 	};
 };
-const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
-	const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+	const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 	const union = processed.map((p) => p.pattern).join("|");
 	return {
 		captureNames: processed.flatMap((p) => p.captureNames),
 		regex: `(?:${union})$`
 	};
 };
-const buildTemplateRegexSource = (template) => {
-	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+const buildTemplateRegexSource = (template, capturePrefix) => {
+	const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
 	return {
 		captureNames,
 		regex: pattern
 	};
 };
-const determineUsesCapture = (regexSource, …
+const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(regexSource);
 /**
  * Builds a compiled regex and metadata from a split rule.
  *
  * Behavior mirrors the previous implementation in `segmenter.ts`.
  */
-const buildRuleRegex = (rule) => {
+const buildRuleRegex = (rule, capturePrefix) => {
 	const s = { ...rule };
 	const fuzzy = rule.fuzzy ?? false;
 	let allCaptureNames = [];
 	if (s.lineStartsAfter?.length) {
-		const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+		const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
 		allCaptureNames = captureNames;
 		return {
 			captureNames: allCaptureNames,
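For `lineStartsAfter` rules the trailing content capture also has to survive combination, so with a prefix the anonymous `(.*)` becomes a named `(?<${capturePrefix}content>.*)` group. A hand-written stand-in for one expanded rule shows the shape:

```js
// Stand-in for an expanded lineStartsAfter pattern built with capturePrefix "r0_".
const re = /^(?:الحديث \d+ )(?<r0_content>.*)/m;
const m = re.exec("الحديث 12 متن الحديث");
console.log(m.groups.r0_content); // "متن الحديث" — the per-rule content capture
```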
@@ -1356,21 +1436,22 @@ const buildRuleRegex = (rule) => {
 		};
 	}
 	if (s.lineStartsWith?.length) {
-		const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+		const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = captureNames;
 	}
 	if (s.lineEndsWith?.length) {
-		const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+		const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = captureNames;
 	}
 	if (s.template) {
-		const { regex, captureNames } = buildTemplateRegexSource(s.template);
+		const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
 		s.regex = regex;
 		allCaptureNames = [...allCaptureNames, ...captureNames];
 	}
 	if (!s.regex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
+	if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(s.regex);
 	const usesCapture = determineUsesCapture(s.regex, allCaptureNames);
 	return {
 		captureNames: allCaptureNames,
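The new fallback line matters for rules supplied as a raw `regex` string: those never pass through token expansion, so before this change their named groups were invisible to `captureNames`. The added `extractNamedCaptureNames` scan recovers them from the pattern source:

```js
const extractNamedCaptureNames = (pattern) => {
	const names = [];
	for (const match of pattern.matchAll(/\(\?<([^>]+)>/g)) names.push(match[1]);
	return names;
};
console.log(extractNamedCaptureNames("^(?<num>[٠-٩]+)\\s+(?<title>.+)$")); // ["num", "title"]
```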
@@ -1521,9 +1602,120 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 	return [initialSeg];
 };
 const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
-	const …
+	const combinableRules = [];
+	const standaloneRules = [];
+	const fastFuzzyRules = [];
+	rules.forEach((rule, index) => {
+		let isCombinable = true;
+		if (rule.fuzzy && "lineStartsWith" in rule && Array.isArray(rule.lineStartsWith)) {
+			const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+			if (compiled) {
+				fastFuzzyRules.push({
+					compiled,
+					rule,
+					ruleIndex: index
+				});
+				return;
+			}
+		}
+		if ("regex" in rule && rule.regex) {
+			const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+			const hasBackreferences = /\\[1-9]/.test(rule.regex);
+			const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+			if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+		}
+		if (isCombinable) combinableRules.push({
+			index,
+			prefix: `r${index}_`,
+			rule
+		});
+		else standaloneRules.push(rule);
+	});
+	const splitPointsByRule = /* @__PURE__ */ new Map();
+	if (fastFuzzyRules.length > 0) {
+		let boundaryIdx = 0;
+		let currentBoundary = pageMap.boundaries[boundaryIdx];
+		const advanceBoundaryTo = (offset) => {
+			while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+				boundaryIdx++;
+				currentBoundary = pageMap.boundaries[boundaryIdx];
+			}
+		};
+		const recordSplitPoint = (ruleIndex, sp) => {
+			if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+			splitPointsByRule.get(ruleIndex).push(sp);
+		};
+		for (let lineStart = 0; lineStart <= matchContent.length;) {
+			advanceBoundaryTo(lineStart);
+			const pageId = currentBoundary?.id ?? 0;
+			if (lineStart >= matchContent.length) break;
+			for (const { compiled, rule, ruleIndex } of fastFuzzyRules) {
+				if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+				const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+				if (end === null) continue;
+				recordSplitPoint(ruleIndex, {
+					index: (rule.split ?? "at") === "at" ? lineStart : end,
+					meta: rule.meta
+				});
+			}
+			const nextNl = matchContent.indexOf("\n", lineStart);
+			if (nextNl === -1) break;
+			lineStart = nextNl + 1;
+		}
+	}
+	if (combinableRules.length > 0) {
+		const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
+			const built = buildRuleRegex(rule, prefix);
+			return {
+				prefix,
+				source: `(?<${prefix}>${built.regex.source})`,
+				...built
+			};
+		});
+		const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+		const combinedRegex = new RegExp(combinedSource, "gm");
+		combinedRegex.lastIndex = 0;
+		let m = combinedRegex.exec(matchContent);
+		while (m !== null) {
+			const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+			if (matchedRuleIndex !== -1) {
+				const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
+				const ruleInfo = ruleRegexes[matchedRuleIndex];
+				const namedCaptures = {};
+				if (m.groups) {
+					for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
+						const cleanName = prefixedName.slice(prefix.length);
+						namedCaptures[cleanName] = m.groups[prefixedName];
+					}
+				}
+				let capturedContent;
+				let contentStartOffset;
+				if (ruleInfo.usesLineStartsAfter) {
+					capturedContent = m.groups?.[`${prefix}content`];
+					if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
+				}
+				const start = m.index;
+				const end = m.index + m[0].length;
+				const pageId = pageMap.getId(start);
+				if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+					const sp = {
+						capturedContent: void 0,
+						contentStartOffset,
+						index: (rule.split ?? "at") === "at" ? start : end,
+						meta: rule.meta,
+						namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+					};
+					if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+					splitPointsByRule.get(originalIndex).push(sp);
+				}
+			}
+			if (m[0].length === 0) combinedRegex.lastIndex++;
+			m = combinedRegex.exec(matchContent);
+		}
+	}
+	const collectSplitPointsFromRule = (rule, ruleIndex) => {
 		const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-		…
+		const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).map((m) => {
 			const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
 			const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
 			return {
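The rewritten collector above makes three passes: a character-walk pass for fast-fuzzy rules, one combined regex for all capture-free (or prefixable) rules, and per-rule scans for the rest. The combined pass wraps each rule's source in a named group keyed by its index (`r${index}_`), so a single `exec` both finds a match and identifies the owning rule. A reduced sketch of that dispatch:

```js
const sources = ["^كتاب .*$", "^باب .*$"]; // stand-ins for two compiled rule regexes
const combined = new RegExp(sources.map((s, i) => `(?<r${i}_>${s})`).join("|"), "gm");
for (const m of "باب الطهارة\nكتاب الصلاة".matchAll(combined)) {
	const ruleIndex = sources.findIndex((_, i) => m.groups[`r${i}_`] !== void 0);
	console.log(ruleIndex, m[0]); // 1 "باب الطهارة", then 0 "كتاب الصلاة"
}
```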
@@ -1534,8 +1726,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 				namedCaptures: m.namedCaptures
 			};
 		});
+		if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+		splitPointsByRule.get(ruleIndex).push(...points);
 	};
-	…
+	standaloneRules.forEach((rule) => {
+		collectSplitPointsFromRule(rule, rules.indexOf(rule));
+	});
+	const finalSplitPoints = [];
+	rules.forEach((rule, index) => {
+		const points = splitPointsByRule.get(index);
+		if (!points || points.length === 0) return;
+		let filtered = points;
+		if (rule.occurrence === "first") filtered = [points[0]];
+		else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
+		finalSplitPoints.push(...filtered);
+	});
+	return finalSplitPoints;
 };
 /**
  * Executes a regex against content and extracts match results with capture information.