flappa-doormal 2.8.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +59 -14
- package/README.md +163 -47
- package/dist/index.d.mts +155 -39
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +338 -23
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/dist/index.mjs
CHANGED
@@ -155,6 +155,154 @@ const makeDiacriticInsensitive = (text) => {
   return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
 };
 
+//#endregion
+//#region src/segmentation/types.ts
+/**
+ * Pattern type key names for split rules.
+ *
+ * Use this array to dynamically iterate over pattern types in UIs,
+ * or use the `PatternTypeKey` type for type-safe string unions.
+ *
+ * @example
+ * // Build a dropdown/select in UI
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
+ *
+ * @example
+ * // Type-safe pattern key validation
+ * const validateKey = (k: string): k is PatternTypeKey =>
+ *   (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
+ */
+const PATTERN_TYPE_KEYS = [
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith",
+  "template",
+  "regex"
+];
+
+//#endregion
+//#region src/segmentation/optimize-rules.ts
+/**
+ * Rule optimization utilities for merging and sorting split rules.
+ *
+ * Provides `optimizeRules()` to:
+ * 1. Merge compatible rules with same pattern type and options
+ * 2. Deduplicate patterns within each rule
+ * 3. Sort rules by specificity (longer patterns first)
+ *
+ * @module optimize-rules
+ */
+const MERGEABLE_KEYS = new Set([
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith"
+]);
+/**
+ * Get the pattern type key for a rule.
+ */
+const getPatternKey = (rule) => {
+  for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
+  return "regex";
+};
+/**
+ * Get the pattern array for a mergeable rule.
+ */
+const getPatternArray = (rule, key) => {
+  const value = rule[key];
+  return Array.isArray(value) ? value : [];
+};
+/**
+ * Get a string representation of the pattern value (for specificity scoring).
+ */
+const getPatternString = (rule, key) => {
+  const value = rule[key];
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) return value.join("\n");
+  return "";
+};
+/**
+ * Deduplicate and sort patterns by length (longest first).
+ */
+const normalizePatterns = (patterns) => {
+  return [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
+};
+/**
+ * Calculate specificity score for a rule (higher = more specific).
+ * Based on the longest pattern length.
+ */
+const getSpecificityScore = (rule) => {
+  const key = getPatternKey(rule);
+  if (MERGEABLE_KEYS.has(key)) return getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0);
+  return getPatternString(rule, key).length;
+};
+/**
+ * Create a merge key for a rule based on pattern type and all non-pattern properties.
+ * Rules with the same merge key can have their patterns combined.
+ */
+const createMergeKey = (rule) => {
+  const patternKey = getPatternKey(rule);
+  const { [patternKey]: _pattern, ...rest } = rule;
+  return `${patternKey}|${JSON.stringify(rest)}`;
+};
+/**
+ * Optimize split rules by merging compatible rules and sorting by specificity.
+ *
+ * This function:
+ * 1. **Merges compatible rules**: Rules with the same pattern type and identical
+ *    options (meta, fuzzy, min/max, etc.) have their pattern arrays combined
+ * 2. **Deduplicates patterns**: Removes duplicate patterns within each rule
+ * 3. **Sorts by specificity**: Rules with longer patterns come first
+ *
+ * Only array-based pattern types (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`)
+ * can be merged. `template` and `regex` rules are kept separate.
+ *
+ * @param rules - Array of split rules to optimize
+ * @returns Optimized rules and count of merged rules
+ *
+ * @example
+ * import { optimizeRules } from 'flappa-doormal';
+ *
+ * const { rules, mergedCount } = optimizeRules([
+ *   { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
+ * ]);
+ *
+ * // rules[0] = { lineStartsWith: ['{{kitab}}', '{{bab}}'], fuzzy: true, meta: { type: 'header' } }
+ * // rules[1] = { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } }
+ * // mergedCount = 1
+ */
+const optimizeRules = (rules) => {
+  const output = [];
+  const indexByMergeKey = /* @__PURE__ */ new Map();
+  let mergedCount = 0;
+  for (const rule of rules) {
+    const patternKey = getPatternKey(rule);
+    if (!MERGEABLE_KEYS.has(patternKey)) {
+      output.push(rule);
+      continue;
+    }
+    const mergeKey = createMergeKey(rule);
+    const existingIndex = indexByMergeKey.get(mergeKey);
+    if (existingIndex === void 0) {
+      indexByMergeKey.set(mergeKey, output.length);
+      output.push({
+        ...rule,
+        [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
+      });
+      continue;
+    }
+    const existing = output[existingIndex];
+    existing[patternKey] = normalizePatterns([...getPatternArray(existing, patternKey), ...getPatternArray(rule, patternKey)]);
+    mergedCount++;
+  }
+  output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
+  return {
+    mergedCount,
+    rules: output
+  };
+};
+
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
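
The merge behavior is easiest to see end to end. A minimal sketch using the new `optimizeRules()` and `PATTERN_TYPE_KEYS` exports (the rule objects are illustrative, modeled on the JSDoc example above):

import { PATTERN_TYPE_KEYS, optimizeRules } from 'flappa-doormal';

// The two lineStartsWith rules share fuzzy + meta, so their patterns merge
// (deduplicated, longest first); the template rule is never merged.
const { rules, mergedCount } = optimizeRules([
  { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { lineStartsWith: ['{{bab}}', '{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { template: '{{raqms}} {{dash}}', meta: { type: 'entry' } },
]);
console.log(mergedCount); // 1 — one rule was folded into another
console.log(PATTERN_TYPE_KEYS.includes('template')); // true

Note that `createMergeKey` compares options via `JSON.stringify`, so rules only merge when their non-pattern properties serialize identically.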
@@ -626,6 +774,51 @@ const shouldDefaultToFuzzy = (patterns) => {
     return FUZZY_TOKEN_REGEX.test(p);
   });
 };
+/**
+ * Apply token mappings to a template string.
+ *
+ * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
+ * Useful for applying user-configured capture names to a raw template.
+ *
+ * - Only affects exact matches of `{{token}}`.
+ * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
+ * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
+ *
+ * @param template - The template string to transform
+ * @param mappings - Array of mappings from token name to capture name
+ * @returns Transformed template string with captures applied
+ *
+ * @example
+ * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
+ * // → '{{raqms:num}} {{dash}}'
+ */
+const applyTokenMappings = (template, mappings) => {
+  let result = template;
+  for (const { token, name } of mappings) {
+    if (!token || !name) continue;
+    const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
+    result = result.replace(regex, `{{${token}:${name}}}`);
+  }
+  return result;
+};
+/**
+ * Strip token mappings from a template string.
+ *
+ * Transforms `{{token:name}}` back into `{{token}}`.
+ * Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
+ *
+ * Useful for normalizing templates for storage or comparison.
+ *
+ * @param template - The template string to strip
+ * @returns Template string with capture names removed
+ *
+ * @example
+ * stripTokenMappings('{{raqms:num}} {{dash}}')
+ * // → '{{raqms}} {{dash}}'
+ */
+const stripTokenMappings = (template) => {
+  return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
+};
 
 //#endregion
 //#region src/segmentation/pattern-validator.ts
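
The two helpers are inverses for the simple case, which suits editor UIs that let users attach capture names and then normalize the template for storage. A small round trip (the capture name `num` is illustrative):

import { applyTokenMappings, stripTokenMappings } from 'flappa-doormal';

const named = applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }]);
// named === '{{raqms:num}} {{dash}}' — only the exact {{raqms}} token is renamed

const plain = stripTokenMappings(named);
// plain === '{{raqms}} {{dash}}' — capture names removed again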
@@ -645,8 +838,13 @@ const buildBareTokenRegex = () => {
  * Validates a single pattern for common issues.
  */
 const validatePattern = (pattern, seenPatterns) => {
+  if (!pattern.trim()) return {
+    message: "Empty pattern is not allowed",
+    type: "empty_pattern"
+  };
   if (seenPatterns.has(pattern)) return {
     message: `Duplicate pattern: "${pattern}"`,
+    pattern,
     type: "duplicate"
   };
   seenPatterns.add(pattern);
@@ -656,6 +854,7 @@ const validatePattern = (pattern, seenPatterns) => {
     if (!KNOWN_TOKENS.has(tokenName)) return {
       message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
       suggestion: `Check spelling or use a known token`,
+      token: tokenName,
       type: "unknown_token"
     };
   }
@@ -670,6 +869,7 @@ const validatePattern = (pattern, seenPatterns) => {
     if (before !== "{{" && after !== "}}") return {
       message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
       suggestion: `{{${fullMatch}}}`,
+      token: tokenName,
       type: "missing_braces"
     };
   }
@@ -727,7 +927,7 @@ const validateRules = (rules) => {
       hasIssues = true;
     }
   }
-  if ("template" in rule && rule.template) {
+  if ("template" in rule && rule.template !== void 0) {
     const seenPatterns = /* @__PURE__ */ new Set();
     const issue = validatePattern(rule.template, seenPatterns);
     if (issue) {
@@ -738,6 +938,39 @@ const validateRules = (rules) => {
     return hasIssues ? result : void 0;
   });
 };
+/**
+ * Formats a validation result array into a list of human-readable error messages.
+ *
+ * Useful for displaying validation errors in UIs.
+ *
+ * @param results - The result array from `validateRules()`
+ * @returns Array of formatted error strings
+ *
+ * @example
+ * const issues = validateRules(rules);
+ * const errors = formatValidationReport(issues);
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
+ */
+const formatValidationReport = (results) => {
+  const errors = [];
+  results.forEach((result, ruleIndex) => {
+    if (!result) return;
+    const formatIssue = (issue, location) => {
+      if (!issue) return;
+      const type = issue.type;
+      if (type === "missing_braces" && issue.token) errors.push(`${location}: Missing {{}} around token "${issue.token}"`);
+      else if (type === "unknown_token" && issue.token) errors.push(`${location}: Unknown token "{{${issue.token}}}"`);
+      else if (type === "duplicate" && issue.pattern) errors.push(`${location}: Duplicate pattern "${issue.pattern}"`);
+      else if (issue.message) errors.push(`${location}: ${issue.message}`);
+      else errors.push(`${location}: ${type}`);
+    };
+    for (const [patternType, issues] of Object.entries(result)) {
+      const list = Array.isArray(issues) ? issues : [issues];
+      for (const issue of list) if (issue) formatIssue(issue, `Rule ${ruleIndex + 1}, ${patternType}`);
+    }
+  });
+  return errors;
+};
 
 //#endregion
 //#region src/segmentation/replace.ts
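
Taken together, the validator hunks make three changes: issues now carry structured `token`/`pattern` fields, empty patterns are rejected with a new `empty_pattern` type, and the template check moved from a truthiness test to `!== void 0`, so an empty-string template is now validated (and flagged) rather than silently skipped. A sketch of the reporting flow, mirroring the JSDoc example (exact message strings may vary):

import { validateRules, formatValidationReport } from 'flappa-doormal';

const issues = validateRules([
  { lineStartsWith: ['{{kitab}}', '{{kitab}}'] }, // duplicate pattern
  { template: '' },                               // now caught: empty_pattern
]);
const errors = formatValidationReport(issues);
// e.g. ['Rule 1, lineStartsWith: Duplicate pattern "{{kitab}}"',
//       'Rule 2, template: Empty pattern is not allowed']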
@@ -1245,16 +1478,71 @@ const handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPositi
  */
 const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
   const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
-  for (
+  for (let i = 0; i < expandedBreakpoints.length; i++) {
+    const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
     if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
     if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
     if (skipWhenRegex?.test(remainingContent)) continue;
-    if (regex === null) return
+    if (regex === null) return {
+      breakpointIndex: i,
+      breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+      rule
+    };
     const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
-    if (breakPos > 0) return
+    if (breakPos > 0) return {
+      breakpointIndex: i,
+      breakPos,
+      rule
+    };
   }
-  return
+  return null;
+};
+
+//#endregion
+//#region src/segmentation/debug-meta.ts
+const resolveDebugConfig = (debug) => {
+  if (!debug) return null;
+  if (debug === true) return {
+    includeBreakpoint: true,
+    includeRule: true,
+    metaKey: "_flappa"
+  };
+  if (typeof debug !== "object") return null;
+  const metaKey = debug.metaKey;
+  const include = debug.include;
+  const includeRule = Array.isArray(include) ? include.includes("rule") : true;
+  return {
+    includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
+    includeRule,
+    metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
+  };
+};
+const getRulePatternType = (rule) => {
+  if ("lineStartsWith" in rule) return "lineStartsWith";
+  if ("lineStartsAfter" in rule) return "lineStartsAfter";
+  if ("lineEndsWith" in rule) return "lineEndsWith";
+  if ("template" in rule) return "template";
+  return "regex";
+};
+const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
+const mergeDebugIntoMeta = (meta, metaKey, patch) => {
+  const out = meta ? { ...meta } : {};
+  const existing = out[metaKey];
+  out[metaKey] = {
+    ...isPlainObject(existing) ? existing : {},
+    ...patch
+  };
+  return out;
 };
+const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
+  index: ruleIndex,
+  patternType: getRulePatternType(rule)
+} });
+const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
+  index: breakpointIndex,
+  kind: rule.pattern === "" ? "pageBoundary" : "pattern",
+  pattern: rule.pattern
+} });
 
 //#endregion
 //#region src/segmentation/breakpoint-processor.ts
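
None of the `debug-meta` helpers are exported — they are internal to the bundle — but they fix the shape of the new `debug` option. How the config resolves, read straight from `resolveDebugConfig` above:

// debug: true turns everything on under the default key.
resolveDebugConfig(true);
// → { includeBreakpoint: true, includeRule: true, metaKey: '_flappa' }

// The object form narrows what is recorded and can rename the key.
resolveDebugConfig({ include: ['rule'], metaKey: '_dbg' });
// → { includeBreakpoint: false, includeRule: true, metaKey: '_dbg' }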
@@ -1338,15 +1626,20 @@ const createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds,
 const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
   if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
     const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
-    if (exclusionBreak > 0) return exclusionBreak;
+    if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
   }
-  const
+  const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
     expandedBreakpoints,
     normalizedPages,
     pageIds,
     prefer
   });
-
+  if (patternMatch && patternMatch.breakPos > 0) return {
+    breakOffset: patternMatch.breakPos,
+    breakpointIndex: patternMatch.breakpointIndex,
+    breakpointRule: patternMatch.rule
+  };
+  return { breakOffset: windowEndPosition };
 };
 /**
  * Advances cursor position past any leading whitespace.
@@ -1362,12 +1655,13 @@ const skipWhitespace$1 = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
-const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
   const result = [];
   const fullContent = segment.content;
   let cursorPos = 0;
   let currentFromIdx = fromIdx;
   let isFirstPiece = true;
+  let lastBreakpoint = null;
   const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
   logger?.debug?.("[breakpoints] boundaryPositions built", {
     boundaryPositions,
@@ -1382,7 +1676,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
     const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
     const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
     if (remainingSpan <= maxPages && !remainingHasExclusions) {
-      const
+      const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+      const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
+      const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
       if (finalSeg) result.push(finalSeg);
       break;
     }
@@ -1393,8 +1689,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
       cursorPos,
       windowEndIdx
     });
-    const
-
+    const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+    if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
+      breakpointIndex: found.breakpointIndex,
+      rule: found.breakpointRule
+    };
+    const breakPos = cursorPos + found.breakOffset;
     const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
     const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
     logger?.trace?.("[breakpoints] piece", {
@@ -1403,7 +1703,8 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
       pieceLength: pieceContent.length
     });
     if (pieceContent) {
-      const
+      const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+      const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
       if (pieceSeg) result.push(pieceSeg);
     }
     cursorPos = skipWhitespace$1(fullContent, breakPos);
@@ -1418,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
-const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
   const pageIds = pages.map((p) => p.id);
   const pageIdToIndex = buildPageIdToIndexMap(pageIds);
   const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1747,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
       result.push(segment);
       continue;
     }
-    const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+    const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
     result.push(...broken.map((s) => {
       const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
       const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -2059,13 +2360,25 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
   }
   return matches;
 };
-const applyOccurrenceFilter = (rules, splitPointsByRule) => {
+const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
   const result = [];
   rules.forEach((rule, index) => {
     const points = splitPointsByRule.get(index);
     if (!points?.length) return;
     const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
-
+    if (!debugMetaKey) {
+      result.push(...filtered.map((p) => ({
+        ...p,
+        ruleIndex: index
+      })));
+      return;
+    }
+    const debugPatch = buildRuleDebugPatch(index, rule);
+    result.push(...filtered.map((p) => ({
+      ...p,
+      meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
+      ruleIndex: index
+    })));
   });
   return result;
 };
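
With a debug meta key set, every split point is stamped with the rule that produced it before occurrence filtering returns. The patch merged into the point's meta has a small fixed shape, per `buildRuleDebugPatch` above:

// For the third rule in a list (index 2), a lineStartsAfter rule:
buildRuleDebugPatch(2, { lineStartsAfter: ['{{numbered}}'] });
// → { rule: { index: 2, patternType: 'lineStartsAfter' } }
// mergeDebugIntoMeta then nests this under the configured metaKey,
// preserving any user meta already on the point.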
@@ -2203,7 +2516,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
   if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
   return [initialSeg];
 };
-const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
   logger?.debug?.("[segmenter] collecting split points from rules", {
     contentLength: matchContent.length,
     ruleCount: rules.length
@@ -2218,7 +2531,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
   const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
   if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
   for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
-  return applyOccurrenceFilter(rules, splitPointsByRule);
+  return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
 };
 /**
  * Finds page breaks within a given offset range using binary search.
@@ -2321,6 +2634,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
 const segmentPages = (pages, options) => {
   const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+  const debug = resolveDebugConfig(options.debug);
+  const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
   logger?.info?.("[segmenter] starting segmentation", {
     breakpointCount: breakpoints.length,
     maxPages,
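
This is the user-facing entry point for the feature: `segmentPages()` now accepts a `debug` option and threads the resolved meta key into both the rule matcher and the breakpoint engine (below). A usage sketch, assuming a `pages` array as elsewhere in the API; the meta shapes in the comments are inferred from the debug patches above, so treat them as illustrative:

import { segmentPages } from 'flappa-doormal';

const segments = segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{kitab}}'], meta: { type: 'header' } }],
  debug: true,
});
// A matched segment's meta may now look like:
// { type: 'header', _flappa: { rule: { index: 0, patternType: 'lineStartsWith' } } }

// Record only breakpoint attribution, under a custom key:
segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{kitab}}'] }],
  debug: { include: ['breakpoint'], metaKey: '_dbg' },
});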
@@ -2334,7 +2649,7 @@ const segmentPages = (pages, options) => {
     pageIds: pageMap.pageIds,
     totalContentLength: matchContent.length
   });
-  const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
+  const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
   const unique = dedupeSplitPoints(splitPoints);
   logger?.debug?.("[segmenter] split points collected", {
     rawSplitPoints: splitPoints.length,
@@ -2353,7 +2668,7 @@ const segmentPages = (pages, options) => {
   if (maxPages >= 0 && breakpoints.length) {
     logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
     const patternProcessor = (p) => processPattern(p, false).pattern;
-    const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+    const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
     logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
     return result;
   }
@@ -2450,7 +2765,7 @@ const buildTokenPriority = () => {
   return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
 };
 const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
-const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
 const compileTokenRegexes = (tokenNames) => {
   const compiled = [];
   for (const token of tokenNames) {
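
The character class in the new `stripArabicDiacritics` spans the harakat/tanwin block (U+064B–U+065F), the superscript alef (U+0670), and the Quranic annotation marks (U+06D6–U+06ED). The helper is module-internal, so this is illustrative only:

// Not exported from the bundle; reproduced here to show the effect.
const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, '');
stripArabicDiacritics('مُحَمَّد'); // → 'محمد' (fatha, damma, shadda removed)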
@@ -3532,5 +3847,5 @@ function recoverMistakenMarkersForRuns(runs, opts) {
 }
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
+export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, applyTokenMappings, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, formatValidationReport, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map
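
Net new public API in this range, per the export diff: `PATTERN_TYPE_KEYS`, `applyTokenMappings`, `formatValidationReport`, `optimizeRules`, `shouldDefaultToFuzzy`, and `stripTokenMappings`, all importable from the package root:

import {
  PATTERN_TYPE_KEYS,
  applyTokenMappings,
  formatValidationReport,
  optimizeRules,
  shouldDefaultToFuzzy,
  stripTokenMappings,
} from 'flappa-doormal';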