npm - flappa-doormal - Versions diffs - 2.0.0 → 2.1.0 - Mend

flappa-doormal 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.mjs CHANGED Viewed

@@ -1227,7 +1227,7 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
 * @param prefer - 'longer' for last match, 'shorter' for first match
 * @returns Processed segments with oversized ones broken up
 */
-const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer) => {
+const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger) => {
 	const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
 		const startingPageId = pageIds$1[currentFromIdx];
 		if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
@@ -1259,72 +1259,168 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 	const patternProcessor = (p) => processPattern(p, false).pattern;
 	const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
 	const result = [];
+	logger?.info?.("Starting breakpoint processing", {
+		maxPages,
+		segmentCount: segments.length
+	});
 	for (const segment of segments) {
 		const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
 		const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
+		logger?.debug?.("Processing segment", {
+			contentLength: segment.content.length,
+			contentPreview: segment.content.slice(0, 100),
+			from: segment.from,
+			fromIdx,
+			to: segment.to,
+			toIdx
+		});
 		const segmentSpan = (segment.to ?? segment.from) - segment.from;
 		const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
 		if (segmentSpan <= maxPages && !hasExclusions) {
+			logger?.trace?.("Segment within limit, keeping as-is");
 			result.push(segment);
 			continue;
 		}
+		logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
 		let remainingContent = segment.content;
 		let currentFromIdx = fromIdx;
 		let isFirstPiece = true;
+		let iterationCount = 0;
+		const maxIterations = 1e4;
 		while (currentFromIdx <= toIdx) {
+			iterationCount++;
+			if (iterationCount > maxIterations) {
+				logger?.error?.("INFINITE LOOP DETECTED! Breaking out", { iterationCount: maxIterations });
+				logger?.error?.("Loop state", {
+					currentFromIdx,
+					remainingContentLength: remainingContent.length,
+					toIdx
+				});
+				break;
+			}
 			const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
+			logger?.trace?.("Loop iteration", {
+				currentFromIdx,
+				currentPageId: pageIds[currentFromIdx],
+				iterationCount,
+				remainingContentLength: remainingContent.length,
+				remainingContentPreview: remainingContent.slice(0, 80),
+				remainingSpan,
+				toIdx,
+				toPageId: pageIds[toIdx]
+			});
 			const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
 			if (remainingSpan <= maxPages && !remainingHasExclusions) {
+				logger?.debug?.("Remaining span within limit, outputting final segment");
 				const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
 				if (finalSeg) result.push(finalSeg);
 				break;
 			}
-			const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
+			const currentPageId = pageIds[currentFromIdx];
+			const maxWindowPageId = currentPageId + maxPages;
 			let windowEndIdx = currentFromIdx;
 			for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
 			else break;
+			logger?.trace?.("Window calculation", {
+				currentPageId,
+				maxWindowPageId,
+				windowEndIdx,
+				windowEndPageId: pageIds[windowEndIdx]
+			});
 			const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
 			let breakPosition = -1;
-			if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
-			if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, {
-				cumulativeOffsets,
-				expandedBreakpoints,
-				normalizedPages,
-				pageIds,
-				prefer
-			});
+			if (windowHasExclusions) {
+				logger?.trace?.("Window has exclusions, finding exclusion break position");
+				breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+				logger?.trace?.("Exclusion break position", { breakPosition });
+			}
 			if (breakPosition <= 0) {
+				const breakpointCtx = {
+					cumulativeOffsets,
+					expandedBreakpoints,
+					normalizedPages,
+					pageIds,
+					prefer
+				};
+				logger?.trace?.("Finding break position using patterns...");
+				breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
+				logger?.trace?.("Pattern break position", { breakPosition });
+			}
+			if (breakPosition <= 0) {
+				logger?.debug?.("No pattern matched, falling back to page boundary");
 				if (windowEndIdx === currentFromIdx) {
+					logger?.trace?.("Single page window, outputting page and advancing");
 					const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
 					const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
 					if (pageSeg) result.push(pageSeg);
 					remainingContent = remainingContent.slice(pageContent.length).trim();
 					currentFromIdx++;
 					isFirstPiece = false;
+					logger?.trace?.("After single page", {
+						currentFromIdx,
+						remainingContentLength: remainingContent.length
+					});
 					continue;
 				}
 				breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
+				logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
 			}
 			const pieceContent = remainingContent.slice(0, breakPosition).trim();
+			logger?.trace?.("Piece extracted", {
+				breakPosition,
+				pieceContentLength: pieceContent.length,
+				pieceContentPreview: pieceContent.slice(0, 80)
+			});
 			const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
 			const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
+			logger?.trace?.("Actual page indices", {
+				actualEndIdx,
+				actualStartIdx,
+				pieceHasContent: !!pieceContent
+			});
 			if (pieceContent) {
 				const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
-				if (pieceSeg) result.push(pieceSeg);
+				if (pieceSeg) {
+					result.push(pieceSeg);
+					logger?.debug?.("Created segment", {
+						contentLength: pieceSeg.content.length,
+						from: pieceSeg.from,
+						to: pieceSeg.to
+					});
+				}
 			}
+			const prevRemainingLength = remainingContent.length;
 			remainingContent = remainingContent.slice(breakPosition).trim();
+			logger?.trace?.("After slicing remainingContent", {
+				newLength: remainingContent.length,
+				prevLength: prevRemainingLength,
+				slicedAmount: breakPosition
+			});
+			if (!remainingContent) {
+				logger?.debug?.("No remaining content, breaking out of loop");
+				break;
+			}
 			let nextFromIdx = actualEndIdx;
-			if (remainingContent && actualEndIdx + 1 <= toIdx) {
+			if (actualEndIdx + 1 <= toIdx) {
 				const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
 				if (nextPageData) {
 					const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
-					if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
+					if (nextPrefix && remainingContent.startsWith(nextPrefix)) {
+						nextFromIdx = actualEndIdx + 1;
+						logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
+					}
 				}
 			}
+			logger?.trace?.("End of iteration", {
+				nextFromIdx,
+				prevCurrentFromIdx: currentFromIdx,
+				willAdvance: nextFromIdx !== currentFromIdx
+			});
 			currentFromIdx = nextFromIdx;
 			isFirstPiece = false;
 		}
 	}
+	logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
 	return result;
 };
 /**
@@ -1370,7 +1466,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 * });
 */
 const segmentPages = (pages, options) => {
-	const { rules = [], maxPages, breakpoints, prefer = "longer" } = options;
+	const { rules = [], maxPages, breakpoints, prefer = "longer", logger } = options;
 	if (!pages.length) return [];
 	const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
 	const splitPoints = [];
@@ -1408,7 +1504,7 @@ const segmentPages = (pages, options) => {
 		if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
 		if (initialSeg.content) segments = [initialSeg];
 	}
-	if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer);
+	if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger);
 	return segments;
 };
 /**
@@ -1480,5 +1576,160 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 //#endregion
-export { TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
+//#region src/pattern-detection.ts
+/**
+* Pattern detection utilities for recognizing template tokens in Arabic text.
+* Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
+*
+* @module pattern-detection
+*/
+/**
+* Token detection order - more specific patterns first to avoid partial matches.
+* Example: 'raqms' before 'raqm' so "٣٤" matches 'raqms' not just the first digit.
+*
+* This list is only an ordering hint. getTokenPriority() keeps the entries that
+* getAvailableTokens() also reports, in the order written here, and then appends every
+* other available token sorted with the default Array.prototype.sort() comparison.
+*/
+const TOKEN_PRIORITY_ORDER = [
+	"basmalah",
+	"kitab",
+	"bab",
+	"fasl",
+	"naql",
+	"numbered",
+	"raqms", // listed before "raqm" so the longer digit match is tried first
+	"raqm",
+	"tarqim",
+	"bullet",
+	"dash",
+	"harf"
+];
+/**
+* Gets the token detection priority order.
+* Returns tokens in priority order, with any available token not in the priority list appended.
+*
+* Entries of TOKEN_PRIORITY_ORDER that getAvailableTokens() does not report are dropped,
+* so the result only contains currently available token names.
+*
+* @returns Array of token names: the available entries of TOKEN_PRIORITY_ORDER in list
+* order, followed by the remaining available names in default sort() order
+*/
+const getTokenPriority = () => {
+	const allTokens = getAvailableTokens();
+	const prioritized = TOKEN_PRIORITY_ORDER.filter((t) => allTokens.includes(t)); // keeps TOKEN_PRIORITY_ORDER ordering
+	const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort(); // sorts the filtered copy, not allTokens
+	return [...prioritized, ...remaining];
+};
+/**
+* Analyzes text and returns all detected token patterns with their positions.
+* Patterns are detected in priority order to avoid partial matches.
+*
+* Each token name from getTokenPriority() is looked up in TOKEN_PATTERNS, wrapped in a
+* capture group and compiled with the "gu" flags. A match is kept only if its range does
+* not overlap a range that was already accepted; accepted ranges are recorded so that
+* lower-priority tokens cannot claim the same characters. The overlap test treats a
+* recorded range as [start, end): a candidate is rejected when its start falls inside the
+* range, when its end falls inside the range, or when it fully encloses the range.
+*
+* `index` and `endIndex` are UTF-16 code unit offsets into `text`; `endIndex` is exclusive.
+*
+* Anything thrown while compiling or running a pattern is swallowed by an empty catch, so
+* a failing pattern stops contributing results for that token without failing the call.
+*
+* NOTE(review): RegExp.prototype.exec with the "g" flag does not advance lastIndex after
+* an empty match. If any TOKEN_PATTERNS entry can match the empty string, the while loop
+* below would never terminate - confirm that no pattern is nullable.
+*
+* @param text - The text to analyze for token patterns
+* @returns Array of detected patterns sorted by position (empty array for falsy text)
+*
+* @example
+* detectTokenPatterns("٣٤ - حدثنا")
+* // Returns: [
+* //   { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
+* //   { token: 'dash', match: '-', index: 3, endIndex: 4 },
+* //   { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
+* // ]
+*/
+const detectTokenPatterns = (text) => {
+	if (!text) return [];
+	const results = [];
+	const coveredRanges = []; // [start, end) pairs of matches accepted so far
+	const isPositionCovered = (start, end) => {
+		return coveredRanges.some(([s, e]) => start >= s && start < e || end > s && end <= e || start <= s && end >= e);
+	};
+	for (const tokenName of getTokenPriority()) {
+		const pattern = TOKEN_PATTERNS[tokenName];
+		if (!pattern) continue; // token name without a usable pattern entry
+		try {
+			const regex = new RegExp(`(${pattern})`, "gu"); // fresh regex per token, so lastIndex starts at 0
+			let match;
+			while ((match = regex.exec(text)) !== null) {
+				const startIndex = match.index;
+				const endIndex = startIndex + match[0].length;
+				if (isPositionCovered(startIndex, endIndex)) continue; // already claimed by an accepted match
+				results.push({
+					endIndex,
+					index: startIndex,
+					match: match[0],
+					token: tokenName
+				});
+				coveredRanges.push([startIndex, endIndex]);
+			}
+		} catch {} // deliberately ignores errors from this pattern (see header note)
+	}
+	return results.sort((a, b) => a.index - b.index); // priority order -> position order
+};
+/**
+* Generates a template pattern from text using detected tokens.
+* Replaces matched portions with {{token}} syntax.
+*
+* Replacements are applied from the highest index to the lowest, so the `index` and
+* `endIndex` offsets of the entries still to be processed stay valid while the string
+* changes length. The `detected` array itself is not modified (a copy is sorted).
+* Text that is not covered by any detected entry is left untouched.
+*
+* @param text - Original text
+* @param detected - Array of detected patterns from detectTokenPatterns
+* @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "; `text` is returned
+* unchanged when it is falsy or when `detected` is empty
+*
+* @example
+* const detected = detectTokenPatterns("٣٤ - ");
+* generateTemplateFromText("٣٤ - ", detected);
+* // Returns: "{{raqms}} {{dash}} "
+*/
+const generateTemplateFromText = (text, detected) => {
+	if (!text || detected.length === 0) return text;
+	let template = text;
+	const sortedByIndexDesc = [...detected].sort((a, b) => b.index - a.index); // rightmost match first
+	for (const d of sortedByIndexDesc) template = `${template.slice(0, d.index)}{{${d.token}}}${template.slice(d.endIndex)}`;
+	return template;
+};
+/**
+* Determines the best pattern type for auto-generated rules based on detected patterns.
+*
+* Decision order (the first matching case wins):
+* 1. Any 'basmalah', 'kitab', 'bab' or 'fasl' token: fuzzy matching on, patternType
+*    "lineStartsWith", metaType set to the token of the first entry in `detected` that is
+*    'kitab', 'bab' or 'fasl', or "chapter" when there is none (e.g. only 'basmalah').
+* 2. Any 'raqms', 'raqm' or 'numbered' token: fuzzy matching off, patternType
+*    "lineStartsAfter", metaType "hadith".
+* 3. Otherwise: fuzzy matching off, patternType "lineStartsAfter", no metaType property.
+*
+* @param detected - Array of detected patterns
+* @returns Object with `patternType`, `fuzzy`, and (in cases 1 and 2) `metaType`
+*/
+const suggestPatternConfig = (detected) => {
+	const hasStructuralToken = detected.some((d) => [
+		"basmalah",
+		"kitab",
+		"bab",
+		"fasl"
+	].includes(d.token));
+	const hasNumberedPattern = detected.some((d) => [
+		"raqms",
+		"raqm",
+		"numbered"
+	].includes(d.token));
+	if (hasStructuralToken) return { // checked first, so it wins over a numbered token
+		fuzzy: true,
+		metaType: detected.find((d) => [
+			"kitab",
+			"bab",
+			"fasl"
+		].includes(d.token))?.token || "chapter", // 'basmalah' is not in this list, so it falls back to "chapter"
+		patternType: "lineStartsWith"
+	};
+	if (hasNumberedPattern) return {
+		fuzzy: false,
+		metaType: "hadith",
+		patternType: "lineStartsAfter"
+	};
+	return {
+		fuzzy: false,
+		patternType: "lineStartsAfter"
+	};
+};
+/**
+* Analyzes text and generates a complete suggested rule configuration.
+*
+* Runs detectTokenPatterns on the text, builds a template with generateTemplateFromText
+* and merges in the fields returned by suggestPatternConfig.
+*
+* @param text - Highlighted text from the page
+* @returns Suggested rule configuration or null if no patterns detected. The object holds
+* `detected` (the detection results), `template` (the generated template string) and the
+* properties spread from suggestPatternConfig (`patternType`, `fuzzy`, optional `metaType`)
+*/
+const analyzeTextForRule = (text) => {
+	const detected = detectTokenPatterns(text);
+	if (detected.length === 0) return null; // also the result for falsy text, which detects nothing
+	return {
+		detected,
+		template: generateTemplateFromText(text, detected),
+		...suggestPatternConfig(detected)
+	};
+};
+//#endregion
+export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.mjs.map