flappa-doormal 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1351,7 +1351,7 @@ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx
  /**
  * Advances cursor position past any leading whitespace.
  */
- const skipWhitespace = (content, startPos) => {
+ const skipWhitespace$1 = (content, startPos) => {
  let pos = startPos;
  while (pos < content.length && /\s/.test(content[pos])) pos++;
  return pos;
@@ -1406,7 +1406,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
  if (pieceSeg) result.push(pieceSeg);
  }
- cursorPos = skipWhitespace(fullContent, breakPos);
+ cursorPos = skipWhitespace$1(fullContent, breakPos);
  currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
  isFirstPiece = false;
  }
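
Editor's note: the skipWhitespace → skipWhitespace$1 rename above is not a behavior change. A second module-level skipWhitespace arrives later in this diff (in the new src/analysis/line-starts.ts region), so the bundler appears to suffix the segmentation helper with $1 to keep both in the flat index.mjs scope. A minimal sketch of that deconfliction, with illustrative file names:

// segmentation/chunker.js
export const skipWhitespace = (content, startPos) => { /* cursor past whitespace */ };

// analysis/line-starts.js
export const skipWhitespace = (s, pos, out, ws) => { /* signature-building variant */ };

// bundled index.mjs (illustrative): one of the identically named helpers
// is renamed so both can share a single module scope
const skipWhitespace$1 = (content, startPos) => { /* from chunker.js */ };
const skipWhitespace = (s, pos, out, ws) => { /* from line-starts.js */ };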
@@ -1959,6 +1959,117 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
  return splitPointsByRule;
  };

+ //#endregion
+ //#region src/segmentation/split-point-helpers.ts
+ /**
+ * Helper module for collectSplitPointsFromRules to reduce complexity.
+ * Handles combined regex matching and split point creation.
+ */
+ const MAX_REGEX_ITERATIONS = 1e5;
+ const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
+ const result = {};
+ if (!groups) return result;
+ for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
+ return result;
+ };
+ const buildContentOffsets = (match, ruleInfo) => {
+ if (!ruleInfo.usesLineStartsAfter) return {};
+ const captured = match.groups?.[`${ruleInfo.prefix}__content`];
+ if (captured === void 0) return {};
+ return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
+ };
+ const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
+ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
+ const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
+ const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
+ return {
+ capturedContent: void 0,
+ contentStartOffset,
+ index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
+ meta: rule.meta,
+ namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+ };
+ };
+ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
+ const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+ const combinedRegex = new RegExp(combinedSource, "gm");
+ logger?.debug?.("[segmenter] combined regex built", {
+ combinableRuleCount: combinableRules.length,
+ combinedSourceLength: combinedSource.length
+ });
+ let m = combinedRegex.exec(matchContent);
+ let iterations = 0;
+ while (m !== null) {
+ iterations++;
+ if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
+ if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
+ iterations,
+ position: m.index
+ });
+ const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+ if (matchedIndex !== -1) {
+ const { rule, index: originalIndex } = combinableRules[matchedIndex];
+ const ruleInfo = ruleRegexes[matchedIndex];
+ if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
+ const sp = createSplitPointFromMatch(m, rule, ruleInfo);
+ if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+ splitPointsByRule.get(originalIndex).push(sp);
+ }
+ }
+ if (m[0].length === 0) combinedRegex.lastIndex++;
+ m = combinedRegex.exec(matchContent);
+ }
+ };
+ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
+ const built = buildRuleRegex(rule, prefix);
+ return {
+ ...built,
+ prefix,
+ source: `(?<${prefix}>${built.regex.source})`
+ };
+ });
+ const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
+ const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
+ const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
+ const isLSA = usesLineStartsAfter && m.captured !== void 0;
+ const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
+ return {
+ capturedContent: isLSA ? void 0 : m.captured,
+ contentStartOffset: isLSA ? markerLen : void 0,
+ index: (rule.split ?? "at") === "at" ? m.start : m.end,
+ meta: rule.meta,
+ namedCaptures: m.namedCaptures
+ };
+ });
+ if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+ splitPointsByRule.get(ruleIndex).push(...points);
+ };
+ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
+ const matches = [];
+ let m = regex.exec(content);
+ while (m !== null) {
+ matches.push({
+ captured: usesCapture ? getLastPositionalCapture(m) : void 0,
+ end: m.index + m[0].length,
+ namedCaptures: extractNamedCaptures(m.groups, captureNames),
+ start: m.index
+ });
+ if (m[0].length === 0) regex.lastIndex++;
+ m = regex.exec(content);
+ }
+ return matches;
+ };
+ const applyOccurrenceFilter = (rules, splitPointsByRule) => {
+ const result = [];
+ rules.forEach((rule, index) => {
+ const points = splitPointsByRule.get(index);
+ if (!points?.length) return;
+ const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
+ result.push(...filtered);
+ });
+ return result;
+ };
+
  //#endregion
  //#region src/segmentation/textUtils.ts
  /**
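
Editor's note: the new split-point-helpers region above extracts logic that a later hunk removes from collectSplitPointsFromRules. The core technique in processCombinedMatches: wrap each combinable rule's pattern in a uniquely prefixed named group, join the sources with | into one gm regex, attribute each match back to its rule by checking which named group is defined, and bump lastIndex on zero-width matches so exec() cannot stall. A self-contained sketch of that pattern (the rule data is illustrative, not the library's API):

const rules = [
  { prefix: "r0", source: "^chapter \\d+" },
  { prefix: "r1", source: "^section \\d+" },
];
// one alternation, one pass over the content
const combined = new RegExp(rules.map((r) => `(?<${r.prefix}>${r.source})`).join("|"), "gm");
const text = "chapter 1\nbody\nsection 2\nbody";
let m;
while ((m = combined.exec(text)) !== null) {
  // exactly one prefixed group is defined per match, identifying the source rule
  const hit = rules.findIndex(({ prefix }) => m.groups?.[prefix] !== undefined);
  console.log(hit, m.index, m[0]); // 0 0 "chapter 1", then 1 15 "section 2"
  // a zero-width match would leave lastIndex unchanged and loop forever
  if (m[0].length === 0) combined.lastIndex++;
}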
@@ -1985,7 +2096,6 @@ const normalizeLineEndings = (content) => {
  *
  * @module segmenter
  */
- const MAX_REGEX_ITERATIONS = 1e5;
  /**
  * Builds a concatenated content string and page mapping from input pages.
  *
@@ -2082,7 +2192,7 @@ const dedupeSplitPoints = (splitPoints) => {
  const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
  if (segments.length > 0 || pages.length === 0) return segments;
  const firstPage = pages[0];
- const lastPage = pages[pages.length - 1];
+ const lastPage = pages.at(-1);
  const joinChar = pageJoiner === "newline" ? "\n" : " ";
  const allContent = normalizedContent.join(joinChar).trim();
  if (!allContent) return segments;
@@ -2106,124 +2216,9 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
  standaloneCount: standaloneRules.length
  });
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
- if (combinableRules.length > 0) {
- const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
- const built = buildRuleRegex(rule, prefix);
- return {
- prefix,
- source: `(?<${prefix}>${built.regex.source})`,
- ...built
- };
- });
- const combinedSource = ruleRegexes.map((r) => r.source).join("|");
- const combinedRegex = new RegExp(combinedSource, "gm");
- logger?.debug?.("[segmenter] combined regex built", {
- combinableRuleCount: combinableRules.length,
- combinedSourceLength: combinedSource.length
- });
- combinedRegex.lastIndex = 0;
- let m = combinedRegex.exec(matchContent);
- let iterationCount = 0;
- while (m !== null) {
- iterationCount++;
- if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
- if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
- iterationCount,
- lastIndex: combinedRegex.lastIndex,
- matchLength: m[0].length,
- matchPosition: m.index
- });
- const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
- if (matchedRuleIndex !== -1) {
- const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
- const ruleInfo = ruleRegexes[matchedRuleIndex];
- const namedCaptures = {};
- if (m.groups) {
- for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
- const cleanName = prefixedName.slice(prefix.length);
- namedCaptures[cleanName] = m.groups[prefixedName];
- }
- }
- let capturedContent;
- let contentStartOffset;
- if (ruleInfo.usesLineStartsAfter) {
- capturedContent = m.groups?.[`${prefix}__content`];
- if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
- }
- const start = m.index;
- const end = m.index + m[0].length;
- const pageId = pageMap.getId(start);
- if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
- const sp = {
- capturedContent: void 0,
- contentStartOffset,
- index: (rule.split ?? "at") === "at" ? start : end,
- meta: rule.meta,
- namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
- };
- if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
- splitPointsByRule.get(originalIndex).push(sp);
- }
- }
- if (m[0].length === 0) combinedRegex.lastIndex++;
- m = combinedRegex.exec(matchContent);
- }
- }
- const collectSplitPointsFromRule = (rule, ruleIndex) => {
- const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
- const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
- const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
- return {
- capturedContent: isLineStartsAfter ? void 0 : m.captured,
- contentStartOffset: isLineStartsAfter ? markerLength : void 0,
- index: (rule.split ?? "at") === "at" ? m.start : m.end,
- meta: rule.meta,
- namedCaptures: m.namedCaptures
- };
- });
- if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
- splitPointsByRule.get(ruleIndex).push(...points);
- };
- standaloneRules.forEach((rule) => {
- collectSplitPointsFromRule(rule, rules.indexOf(rule));
- });
- const finalSplitPoints = [];
- rules.forEach((rule, index) => {
- const points = splitPointsByRule.get(index);
- if (!points || points.length === 0) return;
- let filtered = points;
- if (rule.occurrence === "first") filtered = [points[0]];
- else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
- finalSplitPoints.push(...filtered);
- });
- return finalSplitPoints;
- };
- /**
- * Executes a regex against content and extracts match results with capture information.
- *
- * @param content - Full content string to search
- * @param regex - Compiled regex with 'g' flag
- * @param usesCapture - Whether to extract captured content
- * @param captureNames - Names of expected named capture groups
- * @returns Array of match results with positions and captures
- */
- const findMatches = (content, regex, usesCapture, captureNames) => {
- const matches = [];
- regex.lastIndex = 0;
- let m = regex.exec(content);
- while (m !== null) {
- const result = {
- end: m.index + m[0].length,
- start: m.index
- };
- result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
- if (usesCapture) result.captured = getLastPositionalCapture(m);
- matches.push(result);
- if (m[0].length === 0) regex.lastIndex++;
- m = regex.exec(content);
- }
- return matches;
+ if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
+ for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+ return applyOccurrenceFilter(rules, splitPointsByRule);
  };
  /**
  * Finds page breaks within a given offset range using binary search.
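
Editor's note: after this hunk, collectSplitPointsFromRules is a three-line dispatcher over the helpers added earlier; the per-rule occurrence filtering it used to inline now lives in applyOccurrenceFilter. A tiny sketch of that filter's behavior (illustrative data):

const points = [{ index: 10 }, { index: 50 }, { index: 90 }];
const pick = (occurrence) =>
  occurrence === "first" ? [points[0]]
  : occurrence === "last" ? [points.at(-1)]
  : points;
console.log(pick("last")); // [ { index: 90 } ]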
@@ -2410,7 +2405,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  const result = [];
  for (let i = 0; i < splitPoints.length; i++) {
  const sp = splitPoints[i];
- const end = i < splitPoints.length - 1 ? splitPoints[i + 1].index : content.length;
+ const end = splitPoints[i + 1]?.index ?? content.length;
  const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
  if (s) result.push(s);
  }
@@ -2434,29 +2429,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };

  //#endregion
- //#region src/analysis.ts
- const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
- const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
- const computeSpecificity = (pattern) => {
- const tokenCount = countTokenMarkers(pattern);
- return {
- literalLen: stripWhitespacePlaceholders(pattern).length,
- tokenCount
- };
- };
- const DEFAULT_OPTIONS = {
- includeFirstWordFallback: true,
- lineFilter: void 0,
- maxExamples: 1,
- minCount: 3,
- minLineLength: 6,
- normalizeArabicDiacritics: true,
- prefixChars: 60,
- prefixMatchers: [/^#+/u],
- sortBy: "specificity",
- topK: 40,
- whitespace: "regex"
- };
+ //#region src/analysis/shared.ts
  const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
  const TOKEN_PRIORITY_ORDER$1 = [
  "basmalah",
@@ -2497,30 +2470,7 @@ const appendWs = (out, mode) => {
  if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
  return out.endsWith("\\s*") ? out : `${out}\\s*`;
  };
- const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
- let matchedAny = false;
- let currentPos = pos;
- let currentOut = out;
- for (const re of prefixMatchers) {
- if (currentPos >= s.length) break;
- const m = re.exec(s.slice(currentPos));
- if (!m || m.index !== 0 || !m[0]) continue;
- currentOut += escapeSignatureLiteral(m[0]);
- currentPos += m[0].length;
- matchedAny = true;
- const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
- if (wsAfter) {
- currentPos += wsAfter[0].length;
- currentOut = appendWs(currentOut, whitespace);
- }
- }
- return {
- matchedAny,
- out: currentOut,
- pos: currentPos
- };
- };
- const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
  let best = null;
  for (const { token, re } of compiled) {
  re.lastIndex = pos;
@@ -2534,132 +2484,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
  if (best?.token === "rumuz") {
  const end = pos + best.text.length;
  const next = end < s.length ? s[end] : "";
- if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+ if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
  }
  return best;
  };
- const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers, whitespace) => {
+ const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
+ const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+
+ //#endregion
+ //#region src/analysis/line-starts.ts
+ const resolveOptions$1 = (options = {}) => ({
+ includeFirstWordFallback: options.includeFirstWordFallback ?? true,
+ lineFilter: options.lineFilter,
+ maxExamples: options.maxExamples ?? 1,
+ minCount: options.minCount ?? 3,
+ minLineLength: options.minLineLength ?? 6,
+ normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
+ prefixChars: options.prefixChars ?? 60,
+ prefixMatchers: options.prefixMatchers ?? [/^#+/u],
+ sortBy: options.sortBy ?? "specificity",
+ topK: options.topK ?? 40,
+ whitespace: options.whitespace ?? "regex"
+ });
+ const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+ const computeSpecificity = (pattern) => ({
+ literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
+ tokenCount: countTokenMarkers(pattern)
+ });
+ const compareBySpecificity = (a, b) => {
+ const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
+ return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
+ };
+ const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
+ /** Remove trailing whitespace placeholders */
+ const trimTrailingWs = (out, mode) => {
+ const suffix = mode === "regex" ? "\\s*" : " ";
+ while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+ return out;
+ };
+ /** Try to extract first word for fallback */
+ const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
+ /** Consume prefix matchers at current position */
+ const consumePrefixes = (s, pos, out, matchers, ws) => {
+ let matched = false;
+ for (const re of matchers) {
+ if (pos >= s.length) break;
+ const m = re.exec(s.slice(pos));
+ if (!m?.index && m?.[0]) {
+ out += escapeSignatureLiteral(m[0]);
+ pos += m[0].length;
+ matched = true;
+ const wsm = /^[ \t]+/u.exec(s.slice(pos));
+ if (wsm) {
+ pos += wsm[0].length;
+ out = appendWs(out, ws);
+ }
+ }
+ }
+ return {
+ matched,
+ out,
+ pos
+ };
+ };
+ /** Try to match a token at current position and append to signature */
+ const tryMatchToken = (s, pos, out, compiled) => {
+ const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+ if (!best) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: `${out}{{${best.token}}}`,
+ pos: pos + best.text.length
+ };
+ };
+ /** Try to match a delimiter at current position */
+ const tryMatchDelimiter = (s, pos, out) => {
+ const ch = s[pos];
+ if (!ch || !isCommonDelimiter(ch)) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: out + escapeSignatureLiteral(ch),
+ pos: pos + 1
+ };
+ };
+ /** Skip whitespace at position */
+ const skipWhitespace = (s, pos, out, ws) => {
+ const m = /^[ \t]+/u.exec(s.slice(pos));
+ if (!m) return {
+ out,
+ pos,
+ skipped: false
+ };
+ return {
+ out: appendWs(out, ws),
+ pos: pos + m[0].length,
+ skipped: true
+ };
+ };
+ const tokenizeLineStart = (line, tokenNames, opts) => {
  const trimmed = collapseWhitespace(line);
  if (!trimmed) return null;
- const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
- let pos = 0;
- let out = "";
- let matchedAny = false;
- let matchedToken = false;
+ const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
  const compiled = compileTokenRegexes(tokenNames);
- const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
- const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
- {
- const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers, whitespace);
- pos = consumed.pos;
- out = consumed.out;
- matchedAny = consumed.matchedAny;
- }
- let tokenSteps = 0;
- while (tokenSteps < 6 && pos < s.length) {
- const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
- if (wsMatch) {
- pos += wsMatch[0].length;
- out = appendWs(out, whitespace);
+ let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
+ const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+ pos = prefix.pos;
+ out = prefix.out;
+ matchedAny = prefix.matched;
+ while (steps < 6 && pos < s.length) {
+ const ws = skipWhitespace(s, pos, out, opts.whitespace);
+ if (ws.skipped) {
+ pos = ws.pos;
+ out = ws.out;
  continue;
  }
- const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
- if (best) {
- if (out && !out.endsWith("\\s*")) {}
- out += `{{${best.token}}}`;
- matchedAny = true;
- matchedToken = true;
- pos += best.text.length;
- tokenSteps++;
+ const tok = tryMatchToken(s, pos, out, compiled);
+ if (tok.matched) {
+ pos = tok.pos;
+ out = tok.out;
+ matchedAny = matchedToken = true;
+ steps++;
  continue;
  }
  if (matchedAny) {
- const ch = s[pos];
- if (ch && isCommonDelimiter(ch)) {
- out += escapeSignatureLiteral(ch);
- pos += 1;
+ const delim = tryMatchDelimiter(s, pos, out);
+ if (delim.matched) {
+ pos = delim.pos;
+ out = delim.out;
  continue;
  }
  }
  if (matchedAny) {
- if (includeFirstWordFallback && !matchedToken) {
- const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord$1) break;
- out += escapeSignatureLiteral(firstWord$1);
- tokenSteps++;
+ if (opts.includeFirstWordFallback && !matchedToken) {
+ const word$1 = extractFirstWord(s.slice(pos));
+ if (word$1) {
+ out += escapeSignatureLiteral(word$1);
+ steps++;
+ }
  }
  break;
  }
- if (!includeFirstWordFallback) return null;
- const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord) return null;
- out += escapeSignatureLiteral(firstWord);
- tokenSteps++;
- return out;
- }
- if (!matchedAny) return null;
- if (whitespace === "regex") while (out.endsWith("\\s*")) out = out.slice(0, -3);
- else while (out.endsWith(" ")) out = out.slice(0, -1);
- return out;
+ if (!opts.includeFirstWordFallback) return null;
+ const word = extractFirstWord(s.slice(pos));
+ if (!word) return null;
+ return escapeSignatureLiteral(word);
+ }
+ return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
+ };
+ const processLine = (line, pageId, tokenPriority, opts, acc) => {
+ const trimmed = collapseWhitespace(line);
+ if (trimmed.length < opts.minLineLength) return;
+ if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
+ const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
+ if (!sig) return;
+ const entry = acc.get(sig);
+ if (!entry) acc.set(sig, {
+ count: 1,
+ examples: [{
+ line: trimmed,
+ pageId
+ }]
+ });
+ else {
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push({
+ line: trimmed,
+ pageId
+ });
+ }
+ };
+ const processPage = (page, tokenPriority, opts, acc) => {
+ for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
  };
  /**
  * Analyze pages and return the most common line-start patterns (top K).
- *
- * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
- * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
  */
  const analyzeCommonLineStarts = (pages, options = {}) => {
- const o = {
- ...DEFAULT_OPTIONS,
- ...options,
- lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
- prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
- whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
- };
+ const opts = resolveOptions$1(options);
  const tokenPriority = buildTokenPriority();
- const counts = /* @__PURE__ */ new Map();
- for (const page of pages) {
- const lines = normalizeLineEndings(page.content ?? "").split("\n");
- for (const line of lines) {
- const trimmed = collapseWhitespace(line);
- if (trimmed.length < o.minLineLength) continue;
- if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
- const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers, o.whitespace);
- if (!sig) continue;
- const existing = counts.get(sig);
- if (!existing) counts.set(sig, {
- count: 1,
- examples: [{
- line: trimmed,
- pageId: page.id
- }]
- });
- else {
- existing.count++;
- if (existing.examples.length < o.maxExamples) existing.examples.push({
- line: trimmed,
- pageId: page.id
- });
+ const acc = /* @__PURE__ */ new Map();
+ for (const page of pages) processPage(page, tokenPriority, opts, acc);
+ const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
+ return [...acc.entries()].map(([pattern, v]) => ({
+ count: v.count,
+ examples: v.examples,
+ pattern
+ })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
+ };
+
+ //#endregion
+ //#region src/analysis/repeating-sequences.ts
+ const resolveOptions = (options) => {
+ const minElements = Math.max(1, options?.minElements ?? 1);
+ return {
+ contextChars: options?.contextChars ?? 50,
+ maxElements: Math.max(minElements, options?.maxElements ?? 3),
+ maxExamples: options?.maxExamples ?? 3,
+ maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
+ minCount: Math.max(1, options?.minCount ?? 3),
+ minElements,
+ normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
+ requireToken: options?.requireToken ?? true,
+ topK: Math.max(1, options?.topK ?? 20),
+ whitespace: options?.whitespace ?? "regex"
+ };
+ };
+ /** Creates a cursor that tracks position in both normalized and raw text */
+ const createRawCursor = (text, normalize) => {
+ let rawPos = 0;
+ return {
+ advance(normalizedLen) {
+ if (!normalize) {
+ const chunk = text.slice(rawPos, rawPos + normalizedLen);
+ rawPos += normalizedLen;
+ return chunk;
+ }
+ const start = rawPos;
+ let matchedLen = 0;
+ while (matchedLen < normalizedLen && rawPos < text.length) {
+ if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
+ rawPos++;
  }
+ while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
+ return text.slice(start, rawPos);
+ },
+ get pos() {
+ return rawPos;
+ }
+ };
+ };
+ /** Scans text and produces a stream of tokens and literals. */
+ const tokenizeContent = (text, normalize) => {
+ const normalized = normalize ? stripArabicDiacritics(text) : text;
+ const compiled = compileTokenRegexes(buildTokenPriority());
+ const cursor = createRawCursor(text, normalize);
+ const items = [];
+ let pos = 0;
+ while (pos < normalized.length) {
+ const ws = /^\s+/u.exec(normalized.slice(pos));
+ if (ws) {
+ pos += ws[0].length;
+ cursor.advance(ws[0].length);
+ continue;
+ }
+ const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
+ if (token) {
+ const raw = cursor.advance(token.text.length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: `{{${token.token}}}`,
+ type: "token"
+ });
+ pos += token.text.length;
+ continue;
+ }
+ if (isCommonDelimiter(normalized[pos])) {
+ const raw = cursor.advance(1);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - 1,
+ text: escapeSignatureLiteral(normalized[pos]),
+ type: "literal"
+ });
+ pos++;
+ continue;
+ }
+ const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
+ if (word) {
+ const raw = cursor.advance(word[0].length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: escapeSignatureLiteral(word[0]),
+ type: "literal"
+ });
+ pos += word[0].length;
+ continue;
  }
+ cursor.advance(1);
+ pos++;
  }
- const compareSpecificityThenCount = (a, b) => {
- const sa = computeSpecificity(a.pattern);
- const sb = computeSpecificity(b.pattern);
- if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
- if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
- if (b.count !== a.count) return b.count - a.count;
- return a.pattern.localeCompare(b.pattern);
+ return items;
+ };
+ /** Build pattern string from window items */
+ const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
+ /** Check if window contains at least one token */
+ const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
+ /** Compute token count and literal length for a window */
+ const computeWindowStats = (window) => {
+ let tokenCount = 0, literalLen = 0;
+ for (const item of window) if (item.type === "token") tokenCount++;
+ else literalLen += item.text.length;
+ return {
+ literalLen,
+ tokenCount
  };
- const compareCountThenSpecificity = (a, b) => {
- if (b.count !== a.count) return b.count - a.count;
- return compareSpecificityThenCount(a, b);
+ };
+ /** Build example from page content and window */
+ const buildExample = (page, window, contextChars) => {
+ const start = window[0].start;
+ const end = window.at(-1).end;
+ const ctxStart = Math.max(0, start - contextChars);
+ const ctxEnd = Math.min(page.content.length, end + contextChars);
+ return {
+ context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
+ pageId: page.id,
+ startIndices: window.map((w) => w.start),
+ text: page.content.slice(start, end)
  };
- return [...counts.entries()].map(([pattern, v]) => ({
- count: v.count,
- examples: v.examples,
+ };
+ /** Extract N-grams from a single page */
+ const extractPageNgrams = (page, items, opts, stats) => {
+ for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
+ const window = items.slice(i, i + n);
+ if (opts.requireToken && !hasTokenInWindow(window)) continue;
+ const pattern = buildPattern(window, opts.whitespace);
+ if (!stats.has(pattern)) {
+ if (stats.size >= opts.maxUniquePatterns) continue;
+ stats.set(pattern, {
+ count: 0,
+ examples: [],
+ ...computeWindowStats(window)
+ });
+ }
+ const entry = stats.get(pattern);
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+ }
+ };
+ /**
+ * Analyze pages for commonly repeating word sequences.
+ *
+ * Use for continuous text without line breaks. For line-based analysis,
+ * use `analyzeCommonLineStarts()` instead.
+ */
+ const analyzeRepeatingSequences = (pages, options) => {
+ const opts = resolveOptions(options);
+ const stats = /* @__PURE__ */ new Map();
+ for (const page of pages) {
+ if (!page.content) continue;
+ extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
+ }
+ return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
+ count: s.count,
+ examples: s.examples,
  pattern
- })).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+ }));
  };

  //#endregion
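
Editor's note: the new src/analysis/repeating-sequences.ts region shares its token matcher with analyzeCommonLineStarts but windows over n-grams of tokens and literals rather than line prefixes, as its doc comment says. A hedged usage sketch, assuming the package is imported under its registry name (the page data and option values are illustrative; option names and the result shape follow resolveOptions and buildExample above):

import { analyzeCommonLineStarts, analyzeRepeatingSequences } from "flappa-doormal";

const pages = [
  { id: 1, content: "## Chapter one\nbody...\n## Chapter two\nbody..." },
  { id: 2, content: "## Chapter three\nbody..." },
];

// line-based analysis: most common line-start signatures
const lineStarts = analyzeCommonLineStarts(pages, { minCount: 2, topK: 10 });

// continuous-text analysis: repeating token/literal n-grams
const sequences = analyzeRepeatingSequences(pages, { minCount: 2, maxElements: 3 });
// each entry: { pattern, count, examples: [{ pageId, text, context, startIndices }] }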
@@ -2831,5 +3013,524 @@ const analyzeTextForRule = (text) => {
  };

  //#endregion
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
+ //#region src/recovery.ts
+ const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
+ const normalizeForCompare = (s, mode) => {
+ if (mode === "none") return s;
+ let out = s;
+ if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
+ out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
+ return out;
+ };
+ const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
+ const buildFixedOptions = (options, selectedRuleIndices) => {
+ const fixedRules = (options.rules ?? []).map((r, idx) => {
+ if (!selectedRuleIndices.has(idx)) return r;
+ if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
+ const { lineStartsAfter, ...rest } = r;
+ return {
+ ...rest,
+ lineStartsWith: lineStartsAfter
+ };
+ });
+ return {
+ ...options,
+ rules: fixedRules
+ };
+ };
+ const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
+ const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
+ const parts = [];
+ for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
+ const matchContent = parts.join("\n");
+ if (pageJoiner === "newline") return {
+ matchContent,
+ outputContent: matchContent
+ };
+ return {
+ matchContent,
+ outputContent: parts.join(" ")
+ };
+ };
+ const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
+ const rules = options.rules ?? [];
+ const compiled = [];
+ for (const idx of selectedRuleIndices) {
+ const r = rules[idx];
+ if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+ const { lineStartsAfter, ...rest } = r;
+ const built = buildRuleRegex({
+ ...rest,
+ lineStartsWith: lineStartsAfter
+ });
+ compiled.push({
+ ruleIndex: idx,
+ startsWithRegex: new RegExp(built.regex.source, "mu")
+ });
+ }
+ return compiled;
+ };
+ const findUniqueAnchorPos = (outputContent, segmentContent) => {
+ for (const len of [
+ 80,
+ 60,
+ 40,
+ 30,
+ 20,
+ 15
+ ]) {
+ const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
+ if (!needle.trim()) continue;
+ const first = outputContent.indexOf(needle);
+ if (first === -1) continue;
+ if (outputContent.indexOf(needle, first + 1) === -1) return first;
+ }
+ return null;
+ };
+ const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
+ const line = matchContent.slice(lineStart);
+ for (const mr of compiledMistaken) {
+ mr.startsWithRegex.lastIndex = 0;
+ const m = mr.startsWithRegex.exec(line);
+ if (!m || m.index !== 0) continue;
+ const markerMatch = m[0];
+ const markerEnd = lineStart + markerMatch.length;
+ if (anchorPos < markerEnd) continue;
+ const gap = matchContent.slice(markerEnd, anchorPos);
+ const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
+ if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
+ return { prefix: recoveredPrefix };
+ }
+ return { reason: "no selected marker pattern matched at anchored line start" };
+ };
+ const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
+ const fromIdx = pageIdToIndex.get(segment.from);
+ const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
+ if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
+ kind: "unresolved",
+ reason: "segment page range not found in pages"
+ };
+ const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
+ if (!segment.content) return {
+ kind: "unresolved",
+ reason: "empty segment content"
+ };
+ const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
+ if (anchorPos === null) return {
+ kind: "unresolved",
+ reason: "could not uniquely anchor segment content in page range"
+ };
+ const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
+ const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
+ if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
+ kind: "unresolved",
+ reason: found.reason
+ };
+ return {
+ kind: "recovered",
+ recoveredContent: `${found.prefix}${segment.content}`,
+ recoveredPrefix: found.prefix
+ };
+ };
+ const resolveRuleIndicesSelector = (rules, indicesIn) => {
+ const errors = [];
+ const indices = /* @__PURE__ */ new Set();
+ for (const idx of indicesIn) {
+ if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
+ errors.push(`Selector index out of range: ${idx}`);
+ continue;
+ }
+ const rule = rules[idx];
+ if (!rule || !("lineStartsAfter" in rule)) {
+ errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
+ continue;
+ }
+ indices.add(idx);
+ }
+ return {
+ errors,
+ indices,
+ warnings: []
+ };
+ };
+ const resolvePredicateSelector = (rules, predicate) => {
+ const errors = [];
+ const warnings = [];
+ const indices = /* @__PURE__ */ new Set();
+ rules.forEach((r, i) => {
+ try {
+ if (!predicate(r, i)) return;
+ if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
+ indices.add(i);
+ return;
+ }
+ warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
+ } catch (e) {
+ const msg = e instanceof Error ? e.message : String(e);
+ errors.push(`Predicate threw at rule ${i}: ${msg}`);
+ }
+ });
+ if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
+ return {
+ errors,
+ indices,
+ warnings
+ };
+ };
+ const resolvePatternsSelector = (rules, patterns, matchMode) => {
+ const errors = [];
+ const warnings = [];
+ const indices = /* @__PURE__ */ new Set();
+ const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
+ const targets = patterns.map(normalizePattern);
+ for (let pi = 0; pi < patterns.length; pi++) {
+ const rawPattern = patterns[pi];
+ const pat = targets[pi];
+ const matched = [];
+ for (let i = 0; i < rules.length; i++) {
+ const r = rules[i];
+ if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+ if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
+ }
+ if (matched.length === 0) {
+ errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
+ continue;
+ }
+ if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
+ matched.forEach((i) => {
+ indices.add(i);
+ });
+ }
+ return {
+ errors,
+ indices,
+ warnings
+ };
+ };
+ const resolveSelectorToRuleIndices = (options, selector) => {
+ const rules = options.rules ?? [];
+ if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
+ if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
+ return resolvePatternsSelector(rules, selector.patterns, selector.match);
+ };
+ const longestCommonSuffixLength = (a, b) => {
+ const max = Math.min(a.length, b.length);
+ let i = 0;
+ while (i < max) {
+ if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
+ i++;
+ }
+ return i;
+ };
+ const AMBIGUITY_SCORE_GAP = 5;
+ const scoreCandidate = (orig, fixed, normalizeMode) => {
+ if (fixed.content === orig.content) return {
+ fixedIndex: -1,
+ kind: "exact",
+ score: 100
+ };
+ if (fixed.content.endsWith(orig.content)) {
+ const markerLen = fixed.content.length - orig.content.length;
+ return {
+ fixedIndex: -1,
+ kind: "exact_suffix",
+ score: 90 + Math.min(30, markerLen)
+ };
+ }
+ if (normalizeMode !== "none") {
+ const normFixed = normalizeForCompare(fixed.content, normalizeMode);
+ const normOrig = normalizeForCompare(orig.content, normalizeMode);
+ if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
+ const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
+ return {
+ fixedIndex: -1,
+ kind: "normalized_suffix",
+ score: 70 + Math.floor(overlap * 20)
+ };
+ }
+ }
+ return null;
+ };
+ const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
+ const warnings = [...reportBase.warnings];
+ warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
+ const details = segments.map((s, i) => {
+ const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
+ return {
+ from: s.from,
+ notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
+ originalStartPreview: preview(s.content),
+ segmentIndex: i,
+ status,
+ strategy: "none",
+ to: s.to
+ };
+ });
+ return {
+ report: {
+ ...reportBase,
+ details,
+ summary: {
+ mode,
+ recovered: 0,
+ totalSegments: segments.length,
+ unchanged: segments.length,
+ unresolved: selectorErrors.length ? segments.length : 0
+ },
+ warnings
+ },
+ segments
+ };
+ };
+ const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
+ const recoveredAtIndex = /* @__PURE__ */ new Map();
+ const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
+ if (mode !== "best_effort_then_rerun") return {
+ recoveredAtIndex,
+ recoveredDetailAtIndex
+ };
+ const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
+ const pageIdToIndex = buildPageIdToIndex(processedPages);
+ const pageJoiner = options.pageJoiner ?? "space";
+ const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
+ for (let i = 0; i < segments.length; i++) {
+ const orig = segments[i];
+ const r = tryBestEffortRecoverOneSegment(orig, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
+ if (r.kind !== "recovered") continue;
+ const seg = {
+ ...orig,
+ content: r.recoveredContent
+ };
+ recoveredAtIndex.set(i, seg);
+ recoveredDetailAtIndex.set(i, {
+ from: orig.from,
+ originalStartPreview: preview(orig.content),
+ recoveredPrefixPreview: preview(r.recoveredPrefix),
+ recoveredStartPreview: preview(seg.content),
+ segmentIndex: i,
+ status: "recovered",
+ strategy: "stage1",
+ to: orig.to
+ });
+ }
+ return {
+ recoveredAtIndex,
+ recoveredDetailAtIndex
+ };
+ };
+ const buildFixedBuckets = (fixedSegments) => {
+ const buckets = /* @__PURE__ */ new Map();
+ for (let i = 0; i < fixedSegments.length; i++) {
+ const k = segmentRangeKey(fixedSegments[i]);
+ const arr = buckets.get(k);
+ if (!arr) buckets.set(k, [i]);
+ else arr.push(i);
+ }
+ return buckets;
+ };
+ const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
+ let best = null;
+ let secondBestScore = -Infinity;
+ for (const fixedIdx of candidates) {
+ if (usedFixed.has(fixedIdx)) continue;
+ const fixed = fixedSegments[fixedIdx];
+ const scored = scoreCandidate(orig, fixed, normalizeCompare);
+ if (!scored) continue;
+ const candidateScore = scored.score;
+ if (!best || candidateScore > best.score) {
+ secondBestScore = best?.score ?? -Infinity;
+ best = {
+ fixedIdx,
+ score: candidateScore
+ };
+ } else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
+ }
+ if (!best) return { kind: "none" };
+ if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
+ return {
+ fixedIdx: best.fixedIdx,
+ kind: "match"
+ };
+ };
+ const detailUnresolved = (orig, segmentIndex, notes) => ({
+ from: orig.from,
+ notes,
+ originalStartPreview: preview(orig.content),
+ segmentIndex,
+ status: "unresolved_alignment",
+ strategy: "rerun",
+ to: orig.to
+ });
+ const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
+ from: orig.from,
+ notes,
+ originalStartPreview: preview(orig.content),
+ segmentIndex,
+ status: "skipped_idempotent",
+ strategy: "rerun",
+ to: orig.to
+ });
+ const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
+ let recoveredPrefixPreview;
+ if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
+ return {
+ from: orig.from,
+ originalStartPreview: preview(orig.content),
+ recoveredPrefixPreview,
+ recoveredStartPreview: preview(fixed.content),
+ segmentIndex,
+ status: "recovered",
+ strategy: "rerun",
+ to: orig.to
+ };
+ };
+ const mergeWithRerun = (params) => {
+ const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
+ const usedFixed = /* @__PURE__ */ new Set();
+ const out = [];
+ const details = [];
+ let recovered = 0;
+ let unresolved = 0;
+ let unchanged = 0;
+ for (let i = 0; i < originalSegments.length; i++) {
+ const stage1Recovered = stage1RecoveredAtIndex.get(i);
+ if (stage1Recovered) {
+ out.push(stage1Recovered);
+ recovered++;
+ details.push(recoveredDetailAtIndex.get(i) ?? {
+ from: stage1Recovered.from,
+ originalStartPreview: preview(originalSegments[i].content),
+ recoveredStartPreview: preview(stage1Recovered.content),
+ segmentIndex: i,
+ status: "recovered",
+ strategy: "stage1",
+ to: stage1Recovered.to
+ });
+ continue;
+ }
+ const orig = originalSegments[i];
+ const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
+ if (best.kind === "none") {
+ out.push(orig);
+ unresolved++;
+ details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
+ continue;
+ }
+ if (best.kind === "ambiguous") {
+ out.push(orig);
+ unresolved++;
+ details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
+ continue;
+ }
+ usedFixed.add(best.fixedIdx);
+ const fixed = fixedSegments[best.fixedIdx];
+ if (fixed.content === orig.content) {
+ out.push(orig);
+ unchanged++;
+ details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
+ continue;
+ }
+ out.push({
+ ...orig,
+ content: fixed.content
+ });
+ recovered++;
+ details.push(detailRecoveredRerun(orig, fixed, i));
+ }
+ return {
+ details,
+ segments: out,
+ summary: {
+ recovered,
+ unchanged,
+ unresolved
+ }
+ };
+ };
+ function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
+ const mode = opts?.mode ?? "rerun_only";
+ const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
+ const resolved = resolveSelectorToRuleIndices(options, selector);
+ const reportBase = {
+ byRun: void 0,
+ errors: resolved.errors,
+ warnings: resolved.warnings
+ };
+ if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
+ const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
+ const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
+ const merged = mergeWithRerun({
+ fixedBuckets: buildFixedBuckets(fixedSegments),
+ fixedSegments,
+ normalizeCompare,
+ originalSegments: segments,
+ recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
+ stage1RecoveredAtIndex: stage1.recoveredAtIndex
+ });
+ return {
+ report: {
+ ...reportBase,
+ details: merged.details,
+ summary: {
+ mode,
+ recovered: merged.summary.recovered,
+ totalSegments: segments.length,
+ unchanged: merged.summary.unchanged,
+ unresolved: merged.summary.unresolved
+ }
+ },
+ segments: merged.segments
+ };
+ }
+ function recoverMistakenMarkersForRuns(runs, opts) {
+ const allSegments = [];
+ const byRun = [];
+ const details = [];
+ const warnings = [];
+ const errors = [];
+ let recovered = 0;
+ let unchanged = 0;
+ let unresolved = 0;
+ let offset = 0;
+ for (let i = 0; i < runs.length; i++) {
+ const run = runs[i];
+ const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
+ allSegments.push(...res.segments);
+ for (const d of res.report.details) details.push({
+ ...d,
+ segmentIndex: d.segmentIndex + offset
+ });
+ offset += run.segments.length;
+ recovered += res.report.summary.recovered;
+ unchanged += res.report.summary.unchanged;
+ unresolved += res.report.summary.unresolved;
+ warnings.push(...res.report.warnings);
+ errors.push(...res.report.errors);
+ byRun.push({
+ recovered: res.report.summary.recovered,
+ runIndex: i,
+ totalSegments: run.segments.length,
+ unresolved: res.report.summary.unresolved
+ });
+ }
+ return {
+ report: {
+ byRun,
+ details,
+ errors,
+ summary: {
+ mode: opts?.mode ?? "rerun_only",
+ recovered,
+ totalSegments: offset,
+ unchanged,
+ unresolved
+ },
+ warnings
+ },
+ segments: allSegments
+ };
+ }
+
+ //#endregion
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
  //# sourceMappingURL=index.mjs.map
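
Editor's note: the new src/recovery.ts region exports two entry points for repairing segments produced with lineStartsAfter rules that should have been lineStartsWith (the marker text was dropped from segment starts). A hedged usage sketch; the rule, page, and selector shapes here are inferred from the bundled source above, not from package documentation:

import { recoverMistakenLineStartsAfterMarkers, segmentPages } from "flappa-doormal";

const pages = [{ id: 1, content: "### Chapter one\nbody text" }];
const options = { rules: [{ lineStartsAfter: ["### "] }] };
const segments = segmentPages(pages, options);

const { segments: repaired, report } = recoverMistakenLineStartsAfterMarkers(
  pages,
  segments,
  options,
  { type: "rule_indices", indices: [0] }, // or a patterns/predicate selector
  { mode: "best_effort_then_rerun" }, // default mode is "rerun_only"
);
console.log(report.summary); // { mode, recovered, totalSegments, unchanged, unresolved }

recoverMistakenMarkersForRuns batches the same repair over multiple { pages, segments, options, selector } runs and merges the per-run reports.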