flappa-doormal 2.7.0 → 2.9.0

This diff compares the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
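The most visible additions in this range are an opt-in `debug` option on `segmentPages()` (each emitted segment's `meta` gains a namespaced object, default key "_flappa", recording the rule and/or breakpoint that produced it), a new `analyzeRepeatingSequences()` analysis helper, and a new `src/recovery.ts` module for recovering segments produced by mistaken `lineStartsAfter` rules. A minimal usage sketch of the debug option, with pages and rule patterns invented for illustration:

    import { segmentPages } from "flappa-doormal";

    // Hypothetical input; the debug shapes below are read off this diff.
    const pages = [{ id: 1, content: "CHAPTER one ..." }, { id: 2, content: "CHAPTER two ..." }];
    const segments = segmentPages(pages, {
      rules: [{ lineStartsWith: ["CHAPTER"] }],
      debug: true, // or { metaKey: "_flappa", include: ["rule", "breakpoint"] }
    });
    // A segment produced by rule 0 would then carry:
    // segments[0].meta._flappa => { rule: { index: 0, patternType: "lineStartsWith" } }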
package/dist/index.mjs CHANGED
@@ -645,6 +645,10 @@ const buildBareTokenRegex = () => {
   * Validates a single pattern for common issues.
   */
  const validatePattern = (pattern, seenPatterns) => {
+ if (!pattern.trim()) return {
+ message: "Empty pattern is not allowed",
+ type: "empty_pattern"
+ };
  if (seenPatterns.has(pattern)) return {
  message: `Duplicate pattern: "${pattern}"`,
  type: "duplicate"
@@ -727,7 +731,7 @@ const validateRules = (rules) => {
  hasIssues = true;
  }
  }
- if ("template" in rule && rule.template) {
+ if ("template" in rule && rule.template !== void 0) {
  const seenPatterns = /* @__PURE__ */ new Set();
  const issue = validatePattern(rule.template, seenPatterns);
  if (issue) {
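Behavioral note: `validatePattern` now rejects empty or whitespace-only patterns up front, and the `template` guard switches from a truthiness test to `!== void 0`, so an empty-string template is validated (and flagged) instead of being silently skipped. A hedged sketch (the full return shape of `validateRules` is not shown in this excerpt):

    import { validateRules } from "flappa-doormal";

    // An empty template previously passed the old `rule.template` truthiness
    // check unexamined; it now surfaces an issue of type "empty_pattern".
    validateRules([{ template: "" }]);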
@@ -1245,16 +1249,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
  */
  const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
  const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
- for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
+ for (let i = 0; i < expandedBreakpoints.length; i++) {
+ const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
  if (skipWhenRegex?.test(remainingContent)) continue;
- if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
+ if (regex === null) return {
+ breakpointIndex: i,
+ breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+ rule
+ };
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
- if (breakPos > 0) return breakPos;
+ if (breakPos > 0) return {
+ breakpointIndex: i,
+ breakPos,
+ rule
+ };
  }
- return -1;
+ return null;
+ };
+
+ //#endregion
+ //#region src/segmentation/debug-meta.ts
+ const resolveDebugConfig = (debug) => {
+ if (!debug) return null;
+ if (debug === true) return {
+ includeBreakpoint: true,
+ includeRule: true,
+ metaKey: "_flappa"
+ };
+ if (typeof debug !== "object") return null;
+ const metaKey = debug.metaKey;
+ const include = debug.include;
+ const includeRule = Array.isArray(include) ? include.includes("rule") : true;
+ return {
+ includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
+ includeRule,
+ metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
+ };
+ };
+ const getRulePatternType = (rule) => {
+ if ("lineStartsWith" in rule) return "lineStartsWith";
+ if ("lineStartsAfter" in rule) return "lineStartsAfter";
+ if ("lineEndsWith" in rule) return "lineEndsWith";
+ if ("template" in rule) return "template";
+ return "regex";
+ };
+ const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
+ const mergeDebugIntoMeta = (meta, metaKey, patch) => {
+ const out = meta ? { ...meta } : {};
+ const existing = out[metaKey];
+ out[metaKey] = {
+ ...isPlainObject(existing) ? existing : {},
+ ...patch
+ };
+ return out;
  };
+ const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
+ index: ruleIndex,
+ patternType: getRulePatternType(rule)
+ } });
+ const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
+ index: breakpointIndex,
+ kind: rule.pattern === "" ? "pageBoundary" : "pattern",
+ pattern: rule.pattern
+ } });

  //#endregion
  //#region src/segmentation/breakpoint-processor.ts
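The new debug-meta region is self-contained, so its behavior can be read directly off the added lines; a worked sketch:

    // resolveDebugConfig normalizes the user-facing `debug` option:
    resolveDebugConfig(true);
    // => { includeBreakpoint: true, includeRule: true, metaKey: "_flappa" }
    resolveDebugConfig({ metaKey: "dbg", include: ["rule"] });
    // => { includeBreakpoint: false, includeRule: true, metaKey: "dbg" }

    // mergeDebugIntoMeta shallow-copies the segment meta and merges a patch
    // under the namespace key, preserving anything already stored there:
    mergeDebugIntoMeta({ title: "intro" }, "_flappa", buildRuleDebugPatch(2, { template: "{{bab}}" }));
    // => { title: "intro", _flappa: { rule: { index: 2, patternType: "template" } } }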
@@ -1338,20 +1397,25 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
  const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
  if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
  const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
- if (exclusionBreak > 0) return exclusionBreak;
+ if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
  }
- const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+ const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
  expandedBreakpoints,
  normalizedPages,
  pageIds,
  prefer
  });
- return patternBreak > 0 ? patternBreak : windowEndPosition;
+ if (patternMatch && patternMatch.breakPos > 0) return {
+ breakOffset: patternMatch.breakPos,
+ breakpointIndex: patternMatch.breakpointIndex,
+ breakpointRule: patternMatch.rule
+ };
+ return { breakOffset: windowEndPosition };
  };
  /**
  * Advances cursor position past any leading whitespace.
  */
- const skipWhitespace = (content, startPos) => {
+ const skipWhitespace$1 = (content, startPos) => {
  let pos = startPos;
  while (pos < content.length && /\s/.test(content[pos])) pos++;
  return pos;
@@ -1362,12 +1426,13 @@ const skipWhitespace = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
  const result = [];
  const fullContent = segment.content;
  let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
+ let lastBreakpoint = null;
  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
  logger?.debug?.("[breakpoints] boundaryPositions built", {
  boundaryPositions,
@@ -1382,7 +1447,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
  if (finalSeg) result.push(finalSeg);
  break;
  }
@@ -1393,8 +1460,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  cursorPos,
  windowEndIdx
  });
- const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
- const breakPos = cursorPos + breakOffset;
+ const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+ if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
+ breakpointIndex: found.breakpointIndex,
+ rule: found.breakpointRule
+ };
+ const breakPos = cursorPos + found.breakOffset;
  const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
  const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
  logger?.trace?.("[breakpoints] piece", {
@@ -1403,10 +1474,11 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  pieceLength: pieceContent.length
  });
  if (pieceContent) {
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
  if (pieceSeg) result.push(pieceSeg);
  }
- cursorPos = skipWhitespace(fullContent, breakPos);
+ cursorPos = skipWhitespace$1(fullContent, breakPos);
  currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
  isFirstPiece = false;
  }
@@ -1418,7 +1490,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
  const pageIds = pages.map((p) => p.id);
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1518,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  result.push(segment);
  continue;
  }
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
  result.push(...broken.map((s) => {
  const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
  const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -1959,6 +2031,129 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
  return splitPointsByRule;
  };

+ //#endregion
+ //#region src/segmentation/split-point-helpers.ts
+ /**
+ * Helper module for collectSplitPointsFromRules to reduce complexity.
+ * Handles combined regex matching and split point creation.
+ */
+ const MAX_REGEX_ITERATIONS = 1e5;
+ const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
+ const result = {};
+ if (!groups) return result;
+ for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
+ return result;
+ };
+ const buildContentOffsets = (match, ruleInfo) => {
+ if (!ruleInfo.usesLineStartsAfter) return {};
+ const captured = match.groups?.[`${ruleInfo.prefix}__content`];
+ if (captured === void 0) return {};
+ return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
+ };
+ const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
+ const createSplitPointFromMatch = (match, rule, ruleInfo) => {
+ const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
+ const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
+ return {
+ capturedContent: void 0,
+ contentStartOffset,
+ index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
+ meta: rule.meta,
+ namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+ };
+ };
+ const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
+ const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+ const combinedRegex = new RegExp(combinedSource, "gm");
+ logger?.debug?.("[segmenter] combined regex built", {
+ combinableRuleCount: combinableRules.length,
+ combinedSourceLength: combinedSource.length
+ });
+ let m = combinedRegex.exec(matchContent);
+ let iterations = 0;
+ while (m !== null) {
+ iterations++;
+ if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
+ if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
+ iterations,
+ position: m.index
+ });
+ const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+ if (matchedIndex !== -1) {
+ const { rule, index: originalIndex } = combinableRules[matchedIndex];
+ const ruleInfo = ruleRegexes[matchedIndex];
+ if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
+ const sp = createSplitPointFromMatch(m, rule, ruleInfo);
+ if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+ splitPointsByRule.get(originalIndex).push(sp);
+ }
+ }
+ if (m[0].length === 0) combinedRegex.lastIndex++;
+ m = combinedRegex.exec(matchContent);
+ }
+ };
+ const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
+ const built = buildRuleRegex(rule, prefix);
+ return {
+ ...built,
+ prefix,
+ source: `(?<${prefix}>${built.regex.source})`
+ };
+ });
+ const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
+ const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
+ const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
+ const isLSA = usesLineStartsAfter && m.captured !== void 0;
+ const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
+ return {
+ capturedContent: isLSA ? void 0 : m.captured,
+ contentStartOffset: isLSA ? markerLen : void 0,
+ index: (rule.split ?? "at") === "at" ? m.start : m.end,
+ meta: rule.meta,
+ namedCaptures: m.namedCaptures
+ };
+ });
+ if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+ splitPointsByRule.get(ruleIndex).push(...points);
+ };
+ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
+ const matches = [];
+ let m = regex.exec(content);
+ while (m !== null) {
+ matches.push({
+ captured: usesCapture ? getLastPositionalCapture(m) : void 0,
+ end: m.index + m[0].length,
+ namedCaptures: extractNamedCaptures(m.groups, captureNames),
+ start: m.index
+ });
+ if (m[0].length === 0) regex.lastIndex++;
+ m = regex.exec(content);
+ }
+ return matches;
+ };
+ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
+ const result = [];
+ rules.forEach((rule, index) => {
+ const points = splitPointsByRule.get(index);
+ if (!points?.length) return;
+ const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
+ if (!debugMetaKey) {
+ result.push(...filtered.map((p) => ({
+ ...p,
+ ruleIndex: index
+ })));
+ return;
+ }
+ const debugPatch = buildRuleDebugPatch(index, rule);
+ result.push(...filtered.map((p) => ({
+ ...p,
+ meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
+ ruleIndex: index
+ })));
+ });
+ return result;
+ };
+
  //#endregion
  //#region src/segmentation/textUtils.ts
  /**
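The extraction into `split-point-helpers.ts` keeps the original technique: each combinable rule's regex is wrapped in a uniquely prefixed named group, the sources are OR-ed into one `gm` regex, and the matching rule is identified by which named group is defined in the match; zero-length matches bump `lastIndex` to avoid stalls, with a hard cap of `MAX_REGEX_ITERATIONS` (1e5). A stripped-down illustration of the pattern (group names invented):

    const sources = ["(?<r0>^CHAPTER)", "(?<r1>^SECTION)"];
    const combined = new RegExp(sources.join("|"), "gm");
    let m;
    while ((m = combined.exec("CHAPTER one\nSECTION two")) !== null) {
      const which = m.groups.r0 !== undefined ? 0 : 1;
      console.log(which, m.index); // logs "0 0", then "1 12"
      if (m[0].length === 0) combined.lastIndex++; // zero-length-match guard
    }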
@@ -1985,7 +2180,6 @@ const normalizeLineEndings = (content) => {
  *
  * @module segmenter
  */
- const MAX_REGEX_ITERATIONS = 1e5;
  /**
  * Builds a concatenated content string and page mapping from input pages.
  *
@@ -2082,7 +2276,7 @@ const dedupeSplitPoints = (splitPoints) => {
  const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
  if (segments.length > 0 || pages.length === 0) return segments;
  const firstPage = pages[0];
- const lastPage = pages[pages.length - 1];
+ const lastPage = pages.at(-1);
  const joinChar = pageJoiner === "newline" ? "\n" : " ";
  const allContent = normalizedContent.join(joinChar).trim();
  if (!allContent) return segments;
@@ -2093,7 +2287,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
  return [initialSeg];
  };
- const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
  logger?.debug?.("[segmenter] collecting split points from rules", {
  contentLength: matchContent.length,
  ruleCount: rules.length
@@ -2106,124 +2300,9 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
  standaloneCount: standaloneRules.length
  });
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
- if (combinableRules.length > 0) {
- const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
- const built = buildRuleRegex(rule, prefix);
- return {
- prefix,
- source: `(?<${prefix}>${built.regex.source})`,
- ...built
- };
- });
- const combinedSource = ruleRegexes.map((r) => r.source).join("|");
- const combinedRegex = new RegExp(combinedSource, "gm");
- logger?.debug?.("[segmenter] combined regex built", {
- combinableRuleCount: combinableRules.length,
- combinedSourceLength: combinedSource.length
- });
- combinedRegex.lastIndex = 0;
- let m = combinedRegex.exec(matchContent);
- let iterationCount = 0;
- while (m !== null) {
- iterationCount++;
- if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
- if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
- iterationCount,
- lastIndex: combinedRegex.lastIndex,
- matchLength: m[0].length,
- matchPosition: m.index
- });
- const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
- if (matchedRuleIndex !== -1) {
- const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
- const ruleInfo = ruleRegexes[matchedRuleIndex];
- const namedCaptures = {};
- if (m.groups) {
- for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
- const cleanName = prefixedName.slice(prefix.length);
- namedCaptures[cleanName] = m.groups[prefixedName];
- }
- }
- let capturedContent;
- let contentStartOffset;
- if (ruleInfo.usesLineStartsAfter) {
- capturedContent = m.groups?.[`${prefix}__content`];
- if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
- }
- const start = m.index;
- const end = m.index + m[0].length;
- const pageId = pageMap.getId(start);
- if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
- const sp = {
- capturedContent: void 0,
- contentStartOffset,
- index: (rule.split ?? "at") === "at" ? start : end,
- meta: rule.meta,
- namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
- };
- if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
- splitPointsByRule.get(originalIndex).push(sp);
- }
- }
- if (m[0].length === 0) combinedRegex.lastIndex++;
- m = combinedRegex.exec(matchContent);
- }
- }
- const collectSplitPointsFromRule = (rule, ruleIndex) => {
- const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
- const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
- const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
- return {
- capturedContent: isLineStartsAfter ? void 0 : m.captured,
- contentStartOffset: isLineStartsAfter ? markerLength : void 0,
- index: (rule.split ?? "at") === "at" ? m.start : m.end,
- meta: rule.meta,
- namedCaptures: m.namedCaptures
- };
- });
- if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
- splitPointsByRule.get(ruleIndex).push(...points);
- };
- standaloneRules.forEach((rule) => {
- collectSplitPointsFromRule(rule, rules.indexOf(rule));
- });
- const finalSplitPoints = [];
- rules.forEach((rule, index) => {
- const points = splitPointsByRule.get(index);
- if (!points || points.length === 0) return;
- let filtered = points;
- if (rule.occurrence === "first") filtered = [points[0]];
- else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
- finalSplitPoints.push(...filtered);
- });
- return finalSplitPoints;
- };
- /**
- * Executes a regex against content and extracts match results with capture information.
- *
- * @param content - Full content string to search
- * @param regex - Compiled regex with 'g' flag
- * @param usesCapture - Whether to extract captured content
- * @param captureNames - Names of expected named capture groups
- * @returns Array of match results with positions and captures
- */
- const findMatches = (content, regex, usesCapture, captureNames) => {
- const matches = [];
- regex.lastIndex = 0;
- let m = regex.exec(content);
- while (m !== null) {
- const result = {
- end: m.index + m[0].length,
- start: m.index
- };
- result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
- if (usesCapture) result.captured = getLastPositionalCapture(m);
- matches.push(result);
- if (m[0].length === 0) regex.lastIndex++;
- m = regex.exec(content);
- }
- return matches;
+ if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
+ for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+ return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
  };
  /**
  * Finds page breaks within a given offset range using binary search.
@@ -2326,6 +2405,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
  const segmentPages = (pages, options) => {
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+ const debug = resolveDebugConfig(options.debug);
+ const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
  logger?.info?.("[segmenter] starting segmentation", {
  breakpointCount: breakpoints.length,
  maxPages,
@@ -2339,7 +2420,7 @@ const segmentPages = (pages, options) => {
  pageIds: pageMap.pageIds,
  totalContentLength: matchContent.length
  });
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
  const unique = dedupeSplitPoints(splitPoints);
  logger?.debug?.("[segmenter] split points collected", {
  rawSplitPoints: splitPoints.length,
@@ -2358,7 +2439,7 @@ const segmentPages = (pages, options) => {
  if (maxPages >= 0 && breakpoints.length) {
  logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
  const patternProcessor = (p) => processPattern(p, false).pattern;
- const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
  logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
  return result;
  }
@@ -2410,7 +2491,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  const result = [];
  for (let i = 0; i < splitPoints.length; i++) {
  const sp = splitPoints[i];
- const end = i < splitPoints.length - 1 ? splitPoints[i + 1].index : content.length;
+ const end = splitPoints[i + 1]?.index ?? content.length;
  const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
  if (s) result.push(s);
  }
@@ -2434,29 +2515,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };

  //#endregion
- //#region src/analysis.ts
- const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
- const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
- const computeSpecificity = (pattern) => {
- const tokenCount = countTokenMarkers(pattern);
- return {
- literalLen: stripWhitespacePlaceholders(pattern).length,
- tokenCount
- };
- };
- const DEFAULT_OPTIONS = {
- includeFirstWordFallback: true,
- lineFilter: void 0,
- maxExamples: 1,
- minCount: 3,
- minLineLength: 6,
- normalizeArabicDiacritics: true,
- prefixChars: 60,
- prefixMatchers: [/^#+/u],
- sortBy: "specificity",
- topK: 40,
- whitespace: "regex"
- };
+ //#region src/analysis/shared.ts
  const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
  const TOKEN_PRIORITY_ORDER$1 = [
  "basmalah",
@@ -2497,30 +2556,7 @@ const appendWs = (out, mode) => {
  if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
  return out.endsWith("\\s*") ? out : `${out}\\s*`;
  };
- const consumeLeadingPrefixes = (s, pos, out, prefixMatchers, whitespace) => {
- let matchedAny = false;
- let currentPos = pos;
- let currentOut = out;
- for (const re of prefixMatchers) {
- if (currentPos >= s.length) break;
- const m = re.exec(s.slice(currentPos));
- if (!m || m.index !== 0 || !m[0]) continue;
- currentOut += escapeSignatureLiteral(m[0]);
- currentPos += m[0].length;
- matchedAny = true;
- const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
- if (wsAfter) {
- currentPos += wsAfter[0].length;
- currentOut = appendWs(currentOut, whitespace);
- }
- }
- return {
- matchedAny,
- out: currentOut,
- pos: currentPos
- };
- };
- const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
  let best = null;
  for (const { token, re } of compiled) {
  re.lastIndex = pos;
@@ -2534,132 +2570,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
  if (best?.token === "rumuz") {
  const end = pos + best.text.length;
  const next = end < s.length ? s[end] : "";
- if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+ if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
  }
  return best;
  };
- const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers, whitespace) => {
+ const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
+ const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+
+ //#endregion
+ //#region src/analysis/line-starts.ts
+ const resolveOptions$1 = (options = {}) => ({
+ includeFirstWordFallback: options.includeFirstWordFallback ?? true,
+ lineFilter: options.lineFilter,
+ maxExamples: options.maxExamples ?? 1,
+ minCount: options.minCount ?? 3,
+ minLineLength: options.minLineLength ?? 6,
+ normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
+ prefixChars: options.prefixChars ?? 60,
+ prefixMatchers: options.prefixMatchers ?? [/^#+/u],
+ sortBy: options.sortBy ?? "specificity",
+ topK: options.topK ?? 40,
+ whitespace: options.whitespace ?? "regex"
+ });
+ const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+ const computeSpecificity = (pattern) => ({
+ literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
+ tokenCount: countTokenMarkers(pattern)
+ });
+ const compareBySpecificity = (a, b) => {
+ const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
+ return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
+ };
+ const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
+ /** Remove trailing whitespace placeholders */
+ const trimTrailingWs = (out, mode) => {
+ const suffix = mode === "regex" ? "\\s*" : " ";
+ while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+ return out;
+ };
+ /** Try to extract first word for fallback */
+ const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
+ /** Consume prefix matchers at current position */
+ const consumePrefixes = (s, pos, out, matchers, ws) => {
+ let matched = false;
+ for (const re of matchers) {
+ if (pos >= s.length) break;
+ const m = re.exec(s.slice(pos));
+ if (!m?.index && m?.[0]) {
+ out += escapeSignatureLiteral(m[0]);
+ pos += m[0].length;
+ matched = true;
+ const wsm = /^[ \t]+/u.exec(s.slice(pos));
+ if (wsm) {
+ pos += wsm[0].length;
+ out = appendWs(out, ws);
+ }
+ }
+ }
+ return {
+ matched,
+ out,
+ pos
+ };
+ };
+ /** Try to match a token at current position and append to signature */
+ const tryMatchToken = (s, pos, out, compiled) => {
+ const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+ if (!best) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: `${out}{{${best.token}}}`,
+ pos: pos + best.text.length
+ };
+ };
+ /** Try to match a delimiter at current position */
+ const tryMatchDelimiter = (s, pos, out) => {
+ const ch = s[pos];
+ if (!ch || !isCommonDelimiter(ch)) return {
+ matched: false,
+ out,
+ pos
+ };
+ return {
+ matched: true,
+ out: out + escapeSignatureLiteral(ch),
+ pos: pos + 1
+ };
+ };
+ /** Skip whitespace at position */
+ const skipWhitespace = (s, pos, out, ws) => {
+ const m = /^[ \t]+/u.exec(s.slice(pos));
+ if (!m) return {
+ out,
+ pos,
+ skipped: false
+ };
+ return {
+ out: appendWs(out, ws),
+ pos: pos + m[0].length,
+ skipped: true
+ };
+ };
+ const tokenizeLineStart = (line, tokenNames, opts) => {
  const trimmed = collapseWhitespace(line);
  if (!trimmed) return null;
- const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
- let pos = 0;
- let out = "";
- let matchedAny = false;
- let matchedToken = false;
+ const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
  const compiled = compileTokenRegexes(tokenNames);
- const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
- const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
- {
- const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers, whitespace);
- pos = consumed.pos;
- out = consumed.out;
- matchedAny = consumed.matchedAny;
- }
- let tokenSteps = 0;
- while (tokenSteps < 6 && pos < s.length) {
- const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
- if (wsMatch) {
- pos += wsMatch[0].length;
- out = appendWs(out, whitespace);
+ let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
+ const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+ pos = prefix.pos;
+ out = prefix.out;
+ matchedAny = prefix.matched;
+ while (steps < 6 && pos < s.length) {
+ const ws = skipWhitespace(s, pos, out, opts.whitespace);
+ if (ws.skipped) {
+ pos = ws.pos;
+ out = ws.out;
  continue;
  }
- const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
- if (best) {
- if (out && !out.endsWith("\\s*")) {}
- out += `{{${best.token}}}`;
- matchedAny = true;
- matchedToken = true;
- pos += best.text.length;
- tokenSteps++;
+ const tok = tryMatchToken(s, pos, out, compiled);
+ if (tok.matched) {
+ pos = tok.pos;
+ out = tok.out;
+ matchedAny = matchedToken = true;
+ steps++;
  continue;
  }
  if (matchedAny) {
- const ch = s[pos];
- if (ch && isCommonDelimiter(ch)) {
- out += escapeSignatureLiteral(ch);
- pos += 1;
+ const delim = tryMatchDelimiter(s, pos, out);
+ if (delim.matched) {
+ pos = delim.pos;
+ out = delim.out;
  continue;
  }
  }
  if (matchedAny) {
- if (includeFirstWordFallback && !matchedToken) {
- const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord$1) break;
- out += escapeSignatureLiteral(firstWord$1);
- tokenSteps++;
+ if (opts.includeFirstWordFallback && !matchedToken) {
+ const word$1 = extractFirstWord(s.slice(pos));
+ if (word$1) {
+ out += escapeSignatureLiteral(word$1);
+ steps++;
+ }
  }
  break;
  }
- if (!includeFirstWordFallback) return null;
- const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
- if (!firstWord) return null;
- out += escapeSignatureLiteral(firstWord);
- tokenSteps++;
- return out;
- }
- if (!matchedAny) return null;
- if (whitespace === "regex") while (out.endsWith("\\s*")) out = out.slice(0, -3);
- else while (out.endsWith(" ")) out = out.slice(0, -1);
- return out;
+ if (!opts.includeFirstWordFallback) return null;
+ const word = extractFirstWord(s.slice(pos));
+ if (!word) return null;
+ return escapeSignatureLiteral(word);
+ }
+ return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
+ };
+ const processLine = (line, pageId, tokenPriority, opts, acc) => {
+ const trimmed = collapseWhitespace(line);
+ if (trimmed.length < opts.minLineLength) return;
+ if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
+ const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
+ if (!sig) return;
+ const entry = acc.get(sig);
+ if (!entry) acc.set(sig, {
+ count: 1,
+ examples: [{
+ line: trimmed,
+ pageId
+ }]
+ });
+ else {
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push({
+ line: trimmed,
+ pageId
+ });
+ }
+ };
+ const processPage = (page, tokenPriority, opts, acc) => {
+ for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
  };
  /**
  * Analyze pages and return the most common line-start patterns (top K).
- *
- * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
- * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
  */
  const analyzeCommonLineStarts = (pages, options = {}) => {
- const o = {
- ...DEFAULT_OPTIONS,
- ...options,
- lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
- prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
- whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
- };
+ const opts = resolveOptions$1(options);
  const tokenPriority = buildTokenPriority();
- const counts = /* @__PURE__ */ new Map();
- for (const page of pages) {
- const lines = normalizeLineEndings(page.content ?? "").split("\n");
- for (const line of lines) {
- const trimmed = collapseWhitespace(line);
- if (trimmed.length < o.minLineLength) continue;
- if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
- const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers, o.whitespace);
- if (!sig) continue;
- const existing = counts.get(sig);
- if (!existing) counts.set(sig, {
- count: 1,
- examples: [{
- line: trimmed,
- pageId: page.id
- }]
- });
- else {
- existing.count++;
- if (existing.examples.length < o.maxExamples) existing.examples.push({
- line: trimmed,
- pageId: page.id
- });
+ const acc = /* @__PURE__ */ new Map();
+ for (const page of pages) processPage(page, tokenPriority, opts, acc);
+ const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
+ return [...acc.entries()].map(([pattern, v]) => ({
+ count: v.count,
+ examples: v.examples,
+ pattern
+ })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
+ };
+
+ //#endregion
+ //#region src/analysis/repeating-sequences.ts
+ const resolveOptions = (options) => {
+ const minElements = Math.max(1, options?.minElements ?? 1);
+ return {
+ contextChars: options?.contextChars ?? 50,
+ maxElements: Math.max(minElements, options?.maxElements ?? 3),
+ maxExamples: options?.maxExamples ?? 3,
+ maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
+ minCount: Math.max(1, options?.minCount ?? 3),
+ minElements,
+ normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
+ requireToken: options?.requireToken ?? true,
+ topK: Math.max(1, options?.topK ?? 20),
+ whitespace: options?.whitespace ?? "regex"
+ };
+ };
+ /** Creates a cursor that tracks position in both normalized and raw text */
+ const createRawCursor = (text, normalize) => {
+ let rawPos = 0;
+ return {
+ advance(normalizedLen) {
+ if (!normalize) {
+ const chunk = text.slice(rawPos, rawPos + normalizedLen);
+ rawPos += normalizedLen;
+ return chunk;
  }
+ const start = rawPos;
+ let matchedLen = 0;
+ while (matchedLen < normalizedLen && rawPos < text.length) {
+ if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
+ rawPos++;
+ }
+ while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
+ return text.slice(start, rawPos);
+ },
+ get pos() {
+ return rawPos;
+ }
+ };
+ };
+ /** Scans text and produces a stream of tokens and literals. */
+ const tokenizeContent = (text, normalize) => {
+ const normalized = normalize ? stripArabicDiacritics(text) : text;
+ const compiled = compileTokenRegexes(buildTokenPriority());
+ const cursor = createRawCursor(text, normalize);
+ const items = [];
+ let pos = 0;
+ while (pos < normalized.length) {
+ const ws = /^\s+/u.exec(normalized.slice(pos));
+ if (ws) {
+ pos += ws[0].length;
+ cursor.advance(ws[0].length);
+ continue;
+ }
+ const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
+ if (token) {
+ const raw = cursor.advance(token.text.length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: `{{${token.token}}}`,
+ type: "token"
+ });
+ pos += token.text.length;
+ continue;
+ }
+ if (isCommonDelimiter(normalized[pos])) {
+ const raw = cursor.advance(1);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - 1,
+ text: escapeSignatureLiteral(normalized[pos]),
+ type: "literal"
+ });
+ pos++;
+ continue;
+ }
+ const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
+ if (word) {
+ const raw = cursor.advance(word[0].length);
+ items.push({
+ end: cursor.pos,
+ raw,
+ start: cursor.pos - raw.length,
+ text: escapeSignatureLiteral(word[0]),
+ type: "literal"
+ });
+ pos += word[0].length;
+ continue;
  }
+ cursor.advance(1);
+ pos++;
  }
- const compareSpecificityThenCount = (a, b) => {
- const sa = computeSpecificity(a.pattern);
- const sb = computeSpecificity(b.pattern);
- if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
- if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
- if (b.count !== a.count) return b.count - a.count;
- return a.pattern.localeCompare(b.pattern);
+ return items;
+ };
+ /** Build pattern string from window items */
+ const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
+ /** Check if window contains at least one token */
+ const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
+ /** Compute token count and literal length for a window */
+ const computeWindowStats = (window) => {
+ let tokenCount = 0, literalLen = 0;
+ for (const item of window) if (item.type === "token") tokenCount++;
+ else literalLen += item.text.length;
+ return {
+ literalLen,
+ tokenCount
  };
- const compareCountThenSpecificity = (a, b) => {
- if (b.count !== a.count) return b.count - a.count;
- return compareSpecificityThenCount(a, b);
+ };
+ /** Build example from page content and window */
+ const buildExample = (page, window, contextChars) => {
+ const start = window[0].start;
+ const end = window.at(-1).end;
+ const ctxStart = Math.max(0, start - contextChars);
+ const ctxEnd = Math.min(page.content.length, end + contextChars);
+ return {
+ context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
+ pageId: page.id,
+ startIndices: window.map((w) => w.start),
+ text: page.content.slice(start, end)
  };
- return [...counts.entries()].map(([pattern, v]) => ({
- count: v.count,
- examples: v.examples,
+ };
+ /** Extract N-grams from a single page */
+ const extractPageNgrams = (page, items, opts, stats) => {
+ for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
+ const window = items.slice(i, i + n);
+ if (opts.requireToken && !hasTokenInWindow(window)) continue;
+ const pattern = buildPattern(window, opts.whitespace);
+ if (!stats.has(pattern)) {
+ if (stats.size >= opts.maxUniquePatterns) continue;
+ stats.set(pattern, {
+ count: 0,
+ examples: [],
+ ...computeWindowStats(window)
+ });
+ }
+ const entry = stats.get(pattern);
+ entry.count++;
+ if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+ }
+ };
+ /**
+ * Analyze pages for commonly repeating word sequences.
+ *
+ * Use for continuous text without line breaks. For line-based analysis,
+ * use `analyzeCommonLineStarts()` instead.
+ */
+ const analyzeRepeatingSequences = (pages, options) => {
+ const opts = resolveOptions(options);
+ const stats = /* @__PURE__ */ new Map();
+ for (const page of pages) {
+ if (!page.content) continue;
+ extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
+ }
+ return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
+ count: s.count,
+ examples: s.examples,
  pattern
- })).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+ }));
  };

  //#endregion
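The analysis code is now split into `src/analysis/shared.ts`, `line-starts.ts` (the existing `analyzeCommonLineStarts`), and a new `repeating-sequences.ts` whose `analyzeRepeatingSequences` mines token/literal n-grams from continuous text. A usage sketch with invented content, assuming the function is exported from the package entry (the updated export list falls outside this excerpt):

    import { analyzeRepeatingSequences } from "flappa-doormal";

    const results = analyzeRepeatingSequences(
      [{ id: 1, content: "..." }], // hypothetical pages
      { minElements: 1, maxElements: 3, minCount: 3, topK: 20 }, // the defaults from resolveOptions
    );
    // results: [{ pattern, count, examples: [{ pageId, text, context, startIndices }] }, ...]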
@@ -2831,5 +3099,524 @@ const analyzeTextForRule = (text) => {
2831
3099
  };
2832
3100
 
2833
3101
  //#endregion
2834
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
3102
+ //#region src/recovery.ts
3103
+ const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
3104
+ const normalizeForCompare = (s, mode) => {
3105
+ if (mode === "none") return s;
3106
+ let out = s;
3107
+ if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
3108
+ out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
3109
+ return out;
3110
+ };
3111
+ const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
3112
+ const buildFixedOptions = (options, selectedRuleIndices) => {
3113
+ const fixedRules = (options.rules ?? []).map((r, idx) => {
3114
+ if (!selectedRuleIndices.has(idx)) return r;
3115
+ if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
3116
+ const { lineStartsAfter, ...rest } = r;
3117
+ return {
3118
+ ...rest,
3119
+ lineStartsWith: lineStartsAfter
3120
+ };
3121
+ });
3122
+ return {
3123
+ ...options,
3124
+ rules: fixedRules
3125
+ };
3126
+ };
3127
+ const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
3128
+ const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
3129
+ const parts = [];
3130
+ for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
3131
+ const matchContent = parts.join("\n");
3132
+ if (pageJoiner === "newline") return {
3133
+ matchContent,
3134
+ outputContent: matchContent
3135
+ };
3136
+ return {
3137
+ matchContent,
3138
+ outputContent: parts.join(" ")
3139
+ };
3140
+ };
3141
+ const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
3142
+ const rules = options.rules ?? [];
3143
+ const compiled = [];
3144
+ for (const idx of selectedRuleIndices) {
3145
+ const r = rules[idx];
3146
+ if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
3147
+ const { lineStartsAfter, ...rest } = r;
3148
+ const built = buildRuleRegex({
3149
+ ...rest,
3150
+ lineStartsWith: lineStartsAfter
3151
+ });
3152
+ compiled.push({
3153
+ ruleIndex: idx,
3154
+ startsWithRegex: new RegExp(built.regex.source, "mu")
3155
+ });
3156
+ }
3157
+ return compiled;
3158
+ };
3159
+ const findUniqueAnchorPos = (outputContent, segmentContent) => {
3160
+ for (const len of [
3161
+ 80,
3162
+ 60,
3163
+ 40,
3164
+ 30,
3165
+ 20,
3166
+ 15
3167
+ ]) {
3168
+ const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
3169
+ if (!needle.trim()) continue;
3170
+ const first = outputContent.indexOf(needle);
3171
+ if (first === -1) continue;
3172
+ if (outputContent.indexOf(needle, first + 1) === -1) return first;
3173
+ }
3174
+ return null;
3175
+ };
3176
+ const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
3177
+ const line = matchContent.slice(lineStart);
3178
+ for (const mr of compiledMistaken) {
3179
+ mr.startsWithRegex.lastIndex = 0;
3180
+ const m = mr.startsWithRegex.exec(line);
3181
+ if (!m || m.index !== 0) continue;
3182
+ const markerMatch = m[0];
3183
+ const markerEnd = lineStart + markerMatch.length;
3184
+ if (anchorPos < markerEnd) continue;
3185
+ const gap = matchContent.slice(markerEnd, anchorPos);
3186
+ const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
3187
+ if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
3188
+ return { prefix: recoveredPrefix };
3189
+ }
3190
+ return { reason: "no selected marker pattern matched at anchored line start" };
3191
+ };
3192
+ const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
3193
+ const fromIdx = pageIdToIndex.get(segment.from);
3194
+ const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
3195
+ if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
3196
+ kind: "unresolved",
3197
+ reason: "segment page range not found in pages"
3198
+ };
3199
+ const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
3200
+ if (!segment.content) return {
3201
+ kind: "unresolved",
3202
+ reason: "empty segment content"
3203
+ };
3204
+ const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
3205
+ if (anchorPos === null) return {
3206
+ kind: "unresolved",
3207
+ reason: "could not uniquely anchor segment content in page range"
3208
+ };
3209
+ const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
3210
+ const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
3211
+ if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
3212
+ kind: "unresolved",
3213
+ reason: found.reason
3214
+ };
3215
+ return {
3216
+ kind: "recovered",
3217
+ recoveredContent: `${found.prefix}${segment.content}`,
3218
+ recoveredPrefix: found.prefix
3219
+ };
3220
+ };
3221
+ const resolveRuleIndicesSelector = (rules, indicesIn) => {
3222
+ const errors = [];
3223
+ const indices = /* @__PURE__ */ new Set();
3224
+ for (const idx of indicesIn) {
3225
+ if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
3226
+ errors.push(`Selector index out of range: ${idx}`);
3227
+ continue;
3228
+ }
3229
+ const rule = rules[idx];
3230
+ if (!rule || !("lineStartsAfter" in rule)) {
3231
+ errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
3232
+ continue;
3233
+ }
3234
+ indices.add(idx);
3235
+ }
3236
+ return {
3237
+ errors,
3238
+ indices,
3239
+ warnings: []
3240
+ };
3241
+ };
3242
+ const resolvePredicateSelector = (rules, predicate) => {
3243
+ const errors = [];
3244
+ const warnings = [];
3245
+ const indices = /* @__PURE__ */ new Set();
3246
+ rules.forEach((r, i) => {
3247
+ try {
3248
+ if (!predicate(r, i)) return;
3249
+ if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
3250
+ indices.add(i);
3251
+ return;
3252
+ }
3253
+ warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
3254
+ } catch (e) {
3255
+ const msg = e instanceof Error ? e.message : String(e);
3256
+ errors.push(`Predicate threw at rule ${i}: ${msg}`);
3257
+ }
3258
+ });
3259
+ if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
3260
+ return {
3261
+ errors,
3262
+ indices,
3263
+ warnings
3264
+ };
3265
+ };
3266
+ const resolvePatternsSelector = (rules, patterns, matchMode) => {
+   const errors = [];
+   const warnings = [];
+   const indices = /* @__PURE__ */ new Set();
+   const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
+   const targets = patterns.map(normalizePattern);
+   for (let pi = 0; pi < patterns.length; pi++) {
+     const rawPattern = patterns[pi];
+     const pat = targets[pi];
+     const matched = [];
+     for (let i = 0; i < rules.length; i++) {
+       const r = rules[i];
+       if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+       if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
+     }
+     if (matched.length === 0) {
+       errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
+       continue;
+     }
+     if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
+     matched.forEach((i) => {
+       indices.add(i);
+     });
+   }
+   return {
+     errors,
+     indices,
+     warnings
+   };
+ };
+ const resolveSelectorToRuleIndices = (options, selector) => {
+   const rules = options.rules ?? [];
+   if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
+   if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
+   return resolvePatternsSelector(rules, selector.patterns, selector.match);
+ };
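The three selector shapes dispatched above, as a usage sketch (field values are hypothetical; the shapes follow from the dispatch and the resolvers in this diff):

    const byIndex = { type: "rule_indices", indices: [0, 2] };
    const byPredicate = { type: "predicate", predicate: (rule, i) => "lineStartsAfter" in rule };
    const byPatterns = { type: "patterns", patterns: ["Chapter "], match: "normalized" };
    // Each resolves to { errors, indices: Set<number>, warnings }:
    const { indices, errors, warnings } = resolveSelectorToRuleIndices(options, byPatterns);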
+ const longestCommonSuffixLength = (a, b) => {
+   const max = Math.min(a.length, b.length);
+   let i = 0;
+   while (i < max) {
+     if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
+     i++;
+   }
+   return i;
+ };
+ const AMBIGUITY_SCORE_GAP = 5;
+ const scoreCandidate = (orig, fixed, normalizeMode) => {
+   if (fixed.content === orig.content) return {
+     fixedIndex: -1,
+     kind: "exact",
+     score: 100
+   };
+   if (fixed.content.endsWith(orig.content)) {
+     const markerLen = fixed.content.length - orig.content.length;
+     return {
+       fixedIndex: -1,
+       kind: "exact_suffix",
+       score: 90 + Math.min(30, markerLen)
+     };
+   }
+   if (normalizeMode !== "none") {
+     const normFixed = normalizeForCompare(fixed.content, normalizeMode);
+     const normOrig = normalizeForCompare(orig.content, normalizeMode);
+     if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
+       const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
+       return {
+         fixedIndex: -1,
+         kind: "normalized_suffix",
+         score: 70 + Math.floor(overlap * 20)
+       };
+     }
+   }
+   return null;
+ };
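Worked through, the scoring tiers above keep the match kinds strictly ordered: an exact content match scores 100; a fixed segment that merely ends with the original content scores 90 + min(30, markerLen), e.g. 97 for a recovered 7-character marker prefix; a suffix match found only after normalizeForCompare scores 70 + floor(overlap * 20), capping at 90, so it can never outrank an exact-suffix match.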
+ const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
+   const warnings = [...reportBase.warnings];
+   warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
+   const details = segments.map((s, i) => {
+     const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
+     return {
+       from: s.from,
+       notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
+       originalStartPreview: preview(s.content),
+       segmentIndex: i,
+       status,
+       strategy: "none",
+       to: s.to
+     };
+   });
+   return {
+     report: {
+       ...reportBase,
+       details,
+       summary: {
+         mode,
+         recovered: 0,
+         totalSegments: segments.length,
+         unchanged: segments.length,
+         unresolved: selectorErrors.length ? segments.length : 0
+       },
+       warnings
+     },
+     segments
+   };
+ };
+ const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
+   const recoveredAtIndex = /* @__PURE__ */ new Map();
+   const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
+   if (mode !== "best_effort_then_rerun") return {
+     recoveredAtIndex,
+     recoveredDetailAtIndex
+   };
+   const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
+   const pageIdToIndex = buildPageIdToIndex(processedPages);
+   const pageJoiner = options.pageJoiner ?? "space";
+   const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
+   for (let i = 0; i < segments.length; i++) {
+     const orig = segments[i];
+     const r = tryBestEffortRecoverOneSegment(orig, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
+     if (r.kind !== "recovered") continue;
+     const seg = {
+       ...orig,
+       content: r.recoveredContent
+     };
+     recoveredAtIndex.set(i, seg);
+     recoveredDetailAtIndex.set(i, {
+       from: orig.from,
+       originalStartPreview: preview(orig.content),
+       recoveredPrefixPreview: preview(r.recoveredPrefix),
+       recoveredStartPreview: preview(seg.content),
+       segmentIndex: i,
+       status: "recovered",
+       strategy: "stage1",
+       to: orig.to
+     });
+   }
+   return {
+     recoveredAtIndex,
+     recoveredDetailAtIndex
+   };
+ };
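Stage 1 runs only in "best_effort_then_rerun" mode; with the default "rerun_only" both maps stay empty and every segment falls through to the rerun alignment below. Before anchoring, the pass applies any options.replace replacements and defaults pageJoiner to "space", as the code above shows.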
+ const buildFixedBuckets = (fixedSegments) => {
+   const buckets = /* @__PURE__ */ new Map();
+   for (let i = 0; i < fixedSegments.length; i++) {
+     const k = segmentRangeKey(fixedSegments[i]);
+     const arr = buckets.get(k);
+     if (!arr) buckets.set(k, [i]);
+     else arr.push(i);
+   }
+   return buckets;
+ };
+ const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
+   let best = null;
+   let secondBestScore = -Infinity;
+   for (const fixedIdx of candidates) {
+     if (usedFixed.has(fixedIdx)) continue;
+     const fixed = fixedSegments[fixedIdx];
+     const scored = scoreCandidate(orig, fixed, normalizeCompare);
+     if (!scored) continue;
+     const candidateScore = scored.score;
+     if (!best || candidateScore > best.score) {
+       secondBestScore = best?.score ?? -Infinity;
+       best = {
+         fixedIdx,
+         score: candidateScore
+       };
+     } else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
+   }
+   if (!best) return { kind: "none" };
+   if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
+   return {
+     fixedIdx: best.fixedIdx,
+     kind: "match"
+   };
+ };
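An illustrative trace of the ambiguity guard above: with AMBIGUITY_SCORE_GAP = 5, two candidates scoring 97 and 95 differ by 2, so the function returns { kind: "ambiguous" } and the caller leaves the segment untouched rather than risk a wrong alignment. A lone scored candidate wins outright, since secondBestScore stays at -Infinity, and candidates already claimed via usedFixed are skipped, keeping the alignment one-to-one.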
+ const detailUnresolved = (orig, segmentIndex, notes) => ({
+   from: orig.from,
+   notes,
+   originalStartPreview: preview(orig.content),
+   segmentIndex,
+   status: "unresolved_alignment",
+   strategy: "rerun",
+   to: orig.to
+ });
+ const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
+   from: orig.from,
+   notes,
+   originalStartPreview: preview(orig.content),
+   segmentIndex,
+   status: "skipped_idempotent",
+   strategy: "rerun",
+   to: orig.to
+ });
+ const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
+   let recoveredPrefixPreview;
+   if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
+   return {
+     from: orig.from,
+     originalStartPreview: preview(orig.content),
+     recoveredPrefixPreview,
+     recoveredStartPreview: preview(fixed.content),
+     segmentIndex,
+     status: "recovered",
+     strategy: "rerun",
+     to: orig.to
+   };
+ };
+ const mergeWithRerun = (params) => {
+   const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
+   const usedFixed = /* @__PURE__ */ new Set();
+   const out = [];
+   const details = [];
+   let recovered = 0;
+   let unresolved = 0;
+   let unchanged = 0;
+   for (let i = 0; i < originalSegments.length; i++) {
+     const stage1Recovered = stage1RecoveredAtIndex.get(i);
+     if (stage1Recovered) {
+       out.push(stage1Recovered);
+       recovered++;
+       details.push(recoveredDetailAtIndex.get(i) ?? {
+         from: stage1Recovered.from,
+         originalStartPreview: preview(originalSegments[i].content),
+         recoveredStartPreview: preview(stage1Recovered.content),
+         segmentIndex: i,
+         status: "recovered",
+         strategy: "stage1",
+         to: stage1Recovered.to
+       });
+       continue;
+     }
+     const orig = originalSegments[i];
+     const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
+     if (best.kind === "none") {
+       out.push(orig);
+       unresolved++;
+       details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
+       continue;
+     }
+     if (best.kind === "ambiguous") {
+       out.push(orig);
+       unresolved++;
+       details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
+       continue;
+     }
+     usedFixed.add(best.fixedIdx);
+     const fixed = fixedSegments[best.fixedIdx];
+     if (fixed.content === orig.content) {
+       out.push(orig);
+       unchanged++;
+       details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
+       continue;
+     }
+     out.push({
+       ...orig,
+       content: fixed.content
+     });
+     recovered++;
+     details.push(detailRecoveredRerun(orig, fixed, i));
+   }
+   return {
+     details,
+     segments: out,
+     summary: {
+       recovered,
+       unchanged,
+       unresolved
+     }
+   };
+ };
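Per-segment precedence in the merge above: a stage-1 recovery wins outright; otherwise the segment is aligned against rerun output bucketed by its (from, to) range key and scored, where identical content is reported as skipped_idempotent, a missing or ambiguous candidate leaves the segment unchanged and counts as unresolved, and any other match adopts the rerun content and counts as recovered.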
+ function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
+   const mode = opts?.mode ?? "rerun_only";
+   const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
+   const resolved = resolveSelectorToRuleIndices(options, selector);
+   const reportBase = {
+     byRun: void 0,
+     errors: resolved.errors,
+     warnings: resolved.warnings
+   };
+   if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
+   const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
+   const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
+   const merged = mergeWithRerun({
+     fixedBuckets: buildFixedBuckets(fixedSegments),
+     fixedSegments,
+     normalizeCompare,
+     originalSegments: segments,
+     recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
+     stage1RecoveredAtIndex: stage1.recoveredAtIndex
+   });
+   return {
+     report: {
+       ...reportBase,
+       details: merged.details,
+       summary: {
+         mode,
+         recovered: merged.summary.recovered,
+         totalSegments: segments.length,
+         unchanged: merged.summary.unchanged,
+         unresolved: merged.summary.unresolved
+       }
+     },
+     segments: merged.segments
+   };
+ }
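A usage sketch of the new entry point (variable names and the example pattern are hypothetical; the signature and defaults follow the code above):

    const { segments: fixed, report } = recoverMistakenLineStartsAfterMarkers(
      pages,
      previousSegments,
      segmentationOptions,
      { type: "patterns", patterns: ["Chapter "], match: "normalized" },
      { mode: "best_effort_then_rerun", normalizeCompare: "whitespace" }
    );
    // report.summary -> { mode, recovered, totalSegments, unchanged, unresolved }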
+ function recoverMistakenMarkersForRuns(runs, opts) {
+   const allSegments = [];
+   const byRun = [];
+   const details = [];
+   const warnings = [];
+   const errors = [];
+   let recovered = 0;
+   let unchanged = 0;
+   let unresolved = 0;
+   let offset = 0;
+   for (let i = 0; i < runs.length; i++) {
+     const run = runs[i];
+     const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
+     allSegments.push(...res.segments);
+     for (const d of res.report.details) details.push({
+       ...d,
+       segmentIndex: d.segmentIndex + offset
+     });
+     offset += run.segments.length;
+     recovered += res.report.summary.recovered;
+     unchanged += res.report.summary.unchanged;
+     unresolved += res.report.summary.unresolved;
+     warnings.push(...res.report.warnings);
+     errors.push(...res.report.errors);
+     byRun.push({
+       recovered: res.report.summary.recovered,
+       runIndex: i,
+       totalSegments: run.segments.length,
+       unresolved: res.report.summary.unresolved
+     });
+   }
+   return {
+     report: {
+       byRun,
+       details,
+       errors,
+       summary: {
+         mode: opts?.mode ?? "rerun_only",
+         recovered,
+         totalSegments: offset,
+         unchanged,
+         unresolved
+       },
+       warnings
+     },
+     segments: allSegments
+   };
+ }
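The batched variant above maps the single-run entry point over each run, concatenating segments, offsetting segmentIndex in the details so they index into the combined array, and breaking the counts down per run in report.byRun. A minimal call (run fields follow the destructuring above):

    const { segments, report } = recoverMistakenMarkersForRuns(
      [{ pages, segments: previousSegments, options: segmentationOptions, selector }],
      { mode: "rerun_only" }
    );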
+
+ //#endregion
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
  //# sourceMappingURL=index.mjs.map