flappa-doormal 2.7.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +80 -5
- package/README.md +138 -47
- package/dist/index.d.mts +113 -95
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1072 -285
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -645,6 +645,10 @@ const buildBareTokenRegex = () => {
|
|
|
645
645
|
* Validates a single pattern for common issues.
|
|
646
646
|
*/
|
|
647
647
|
const validatePattern = (pattern, seenPatterns) => {
|
|
648
|
+
if (!pattern.trim()) return {
|
|
649
|
+
message: "Empty pattern is not allowed",
|
|
650
|
+
type: "empty_pattern"
|
|
651
|
+
};
|
|
648
652
|
if (seenPatterns.has(pattern)) return {
|
|
649
653
|
message: `Duplicate pattern: "${pattern}"`,
|
|
650
654
|
type: "duplicate"
|
|
@@ -727,7 +731,7 @@ const validateRules = (rules) => {
|
|
|
727
731
|
hasIssues = true;
|
|
728
732
|
}
|
|
729
733
|
}
|
|
730
|
-
if ("template" in rule && rule.template) {
|
|
734
|
+
if ("template" in rule && rule.template !== void 0) {
|
|
731
735
|
const seenPatterns = /* @__PURE__ */ new Set();
|
|
732
736
|
const issue = validatePattern(rule.template, seenPatterns);
|
|
733
737
|
if (issue) {
|
|
@@ -1245,16 +1249,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
|
|
|
1245
1249
|
*/
|
|
1246
1250
|
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
|
|
1247
1251
|
const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
|
|
1248
|
-
for (
|
|
1252
|
+
for (let i = 0; i < expandedBreakpoints.length; i++) {
|
|
1253
|
+
const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
|
|
1249
1254
|
if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
|
|
1250
1255
|
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
1251
1256
|
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
1252
|
-
if (regex === null) return
|
|
1257
|
+
if (regex === null) return {
|
|
1258
|
+
breakpointIndex: i,
|
|
1259
|
+
breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
|
|
1260
|
+
rule
|
|
1261
|
+
};
|
|
1253
1262
|
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
1254
|
-
if (breakPos > 0) return
|
|
1263
|
+
if (breakPos > 0) return {
|
|
1264
|
+
breakpointIndex: i,
|
|
1265
|
+
breakPos,
|
|
1266
|
+
rule
|
|
1267
|
+
};
|
|
1255
1268
|
}
|
|
1256
|
-
return
|
|
1269
|
+
return null;
|
|
1270
|
+
};
|
|
1271
|
+
|
|
1272
|
+
//#endregion
|
|
1273
|
+
//#region src/segmentation/debug-meta.ts
|
|
1274
|
+
const resolveDebugConfig = (debug) => {
|
|
1275
|
+
if (!debug) return null;
|
|
1276
|
+
if (debug === true) return {
|
|
1277
|
+
includeBreakpoint: true,
|
|
1278
|
+
includeRule: true,
|
|
1279
|
+
metaKey: "_flappa"
|
|
1280
|
+
};
|
|
1281
|
+
if (typeof debug !== "object") return null;
|
|
1282
|
+
const metaKey = debug.metaKey;
|
|
1283
|
+
const include = debug.include;
|
|
1284
|
+
const includeRule = Array.isArray(include) ? include.includes("rule") : true;
|
|
1285
|
+
return {
|
|
1286
|
+
includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
|
|
1287
|
+
includeRule,
|
|
1288
|
+
metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
|
|
1289
|
+
};
|
|
1290
|
+
};
|
|
1291
|
+
const getRulePatternType = (rule) => {
|
|
1292
|
+
if ("lineStartsWith" in rule) return "lineStartsWith";
|
|
1293
|
+
if ("lineStartsAfter" in rule) return "lineStartsAfter";
|
|
1294
|
+
if ("lineEndsWith" in rule) return "lineEndsWith";
|
|
1295
|
+
if ("template" in rule) return "template";
|
|
1296
|
+
return "regex";
|
|
1297
|
+
};
|
|
1298
|
+
const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
|
|
1299
|
+
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
|
|
1300
|
+
const out = meta ? { ...meta } : {};
|
|
1301
|
+
const existing = out[metaKey];
|
|
1302
|
+
out[metaKey] = {
|
|
1303
|
+
...isPlainObject(existing) ? existing : {},
|
|
1304
|
+
...patch
|
|
1305
|
+
};
|
|
1306
|
+
return out;
|
|
1257
1307
|
};
|
|
1308
|
+
const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
|
|
1309
|
+
index: ruleIndex,
|
|
1310
|
+
patternType: getRulePatternType(rule)
|
|
1311
|
+
} });
|
|
1312
|
+
const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
|
|
1313
|
+
index: breakpointIndex,
|
|
1314
|
+
kind: rule.pattern === "" ? "pageBoundary" : "pattern",
|
|
1315
|
+
pattern: rule.pattern
|
|
1316
|
+
} });
|
|
1258
1317
|
|
|
1259
1318
|
//#endregion
|
|
1260
1319
|
//#region src/segmentation/breakpoint-processor.ts
|
|
@@ -1338,20 +1397,25 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
|
|
|
1338
1397
|
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
|
|
1339
1398
|
if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
|
|
1340
1399
|
const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
|
|
1341
|
-
if (exclusionBreak > 0) return exclusionBreak;
|
|
1400
|
+
if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
|
|
1342
1401
|
}
|
|
1343
|
-
const
|
|
1402
|
+
const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
|
|
1344
1403
|
expandedBreakpoints,
|
|
1345
1404
|
normalizedPages,
|
|
1346
1405
|
pageIds,
|
|
1347
1406
|
prefer
|
|
1348
1407
|
});
|
|
1349
|
-
|
|
1408
|
+
if (patternMatch && patternMatch.breakPos > 0) return {
|
|
1409
|
+
breakOffset: patternMatch.breakPos,
|
|
1410
|
+
breakpointIndex: patternMatch.breakpointIndex,
|
|
1411
|
+
breakpointRule: patternMatch.rule
|
|
1412
|
+
};
|
|
1413
|
+
return { breakOffset: windowEndPosition };
|
|
1350
1414
|
};
|
|
1351
1415
|
/**
|
|
1352
1416
|
* Advances cursor position past any leading whitespace.
|
|
1353
1417
|
*/
|
|
1354
|
-
const skipWhitespace = (content, startPos) => {
|
|
1418
|
+
const skipWhitespace$1 = (content, startPos) => {
|
|
1355
1419
|
let pos = startPos;
|
|
1356
1420
|
while (pos < content.length && /\s/.test(content[pos])) pos++;
|
|
1357
1421
|
return pos;
|
|
@@ -1362,12 +1426,13 @@ const skipWhitespace = (content, startPos) => {
|
|
|
1362
1426
|
*
|
|
1363
1427
|
* Uses precomputed boundary positions for O(log n) page attribution lookups.
|
|
1364
1428
|
*/
|
|
1365
|
-
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
|
|
1429
|
+
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
|
|
1366
1430
|
const result = [];
|
|
1367
1431
|
const fullContent = segment.content;
|
|
1368
1432
|
let cursorPos = 0;
|
|
1369
1433
|
let currentFromIdx = fromIdx;
|
|
1370
1434
|
let isFirstPiece = true;
|
|
1435
|
+
let lastBreakpoint = null;
|
|
1371
1436
|
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1372
1437
|
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
1373
1438
|
boundaryPositions,
|
|
@@ -1382,7 +1447,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1382
1447
|
const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
|
|
1383
1448
|
const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
|
|
1384
1449
|
if (remainingSpan <= maxPages && !remainingHasExclusions) {
|
|
1385
|
-
const
|
|
1450
|
+
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
|
|
1451
|
+
const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
|
|
1452
|
+
const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
|
|
1386
1453
|
if (finalSeg) result.push(finalSeg);
|
|
1387
1454
|
break;
|
|
1388
1455
|
}
|
|
@@ -1393,8 +1460,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1393
1460
|
cursorPos,
|
|
1394
1461
|
windowEndIdx
|
|
1395
1462
|
});
|
|
1396
|
-
const
|
|
1397
|
-
|
|
1463
|
+
const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
|
|
1464
|
+
if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
|
|
1465
|
+
breakpointIndex: found.breakpointIndex,
|
|
1466
|
+
rule: found.breakpointRule
|
|
1467
|
+
};
|
|
1468
|
+
const breakPos = cursorPos + found.breakOffset;
|
|
1398
1469
|
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
|
|
1399
1470
|
const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
|
|
1400
1471
|
logger?.trace?.("[breakpoints] piece", {
|
|
@@ -1403,10 +1474,11 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1403
1474
|
pieceLength: pieceContent.length
|
|
1404
1475
|
});
|
|
1405
1476
|
if (pieceContent) {
|
|
1406
|
-
const
|
|
1477
|
+
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
|
|
1478
|
+
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
|
|
1407
1479
|
if (pieceSeg) result.push(pieceSeg);
|
|
1408
1480
|
}
|
|
1409
|
-
cursorPos = skipWhitespace(fullContent, breakPos);
|
|
1481
|
+
cursorPos = skipWhitespace$1(fullContent, breakPos);
|
|
1410
1482
|
currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
|
|
1411
1483
|
isFirstPiece = false;
|
|
1412
1484
|
}
|
|
@@ -1418,7 +1490,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
|
|
|
1418
1490
|
*
|
|
1419
1491
|
* Note: This is an internal engine used by `segmentPages()`.
|
|
1420
1492
|
*/
|
|
1421
|
-
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
|
|
1493
|
+
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
|
|
1422
1494
|
const pageIds = pages.map((p) => p.id);
|
|
1423
1495
|
const pageIdToIndex = buildPageIdToIndexMap(pageIds);
|
|
1424
1496
|
const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
|
|
@@ -1446,7 +1518,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
1446
1518
|
result.push(segment);
|
|
1447
1519
|
continue;
|
|
1448
1520
|
}
|
|
1449
|
-
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
|
|
1521
|
+
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
|
|
1450
1522
|
result.push(...broken.map((s) => {
|
|
1451
1523
|
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
|
|
1452
1524
|
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
|
|
@@ -1959,6 +2031,129 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
|
|
|
1959
2031
|
return splitPointsByRule;
|
|
1960
2032
|
};
|
|
1961
2033
|
|
|
2034
|
+
//#endregion
|
|
2035
|
+
//#region src/segmentation/split-point-helpers.ts
|
|
2036
|
+
/**
|
|
2037
|
+
* Helper module for collectSplitPointsFromRules to reduce complexity.
|
|
2038
|
+
* Handles combined regex matching and split point creation.
|
|
2039
|
+
*/
|
|
2040
|
+
const MAX_REGEX_ITERATIONS = 1e5;
|
|
2041
|
+
const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
|
|
2042
|
+
const result = {};
|
|
2043
|
+
if (!groups) return result;
|
|
2044
|
+
for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
|
|
2045
|
+
return result;
|
|
2046
|
+
};
|
|
2047
|
+
const buildContentOffsets = (match, ruleInfo) => {
|
|
2048
|
+
if (!ruleInfo.usesLineStartsAfter) return {};
|
|
2049
|
+
const captured = match.groups?.[`${ruleInfo.prefix}__content`];
|
|
2050
|
+
if (captured === void 0) return {};
|
|
2051
|
+
return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
|
|
2052
|
+
};
|
|
2053
|
+
const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
|
|
2054
|
+
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
2055
|
+
const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
|
|
2056
|
+
const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
|
|
2057
|
+
return {
|
|
2058
|
+
capturedContent: void 0,
|
|
2059
|
+
contentStartOffset,
|
|
2060
|
+
index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
|
|
2061
|
+
meta: rule.meta,
|
|
2062
|
+
namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
|
|
2063
|
+
};
|
|
2064
|
+
};
|
|
2065
|
+
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
|
|
2066
|
+
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
2067
|
+
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
2068
|
+
logger?.debug?.("[segmenter] combined regex built", {
|
|
2069
|
+
combinableRuleCount: combinableRules.length,
|
|
2070
|
+
combinedSourceLength: combinedSource.length
|
|
2071
|
+
});
|
|
2072
|
+
let m = combinedRegex.exec(matchContent);
|
|
2073
|
+
let iterations = 0;
|
|
2074
|
+
while (m !== null) {
|
|
2075
|
+
iterations++;
|
|
2076
|
+
if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
|
|
2077
|
+
if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
|
|
2078
|
+
iterations,
|
|
2079
|
+
position: m.index
|
|
2080
|
+
});
|
|
2081
|
+
const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
|
|
2082
|
+
if (matchedIndex !== -1) {
|
|
2083
|
+
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
2084
|
+
const ruleInfo = ruleRegexes[matchedIndex];
|
|
2085
|
+
if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
|
|
2086
|
+
const sp = createSplitPointFromMatch(m, rule, ruleInfo);
|
|
2087
|
+
if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
|
|
2088
|
+
splitPointsByRule.get(originalIndex).push(sp);
|
|
2089
|
+
}
|
|
2090
|
+
}
|
|
2091
|
+
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
2092
|
+
m = combinedRegex.exec(matchContent);
|
|
2093
|
+
}
|
|
2094
|
+
};
|
|
2095
|
+
const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
|
|
2096
|
+
const built = buildRuleRegex(rule, prefix);
|
|
2097
|
+
return {
|
|
2098
|
+
...built,
|
|
2099
|
+
prefix,
|
|
2100
|
+
source: `(?<${prefix}>${built.regex.source})`
|
|
2101
|
+
};
|
|
2102
|
+
});
|
|
2103
|
+
const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
|
|
2104
|
+
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
2105
|
+
const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
|
|
2106
|
+
const isLSA = usesLineStartsAfter && m.captured !== void 0;
|
|
2107
|
+
const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
|
|
2108
|
+
return {
|
|
2109
|
+
capturedContent: isLSA ? void 0 : m.captured,
|
|
2110
|
+
contentStartOffset: isLSA ? markerLen : void 0,
|
|
2111
|
+
index: (rule.split ?? "at") === "at" ? m.start : m.end,
|
|
2112
|
+
meta: rule.meta,
|
|
2113
|
+
namedCaptures: m.namedCaptures
|
|
2114
|
+
};
|
|
2115
|
+
});
|
|
2116
|
+
if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
|
|
2117
|
+
splitPointsByRule.get(ruleIndex).push(...points);
|
|
2118
|
+
};
|
|
2119
|
+
const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
|
|
2120
|
+
const matches = [];
|
|
2121
|
+
let m = regex.exec(content);
|
|
2122
|
+
while (m !== null) {
|
|
2123
|
+
matches.push({
|
|
2124
|
+
captured: usesCapture ? getLastPositionalCapture(m) : void 0,
|
|
2125
|
+
end: m.index + m[0].length,
|
|
2126
|
+
namedCaptures: extractNamedCaptures(m.groups, captureNames),
|
|
2127
|
+
start: m.index
|
|
2128
|
+
});
|
|
2129
|
+
if (m[0].length === 0) regex.lastIndex++;
|
|
2130
|
+
m = regex.exec(content);
|
|
2131
|
+
}
|
|
2132
|
+
return matches;
|
|
2133
|
+
};
|
|
2134
|
+
const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
|
|
2135
|
+
const result = [];
|
|
2136
|
+
rules.forEach((rule, index) => {
|
|
2137
|
+
const points = splitPointsByRule.get(index);
|
|
2138
|
+
if (!points?.length) return;
|
|
2139
|
+
const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
|
|
2140
|
+
if (!debugMetaKey) {
|
|
2141
|
+
result.push(...filtered.map((p) => ({
|
|
2142
|
+
...p,
|
|
2143
|
+
ruleIndex: index
|
|
2144
|
+
})));
|
|
2145
|
+
return;
|
|
2146
|
+
}
|
|
2147
|
+
const debugPatch = buildRuleDebugPatch(index, rule);
|
|
2148
|
+
result.push(...filtered.map((p) => ({
|
|
2149
|
+
...p,
|
|
2150
|
+
meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
|
|
2151
|
+
ruleIndex: index
|
|
2152
|
+
})));
|
|
2153
|
+
});
|
|
2154
|
+
return result;
|
|
2155
|
+
};
|
|
2156
|
+
|
|
1962
2157
|
//#endregion
|
|
1963
2158
|
//#region src/segmentation/textUtils.ts
|
|
1964
2159
|
/**
|
|
@@ -1985,7 +2180,6 @@ const normalizeLineEndings = (content) => {
|
|
|
1985
2180
|
*
|
|
1986
2181
|
* @module segmenter
|
|
1987
2182
|
*/
|
|
1988
|
-
const MAX_REGEX_ITERATIONS = 1e5;
|
|
1989
2183
|
/**
|
|
1990
2184
|
* Builds a concatenated content string and page mapping from input pages.
|
|
1991
2185
|
*
|
|
@@ -2082,7 +2276,7 @@ const dedupeSplitPoints = (splitPoints) => {
|
|
|
2082
2276
|
const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
|
|
2083
2277
|
if (segments.length > 0 || pages.length === 0) return segments;
|
|
2084
2278
|
const firstPage = pages[0];
|
|
2085
|
-
const lastPage = pages
|
|
2279
|
+
const lastPage = pages.at(-1);
|
|
2086
2280
|
const joinChar = pageJoiner === "newline" ? "\n" : " ";
|
|
2087
2281
|
const allContent = normalizedContent.join(joinChar).trim();
|
|
2088
2282
|
if (!allContent) return segments;
|
|
@@ -2093,7 +2287,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
|
|
|
2093
2287
|
if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
|
|
2094
2288
|
return [initialSeg];
|
|
2095
2289
|
};
|
|
2096
|
-
const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
|
|
2290
|
+
const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
|
|
2097
2291
|
logger?.debug?.("[segmenter] collecting split points from rules", {
|
|
2098
2292
|
contentLength: matchContent.length,
|
|
2099
2293
|
ruleCount: rules.length
|
|
@@ -2106,124 +2300,9 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
|
|
|
2106
2300
|
standaloneCount: standaloneRules.length
|
|
2107
2301
|
});
|
|
2108
2302
|
const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
|
|
2109
|
-
if (combinableRules.length > 0)
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
return {
|
|
2113
|
-
prefix,
|
|
2114
|
-
source: `(?<${prefix}>${built.regex.source})`,
|
|
2115
|
-
...built
|
|
2116
|
-
};
|
|
2117
|
-
});
|
|
2118
|
-
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
2119
|
-
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
2120
|
-
logger?.debug?.("[segmenter] combined regex built", {
|
|
2121
|
-
combinableRuleCount: combinableRules.length,
|
|
2122
|
-
combinedSourceLength: combinedSource.length
|
|
2123
|
-
});
|
|
2124
|
-
combinedRegex.lastIndex = 0;
|
|
2125
|
-
let m = combinedRegex.exec(matchContent);
|
|
2126
|
-
let iterationCount = 0;
|
|
2127
|
-
while (m !== null) {
|
|
2128
|
-
iterationCount++;
|
|
2129
|
-
if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
|
|
2130
|
-
if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
|
|
2131
|
-
iterationCount,
|
|
2132
|
-
lastIndex: combinedRegex.lastIndex,
|
|
2133
|
-
matchLength: m[0].length,
|
|
2134
|
-
matchPosition: m.index
|
|
2135
|
-
});
|
|
2136
|
-
const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
|
|
2137
|
-
if (matchedRuleIndex !== -1) {
|
|
2138
|
-
const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
|
|
2139
|
-
const ruleInfo = ruleRegexes[matchedRuleIndex];
|
|
2140
|
-
const namedCaptures = {};
|
|
2141
|
-
if (m.groups) {
|
|
2142
|
-
for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
|
|
2143
|
-
const cleanName = prefixedName.slice(prefix.length);
|
|
2144
|
-
namedCaptures[cleanName] = m.groups[prefixedName];
|
|
2145
|
-
}
|
|
2146
|
-
}
|
|
2147
|
-
let capturedContent;
|
|
2148
|
-
let contentStartOffset;
|
|
2149
|
-
if (ruleInfo.usesLineStartsAfter) {
|
|
2150
|
-
capturedContent = m.groups?.[`${prefix}__content`];
|
|
2151
|
-
if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
|
|
2152
|
-
}
|
|
2153
|
-
const start = m.index;
|
|
2154
|
-
const end = m.index + m[0].length;
|
|
2155
|
-
const pageId = pageMap.getId(start);
|
|
2156
|
-
if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
|
|
2157
|
-
const sp = {
|
|
2158
|
-
capturedContent: void 0,
|
|
2159
|
-
contentStartOffset,
|
|
2160
|
-
index: (rule.split ?? "at") === "at" ? start : end,
|
|
2161
|
-
meta: rule.meta,
|
|
2162
|
-
namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
|
|
2163
|
-
};
|
|
2164
|
-
if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
|
|
2165
|
-
splitPointsByRule.get(originalIndex).push(sp);
|
|
2166
|
-
}
|
|
2167
|
-
}
|
|
2168
|
-
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
2169
|
-
m = combinedRegex.exec(matchContent);
|
|
2170
|
-
}
|
|
2171
|
-
}
|
|
2172
|
-
const collectSplitPointsFromRule = (rule, ruleIndex) => {
|
|
2173
|
-
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
2174
|
-
const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
|
|
2175
|
-
const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
|
|
2176
|
-
const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
|
|
2177
|
-
return {
|
|
2178
|
-
capturedContent: isLineStartsAfter ? void 0 : m.captured,
|
|
2179
|
-
contentStartOffset: isLineStartsAfter ? markerLength : void 0,
|
|
2180
|
-
index: (rule.split ?? "at") === "at" ? m.start : m.end,
|
|
2181
|
-
meta: rule.meta,
|
|
2182
|
-
namedCaptures: m.namedCaptures
|
|
2183
|
-
};
|
|
2184
|
-
});
|
|
2185
|
-
if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
|
|
2186
|
-
splitPointsByRule.get(ruleIndex).push(...points);
|
|
2187
|
-
};
|
|
2188
|
-
standaloneRules.forEach((rule) => {
|
|
2189
|
-
collectSplitPointsFromRule(rule, rules.indexOf(rule));
|
|
2190
|
-
});
|
|
2191
|
-
const finalSplitPoints = [];
|
|
2192
|
-
rules.forEach((rule, index) => {
|
|
2193
|
-
const points = splitPointsByRule.get(index);
|
|
2194
|
-
if (!points || points.length === 0) return;
|
|
2195
|
-
let filtered = points;
|
|
2196
|
-
if (rule.occurrence === "first") filtered = [points[0]];
|
|
2197
|
-
else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
|
|
2198
|
-
finalSplitPoints.push(...filtered);
|
|
2199
|
-
});
|
|
2200
|
-
return finalSplitPoints;
|
|
2201
|
-
};
|
|
2202
|
-
/**
|
|
2203
|
-
* Executes a regex against content and extracts match results with capture information.
|
|
2204
|
-
*
|
|
2205
|
-
* @param content - Full content string to search
|
|
2206
|
-
* @param regex - Compiled regex with 'g' flag
|
|
2207
|
-
* @param usesCapture - Whether to extract captured content
|
|
2208
|
-
* @param captureNames - Names of expected named capture groups
|
|
2209
|
-
* @returns Array of match results with positions and captures
|
|
2210
|
-
*/
|
|
2211
|
-
const findMatches = (content, regex, usesCapture, captureNames) => {
|
|
2212
|
-
const matches = [];
|
|
2213
|
-
regex.lastIndex = 0;
|
|
2214
|
-
let m = regex.exec(content);
|
|
2215
|
-
while (m !== null) {
|
|
2216
|
-
const result = {
|
|
2217
|
-
end: m.index + m[0].length,
|
|
2218
|
-
start: m.index
|
|
2219
|
-
};
|
|
2220
|
-
result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
|
|
2221
|
-
if (usesCapture) result.captured = getLastPositionalCapture(m);
|
|
2222
|
-
matches.push(result);
|
|
2223
|
-
if (m[0].length === 0) regex.lastIndex++;
|
|
2224
|
-
m = regex.exec(content);
|
|
2225
|
-
}
|
|
2226
|
-
return matches;
|
|
2303
|
+
if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
|
|
2304
|
+
for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
|
|
2305
|
+
return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
|
|
2227
2306
|
};
|
|
2228
2307
|
/**
|
|
2229
2308
|
* Finds page breaks within a given offset range using binary search.
|
|
@@ -2326,6 +2405,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
|
|
|
2326
2405
|
*/
|
|
2327
2406
|
const segmentPages = (pages, options) => {
|
|
2328
2407
|
const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
|
|
2408
|
+
const debug = resolveDebugConfig(options.debug);
|
|
2409
|
+
const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
|
|
2329
2410
|
logger?.info?.("[segmenter] starting segmentation", {
|
|
2330
2411
|
breakpointCount: breakpoints.length,
|
|
2331
2412
|
maxPages,
|
|
@@ -2339,7 +2420,7 @@ const segmentPages = (pages, options) => {
|
|
|
2339
2420
|
pageIds: pageMap.pageIds,
|
|
2340
2421
|
totalContentLength: matchContent.length
|
|
2341
2422
|
});
|
|
2342
|
-
const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
|
|
2423
|
+
const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
|
|
2343
2424
|
const unique = dedupeSplitPoints(splitPoints);
|
|
2344
2425
|
logger?.debug?.("[segmenter] split points collected", {
|
|
2345
2426
|
rawSplitPoints: splitPoints.length,
|
|
@@ -2358,7 +2439,7 @@ const segmentPages = (pages, options) => {
|
|
|
2358
2439
|
if (maxPages >= 0 && breakpoints.length) {
|
|
2359
2440
|
logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
|
|
2360
2441
|
const patternProcessor = (p) => processPattern(p, false).pattern;
|
|
2361
|
-
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
|
|
2442
|
+
const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
|
|
2362
2443
|
logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
|
|
2363
2444
|
return result;
|
|
2364
2445
|
}
|
|
@@ -2410,7 +2491,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
|
|
|
2410
2491
|
const result = [];
|
|
2411
2492
|
for (let i = 0; i < splitPoints.length; i++) {
|
|
2412
2493
|
const sp = splitPoints[i];
|
|
2413
|
-
const end =
|
|
2494
|
+
const end = splitPoints[i + 1]?.index ?? content.length;
|
|
2414
2495
|
const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
|
|
2415
2496
|
if (s) result.push(s);
|
|
2416
2497
|
}
|
|
@@ -2434,29 +2515,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
|
|
|
2434
2515
|
};
|
|
2435
2516
|
|
|
2436
2517
|
//#endregion
|
|
2437
|
-
//#region src/analysis.ts
|
|
2438
|
-
const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
|
|
2439
|
-
const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
|
|
2440
|
-
const computeSpecificity = (pattern) => {
|
|
2441
|
-
const tokenCount = countTokenMarkers(pattern);
|
|
2442
|
-
return {
|
|
2443
|
-
literalLen: stripWhitespacePlaceholders(pattern).length,
|
|
2444
|
-
tokenCount
|
|
2445
|
-
};
|
|
2446
|
-
};
|
|
2447
|
-
const DEFAULT_OPTIONS = {
|
|
2448
|
-
includeFirstWordFallback: true,
|
|
2449
|
-
lineFilter: void 0,
|
|
2450
|
-
maxExamples: 1,
|
|
2451
|
-
minCount: 3,
|
|
2452
|
-
minLineLength: 6,
|
|
2453
|
-
normalizeArabicDiacritics: true,
|
|
2454
|
-
prefixChars: 60,
|
|
2455
|
-
prefixMatchers: [/^#+/u],
|
|
2456
|
-
sortBy: "specificity",
|
|
2457
|
-
topK: 40,
|
|
2458
|
-
whitespace: "regex"
|
|
2459
|
-
};
|
|
2518
|
+
//#region src/analysis/shared.ts
|
|
2460
2519
|
const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
|
|
2461
2520
|
const TOKEN_PRIORITY_ORDER$1 = [
|
|
2462
2521
|
"basmalah",
|
|
@@ -2497,30 +2556,7 @@ const appendWs = (out, mode) => {
|
|
|
2497
2556
|
if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
|
|
2498
2557
|
return out.endsWith("\\s*") ? out : `${out}\\s*`;
|
|
2499
2558
|
};
|
|
2500
|
-
const
|
|
2501
|
-
let matchedAny = false;
|
|
2502
|
-
let currentPos = pos;
|
|
2503
|
-
let currentOut = out;
|
|
2504
|
-
for (const re of prefixMatchers) {
|
|
2505
|
-
if (currentPos >= s.length) break;
|
|
2506
|
-
const m = re.exec(s.slice(currentPos));
|
|
2507
|
-
if (!m || m.index !== 0 || !m[0]) continue;
|
|
2508
|
-
currentOut += escapeSignatureLiteral(m[0]);
|
|
2509
|
-
currentPos += m[0].length;
|
|
2510
|
-
matchedAny = true;
|
|
2511
|
-
const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
|
|
2512
|
-
if (wsAfter) {
|
|
2513
|
-
currentPos += wsAfter[0].length;
|
|
2514
|
-
currentOut = appendWs(currentOut, whitespace);
|
|
2515
|
-
}
|
|
2516
|
-
}
|
|
2517
|
-
return {
|
|
2518
|
-
matchedAny,
|
|
2519
|
-
out: currentOut,
|
|
2520
|
-
pos: currentPos
|
|
2521
|
-
};
|
|
2522
|
-
};
|
|
2523
|
-
const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
|
|
2559
|
+
const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
|
|
2524
2560
|
let best = null;
|
|
2525
2561
|
for (const { token, re } of compiled) {
|
|
2526
2562
|
re.lastIndex = pos;
|
|
@@ -2534,132 +2570,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
|
|
|
2534
2570
|
if (best?.token === "rumuz") {
|
|
2535
2571
|
const end = pos + best.text.length;
|
|
2536
2572
|
const next = end < s.length ? s[end] : "";
|
|
2537
|
-
if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
|
|
2573
|
+
if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
|
|
2538
2574
|
}
|
|
2539
2575
|
return best;
|
|
2540
2576
|
};
|
|
2541
|
-
const
|
|
2577
|
+
const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
|
|
2578
|
+
const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
|
|
2579
|
+
|
|
2580
|
+
//#endregion
|
|
2581
|
+
//#region src/analysis/line-starts.ts
|
|
2582
|
+
const resolveOptions$1 = (options = {}) => ({
|
|
2583
|
+
includeFirstWordFallback: options.includeFirstWordFallback ?? true,
|
|
2584
|
+
lineFilter: options.lineFilter,
|
|
2585
|
+
maxExamples: options.maxExamples ?? 1,
|
|
2586
|
+
minCount: options.minCount ?? 3,
|
|
2587
|
+
minLineLength: options.minLineLength ?? 6,
|
|
2588
|
+
normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
|
|
2589
|
+
prefixChars: options.prefixChars ?? 60,
|
|
2590
|
+
prefixMatchers: options.prefixMatchers ?? [/^#+/u],
|
|
2591
|
+
sortBy: options.sortBy ?? "specificity",
|
|
2592
|
+
topK: options.topK ?? 40,
|
|
2593
|
+
whitespace: options.whitespace ?? "regex"
|
|
2594
|
+
});
|
|
2595
|
+
const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
|
|
2596
|
+
const computeSpecificity = (pattern) => ({
|
|
2597
|
+
literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
|
|
2598
|
+
tokenCount: countTokenMarkers(pattern)
|
|
2599
|
+
});
|
|
2600
|
+
const compareBySpecificity = (a, b) => {
|
|
2601
|
+
const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
|
|
2602
|
+
return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
|
|
2603
|
+
};
|
|
2604
|
+
const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
|
|
2605
|
+
/** Remove trailing whitespace placeholders */
|
|
2606
|
+
const trimTrailingWs = (out, mode) => {
|
|
2607
|
+
const suffix = mode === "regex" ? "\\s*" : " ";
|
|
2608
|
+
while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
|
|
2609
|
+
return out;
|
|
2610
|
+
};
|
|
2611
|
+
/** Try to extract first word for fallback */
|
|
2612
|
+
const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
|
|
2613
|
+
/** Consume prefix matchers at current position */
|
|
2614
|
+
const consumePrefixes = (s, pos, out, matchers, ws) => {
|
|
2615
|
+
let matched = false;
|
|
2616
|
+
for (const re of matchers) {
|
|
2617
|
+
if (pos >= s.length) break;
|
|
2618
|
+
const m = re.exec(s.slice(pos));
|
|
2619
|
+
if (!m?.index && m?.[0]) {
|
|
2620
|
+
out += escapeSignatureLiteral(m[0]);
|
|
2621
|
+
pos += m[0].length;
|
|
2622
|
+
matched = true;
|
|
2623
|
+
const wsm = /^[ \t]+/u.exec(s.slice(pos));
|
|
2624
|
+
if (wsm) {
|
|
2625
|
+
pos += wsm[0].length;
|
|
2626
|
+
out = appendWs(out, ws);
|
|
2627
|
+
}
|
|
2628
|
+
}
|
|
2629
|
+
}
|
|
2630
|
+
return {
|
|
2631
|
+
matched,
|
|
2632
|
+
out,
|
|
2633
|
+
pos
|
|
2634
|
+
};
|
|
2635
|
+
};
|
|
2636
|
+
/** Try to match a token at current position and append to signature */
|
|
2637
|
+
const tryMatchToken = (s, pos, out, compiled) => {
|
|
2638
|
+
const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
|
|
2639
|
+
if (!best) return {
|
|
2640
|
+
matched: false,
|
|
2641
|
+
out,
|
|
2642
|
+
pos
|
|
2643
|
+
};
|
|
2644
|
+
return {
|
|
2645
|
+
matched: true,
|
|
2646
|
+
out: `${out}{{${best.token}}}`,
|
|
2647
|
+
pos: pos + best.text.length
|
|
2648
|
+
};
|
|
2649
|
+
};
|
|
2650
|
+
/** Try to match a delimiter at current position */
|
|
2651
|
+
const tryMatchDelimiter = (s, pos, out) => {
|
|
2652
|
+
const ch = s[pos];
|
|
2653
|
+
if (!ch || !isCommonDelimiter(ch)) return {
|
|
2654
|
+
matched: false,
|
|
2655
|
+
out,
|
|
2656
|
+
pos
|
|
2657
|
+
};
|
|
2658
|
+
return {
|
|
2659
|
+
matched: true,
|
|
2660
|
+
out: out + escapeSignatureLiteral(ch),
|
|
2661
|
+
pos: pos + 1
|
|
2662
|
+
};
|
|
2663
|
+
};
|
|
2664
|
+
/** Skip whitespace at position */
|
|
2665
|
+
const skipWhitespace = (s, pos, out, ws) => {
|
|
2666
|
+
const m = /^[ \t]+/u.exec(s.slice(pos));
|
|
2667
|
+
if (!m) return {
|
|
2668
|
+
out,
|
|
2669
|
+
pos,
|
|
2670
|
+
skipped: false
|
|
2671
|
+
};
|
|
2672
|
+
return {
|
|
2673
|
+
out: appendWs(out, ws),
|
|
2674
|
+
pos: pos + m[0].length,
|
|
2675
|
+
skipped: true
|
|
2676
|
+
};
|
|
2677
|
+
};
|
|
2678
|
+
const tokenizeLineStart = (line, tokenNames, opts) => {
|
|
2542
2679
|
const trimmed = collapseWhitespace(line);
|
|
2543
2680
|
if (!trimmed) return null;
|
|
2544
|
-
const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
|
|
2545
|
-
let pos = 0;
|
|
2546
|
-
let out = "";
|
|
2547
|
-
let matchedAny = false;
|
|
2548
|
-
let matchedToken = false;
|
|
2681
|
+
const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
|
|
2549
2682
|
const compiled = compileTokenRegexes(tokenNames);
|
|
2550
|
-
|
|
2551
|
-
const
|
|
2552
|
-
|
|
2553
|
-
|
|
2554
|
-
|
|
2555
|
-
|
|
2556
|
-
|
|
2557
|
-
|
|
2558
|
-
|
|
2559
|
-
|
|
2560
|
-
const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
|
|
2561
|
-
if (wsMatch) {
|
|
2562
|
-
pos += wsMatch[0].length;
|
|
2563
|
-
out = appendWs(out, whitespace);
|
|
2683
|
+
let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
|
|
2684
|
+
const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
|
|
2685
|
+
pos = prefix.pos;
|
|
2686
|
+
out = prefix.out;
|
|
2687
|
+
matchedAny = prefix.matched;
|
|
2688
|
+
while (steps < 6 && pos < s.length) {
|
|
2689
|
+
const ws = skipWhitespace(s, pos, out, opts.whitespace);
|
|
2690
|
+
if (ws.skipped) {
|
|
2691
|
+
pos = ws.pos;
|
|
2692
|
+
out = ws.out;
|
|
2564
2693
|
continue;
|
|
2565
2694
|
}
|
|
2566
|
-
const
|
|
2567
|
-
if (
|
|
2568
|
-
|
|
2569
|
-
out
|
|
2570
|
-
matchedAny = true;
|
|
2571
|
-
|
|
2572
|
-
pos += best.text.length;
|
|
2573
|
-
tokenSteps++;
|
|
2695
|
+
const tok = tryMatchToken(s, pos, out, compiled);
|
|
2696
|
+
if (tok.matched) {
|
|
2697
|
+
pos = tok.pos;
|
|
2698
|
+
out = tok.out;
|
|
2699
|
+
matchedAny = matchedToken = true;
|
|
2700
|
+
steps++;
|
|
2574
2701
|
continue;
|
|
2575
2702
|
}
|
|
2576
2703
|
if (matchedAny) {
|
|
2577
|
-
const
|
|
2578
|
-
if (
|
|
2579
|
-
|
|
2580
|
-
|
|
2704
|
+
const delim = tryMatchDelimiter(s, pos, out);
|
|
2705
|
+
if (delim.matched) {
|
|
2706
|
+
pos = delim.pos;
|
|
2707
|
+
out = delim.out;
|
|
2581
2708
|
continue;
|
|
2582
2709
|
}
|
|
2583
2710
|
}
|
|
2584
2711
|
if (matchedAny) {
|
|
2585
|
-
if (includeFirstWordFallback && !matchedToken) {
|
|
2586
|
-
const
|
|
2587
|
-
if (
|
|
2588
|
-
|
|
2589
|
-
|
|
2712
|
+
if (opts.includeFirstWordFallback && !matchedToken) {
|
|
2713
|
+
const word$1 = extractFirstWord(s.slice(pos));
|
|
2714
|
+
if (word$1) {
|
|
2715
|
+
out += escapeSignatureLiteral(word$1);
|
|
2716
|
+
steps++;
|
|
2717
|
+
}
|
|
2590
2718
|
}
|
|
2591
2719
|
break;
|
|
2592
2720
|
}
|
|
2593
|
-
if (!includeFirstWordFallback) return null;
|
|
2594
|
-
const
|
|
2595
|
-
if (!
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
return
|
|
2721
|
+
if (!opts.includeFirstWordFallback) return null;
|
|
2722
|
+
const word = extractFirstWord(s.slice(pos));
|
|
2723
|
+
if (!word) return null;
|
|
2724
|
+
return escapeSignatureLiteral(word);
|
|
2725
|
+
}
|
|
2726
|
+
return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
|
|
2727
|
+
};
|
|
2728
|
+
const processLine = (line, pageId, tokenPriority, opts, acc) => {
|
|
2729
|
+
const trimmed = collapseWhitespace(line);
|
|
2730
|
+
if (trimmed.length < opts.minLineLength) return;
|
|
2731
|
+
if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
|
|
2732
|
+
const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
|
|
2733
|
+
if (!sig) return;
|
|
2734
|
+
const entry = acc.get(sig);
|
|
2735
|
+
if (!entry) acc.set(sig, {
|
|
2736
|
+
count: 1,
|
|
2737
|
+
examples: [{
|
|
2738
|
+
line: trimmed,
|
|
2739
|
+
pageId
|
|
2740
|
+
}]
|
|
2741
|
+
});
|
|
2742
|
+
else {
|
|
2743
|
+
entry.count++;
|
|
2744
|
+
if (entry.examples.length < opts.maxExamples) entry.examples.push({
|
|
2745
|
+
line: trimmed,
|
|
2746
|
+
pageId
|
|
2747
|
+
});
|
|
2748
|
+
}
|
|
2749
|
+
};
|
|
2750
|
+
const processPage = (page, tokenPriority, opts, acc) => {
|
|
2751
|
+
for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
|
|
2604
2752
|
};
|
|
2605
2753
|
/**
|
|
2606
2754
|
* Analyze pages and return the most common line-start patterns (top K).
|
|
2607
|
-
*
|
|
2608
|
-
* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
|
|
2609
|
-
* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
|
|
2610
2755
|
*/
|
|
2611
2756
|
const analyzeCommonLineStarts = (pages, options = {}) => {
|
|
2612
|
-
const
|
|
2613
|
-
...DEFAULT_OPTIONS,
|
|
2614
|
-
...options,
|
|
2615
|
-
lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
|
|
2616
|
-
prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
|
|
2617
|
-
whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
|
|
2618
|
-
};
|
|
2757
|
+
const opts = resolveOptions$1(options);
|
|
2619
2758
|
const tokenPriority = buildTokenPriority();
|
|
2620
|
-
const
|
|
2621
|
-
for (const page of pages)
|
|
2622
|
-
|
|
2623
|
-
|
|
2624
|
-
|
|
2625
|
-
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
|
|
2633
|
-
|
|
2634
|
-
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2759
|
+
const acc = /* @__PURE__ */ new Map();
|
|
2760
|
+
for (const page of pages) processPage(page, tokenPriority, opts, acc);
|
|
2761
|
+
const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
|
|
2762
|
+
return [...acc.entries()].map(([pattern, v]) => ({
|
|
2763
|
+
count: v.count,
|
|
2764
|
+
examples: v.examples,
|
|
2765
|
+
pattern
|
|
2766
|
+
})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
|
|
2767
|
+
};
|
|
2768
|
+
|
|
2769
|
+
//#endregion
|
|
2770
|
+
//#region src/analysis/repeating-sequences.ts
|
|
2771
|
+
const resolveOptions = (options) => {
|
|
2772
|
+
const minElements = Math.max(1, options?.minElements ?? 1);
|
|
2773
|
+
return {
|
|
2774
|
+
contextChars: options?.contextChars ?? 50,
|
|
2775
|
+
maxElements: Math.max(minElements, options?.maxElements ?? 3),
|
|
2776
|
+
maxExamples: options?.maxExamples ?? 3,
|
|
2777
|
+
maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
|
|
2778
|
+
minCount: Math.max(1, options?.minCount ?? 3),
|
|
2779
|
+
minElements,
|
|
2780
|
+
normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
|
|
2781
|
+
requireToken: options?.requireToken ?? true,
|
|
2782
|
+
topK: Math.max(1, options?.topK ?? 20),
|
|
2783
|
+
whitespace: options?.whitespace ?? "regex"
|
|
2784
|
+
};
|
|
2785
|
+
};
|
|
2786
|
+
/** Creates a cursor that tracks position in both normalized and raw text */
|
|
2787
|
+
const createRawCursor = (text, normalize) => {
|
|
2788
|
+
let rawPos = 0;
|
|
2789
|
+
return {
|
|
2790
|
+
advance(normalizedLen) {
|
|
2791
|
+
if (!normalize) {
|
|
2792
|
+
const chunk = text.slice(rawPos, rawPos + normalizedLen);
|
|
2793
|
+
rawPos += normalizedLen;
|
|
2794
|
+
return chunk;
|
|
2643
2795
|
}
|
|
2796
|
+
const start = rawPos;
|
|
2797
|
+
let matchedLen = 0;
|
|
2798
|
+
while (matchedLen < normalizedLen && rawPos < text.length) {
|
|
2799
|
+
if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
|
|
2800
|
+
rawPos++;
|
|
2801
|
+
}
|
|
2802
|
+
while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
|
|
2803
|
+
return text.slice(start, rawPos);
|
|
2804
|
+
},
|
|
2805
|
+
get pos() {
|
|
2806
|
+
return rawPos;
|
|
2807
|
+
}
|
|
2808
|
+
};
|
|
2809
|
+
};
|
|
2810
|
+
/** Scans text and produces a stream of tokens and literals. */
|
|
2811
|
+
const tokenizeContent = (text, normalize) => {
|
|
2812
|
+
const normalized = normalize ? stripArabicDiacritics(text) : text;
|
|
2813
|
+
const compiled = compileTokenRegexes(buildTokenPriority());
|
|
2814
|
+
const cursor = createRawCursor(text, normalize);
|
|
2815
|
+
const items = [];
|
|
2816
|
+
let pos = 0;
|
|
2817
|
+
while (pos < normalized.length) {
|
|
2818
|
+
const ws = /^\s+/u.exec(normalized.slice(pos));
|
|
2819
|
+
if (ws) {
|
|
2820
|
+
pos += ws[0].length;
|
|
2821
|
+
cursor.advance(ws[0].length);
|
|
2822
|
+
continue;
|
|
2823
|
+
}
|
|
2824
|
+
const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
|
|
2825
|
+
if (token) {
|
|
2826
|
+
const raw = cursor.advance(token.text.length);
|
|
2827
|
+
items.push({
|
|
2828
|
+
end: cursor.pos,
|
|
2829
|
+
raw,
|
|
2830
|
+
start: cursor.pos - raw.length,
|
|
2831
|
+
text: `{{${token.token}}}`,
|
|
2832
|
+
type: "token"
|
|
2833
|
+
});
|
|
2834
|
+
pos += token.text.length;
|
|
2835
|
+
continue;
|
|
2836
|
+
}
|
|
2837
|
+
if (isCommonDelimiter(normalized[pos])) {
|
|
2838
|
+
const raw = cursor.advance(1);
|
|
2839
|
+
items.push({
|
|
2840
|
+
end: cursor.pos,
|
|
2841
|
+
raw,
|
|
2842
|
+
start: cursor.pos - 1,
|
|
2843
|
+
text: escapeSignatureLiteral(normalized[pos]),
|
|
2844
|
+
type: "literal"
|
|
2845
|
+
});
|
|
2846
|
+
pos++;
|
|
2847
|
+
continue;
|
|
2848
|
+
}
|
|
2849
|
+
const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
|
|
2850
|
+
if (word) {
|
|
2851
|
+
const raw = cursor.advance(word[0].length);
|
|
2852
|
+
items.push({
|
|
2853
|
+
end: cursor.pos,
|
|
2854
|
+
raw,
|
|
2855
|
+
start: cursor.pos - raw.length,
|
|
2856
|
+
text: escapeSignatureLiteral(word[0]),
|
|
2857
|
+
type: "literal"
|
|
2858
|
+
});
|
|
2859
|
+
pos += word[0].length;
|
|
2860
|
+
continue;
|
|
2644
2861
|
}
|
|
2862
|
+
cursor.advance(1);
|
|
2863
|
+
pos++;
|
|
2645
2864
|
}
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
|
|
2650
|
-
|
|
2651
|
-
|
|
2652
|
-
|
|
2865
|
+
return items;
|
|
2866
|
+
};
|
|
2867
|
+
/** Build pattern string from window items */
|
|
2868
|
+
const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
|
|
2869
|
+
/** Check if window contains at least one token */
|
|
2870
|
+
const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
|
|
2871
|
+
/** Compute token count and literal length for a window */
|
|
2872
|
+
const computeWindowStats = (window) => {
|
|
2873
|
+
let tokenCount = 0, literalLen = 0;
|
|
2874
|
+
for (const item of window) if (item.type === "token") tokenCount++;
|
|
2875
|
+
else literalLen += item.text.length;
|
|
2876
|
+
return {
|
|
2877
|
+
literalLen,
|
|
2878
|
+
tokenCount
|
|
2653
2879
|
};
|
|
2654
|
-
|
|
2655
|
-
|
|
2656
|
-
|
|
2880
|
+
};
|
|
2881
|
+
/** Build example from page content and window */
|
|
2882
|
+
const buildExample = (page, window, contextChars) => {
|
|
2883
|
+
const start = window[0].start;
|
|
2884
|
+
const end = window.at(-1).end;
|
|
2885
|
+
const ctxStart = Math.max(0, start - contextChars);
|
|
2886
|
+
const ctxEnd = Math.min(page.content.length, end + contextChars);
|
|
2887
|
+
return {
|
|
2888
|
+
context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
|
|
2889
|
+
pageId: page.id,
|
|
2890
|
+
startIndices: window.map((w) => w.start),
|
|
2891
|
+
text: page.content.slice(start, end)
|
|
2657
2892
|
};
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2893
|
+
};
|
|
2894
|
+
/** Extract N-grams from a single page */
|
|
2895
|
+
const extractPageNgrams = (page, items, opts, stats) => {
|
|
2896
|
+
for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
|
|
2897
|
+
const window = items.slice(i, i + n);
|
|
2898
|
+
if (opts.requireToken && !hasTokenInWindow(window)) continue;
|
|
2899
|
+
const pattern = buildPattern(window, opts.whitespace);
|
|
2900
|
+
if (!stats.has(pattern)) {
|
|
2901
|
+
if (stats.size >= opts.maxUniquePatterns) continue;
|
|
2902
|
+
stats.set(pattern, {
|
|
2903
|
+
count: 0,
|
|
2904
|
+
examples: [],
|
|
2905
|
+
...computeWindowStats(window)
|
|
2906
|
+
});
|
|
2907
|
+
}
|
|
2908
|
+
const entry = stats.get(pattern);
|
|
2909
|
+
entry.count++;
|
|
2910
|
+
if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
|
|
2911
|
+
}
|
|
2912
|
+
};
|
|
2913
|
+
/**
|
|
2914
|
+
* Analyze pages for commonly repeating word sequences.
|
|
2915
|
+
*
|
|
2916
|
+
* Use for continuous text without line breaks. For line-based analysis,
|
|
2917
|
+
* use `analyzeCommonLineStarts()` instead.
|
|
2918
|
+
*/
|
|
2919
|
+
const analyzeRepeatingSequences = (pages, options) => {
|
|
2920
|
+
const opts = resolveOptions(options);
|
|
2921
|
+
const stats = /* @__PURE__ */ new Map();
|
|
2922
|
+
for (const page of pages) {
|
|
2923
|
+
if (!page.content) continue;
|
|
2924
|
+
extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
|
|
2925
|
+
}
|
|
2926
|
+
return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
|
|
2927
|
+
count: s.count,
|
|
2928
|
+
examples: s.examples,
|
|
2661
2929
|
pattern
|
|
2662
|
-
}))
|
|
2930
|
+
}));
|
|
2663
2931
|
};
|
|
2664
2932
|
|
|
2665
2933
|
//#endregion
|
|
@@ -2831,5 +3099,524 @@ const analyzeTextForRule = (text) => {
|
|
|
2831
3099
|
};
|
|
2832
3100
|
|
|
2833
3101
|
//#endregion
|
|
2834
|
-
|
|
3102
|
+
//#region src/recovery.ts
|
|
3103
|
+
const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
|
|
3104
|
+
const normalizeForCompare = (s, mode) => {
|
|
3105
|
+
if (mode === "none") return s;
|
|
3106
|
+
let out = s;
|
|
3107
|
+
if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
|
|
3108
|
+
out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
|
|
3109
|
+
return out;
|
|
3110
|
+
};
|
|
3111
|
+
const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
|
|
3112
|
+
const buildFixedOptions = (options, selectedRuleIndices) => {
|
|
3113
|
+
const fixedRules = (options.rules ?? []).map((r, idx) => {
|
|
3114
|
+
if (!selectedRuleIndices.has(idx)) return r;
|
|
3115
|
+
if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
|
|
3116
|
+
const { lineStartsAfter, ...rest } = r;
|
|
3117
|
+
return {
|
|
3118
|
+
...rest,
|
|
3119
|
+
lineStartsWith: lineStartsAfter
|
|
3120
|
+
};
|
|
3121
|
+
});
|
|
3122
|
+
return {
|
|
3123
|
+
...options,
|
|
3124
|
+
rules: fixedRules
|
|
3125
|
+
};
|
|
3126
|
+
};
|
|
3127
|
+
const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
|
|
3128
|
+
const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
|
|
3129
|
+
const parts = [];
|
|
3130
|
+
for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
|
|
3131
|
+
const matchContent = parts.join("\n");
|
|
3132
|
+
if (pageJoiner === "newline") return {
|
|
3133
|
+
matchContent,
|
|
3134
|
+
outputContent: matchContent
|
|
3135
|
+
};
|
|
3136
|
+
return {
|
|
3137
|
+
matchContent,
|
|
3138
|
+
outputContent: parts.join(" ")
|
|
3139
|
+
};
|
|
3140
|
+
};
|
|
3141
|
+
const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
|
|
3142
|
+
const rules = options.rules ?? [];
|
|
3143
|
+
const compiled = [];
|
|
3144
|
+
for (const idx of selectedRuleIndices) {
|
|
3145
|
+
const r = rules[idx];
|
|
3146
|
+
if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
|
|
3147
|
+
const { lineStartsAfter, ...rest } = r;
|
|
3148
|
+
const built = buildRuleRegex({
|
|
3149
|
+
...rest,
|
|
3150
|
+
lineStartsWith: lineStartsAfter
|
|
3151
|
+
});
|
|
3152
|
+
compiled.push({
|
|
3153
|
+
ruleIndex: idx,
|
|
3154
|
+
startsWithRegex: new RegExp(built.regex.source, "mu")
|
|
3155
|
+
});
|
|
3156
|
+
}
|
|
3157
|
+
return compiled;
|
|
3158
|
+
};
|
|
3159
|
+
const findUniqueAnchorPos = (outputContent, segmentContent) => {
|
|
3160
|
+
for (const len of [
|
|
3161
|
+
80,
|
|
3162
|
+
60,
|
|
3163
|
+
40,
|
|
3164
|
+
30,
|
|
3165
|
+
20,
|
|
3166
|
+
15
|
|
3167
|
+
]) {
|
|
3168
|
+
const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
|
|
3169
|
+
if (!needle.trim()) continue;
|
|
3170
|
+
const first = outputContent.indexOf(needle);
|
|
3171
|
+
if (first === -1) continue;
|
|
3172
|
+
if (outputContent.indexOf(needle, first + 1) === -1) return first;
|
|
3173
|
+
}
|
|
3174
|
+
return null;
|
|
3175
|
+
};
|
|
3176
|
+
const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
|
|
3177
|
+
const line = matchContent.slice(lineStart);
|
|
3178
|
+
for (const mr of compiledMistaken) {
|
|
3179
|
+
mr.startsWithRegex.lastIndex = 0;
|
|
3180
|
+
const m = mr.startsWithRegex.exec(line);
|
|
3181
|
+
if (!m || m.index !== 0) continue;
|
|
3182
|
+
const markerMatch = m[0];
|
|
3183
|
+
const markerEnd = lineStart + markerMatch.length;
|
|
3184
|
+
if (anchorPos < markerEnd) continue;
|
|
3185
|
+
const gap = matchContent.slice(markerEnd, anchorPos);
|
|
3186
|
+
const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
|
|
3187
|
+
if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
|
|
3188
|
+
return { prefix: recoveredPrefix };
|
|
3189
|
+
}
|
|
3190
|
+
return { reason: "no selected marker pattern matched at anchored line start" };
|
|
3191
|
+
};
|
|
3192
|
+
const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
|
|
3193
|
+
const fromIdx = pageIdToIndex.get(segment.from);
|
|
3194
|
+
const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
|
|
3195
|
+
if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
|
|
3196
|
+
kind: "unresolved",
|
|
3197
|
+
reason: "segment page range not found in pages"
|
|
3198
|
+
};
|
|
3199
|
+
const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
|
|
3200
|
+
if (!segment.content) return {
|
|
3201
|
+
kind: "unresolved",
|
|
3202
|
+
reason: "empty segment content"
|
|
3203
|
+
};
|
|
3204
|
+
const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
|
|
3205
|
+
if (anchorPos === null) return {
|
|
3206
|
+
kind: "unresolved",
|
|
3207
|
+
reason: "could not uniquely anchor segment content in page range"
|
|
3208
|
+
};
|
|
3209
|
+
const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
|
|
3210
|
+
const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
|
|
3211
|
+
if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
|
|
3212
|
+
kind: "unresolved",
|
|
3213
|
+
reason: found.reason
|
|
3214
|
+
};
|
|
3215
|
+
return {
|
|
3216
|
+
kind: "recovered",
|
|
3217
|
+
recoveredContent: `${found.prefix}${segment.content}`,
|
|
3218
|
+
recoveredPrefix: found.prefix
|
|
3219
|
+
};
|
|
3220
|
+
};
|
|
3221
|
+
const resolveRuleIndicesSelector = (rules, indicesIn) => {
|
|
3222
|
+
const errors = [];
|
|
3223
|
+
const indices = /* @__PURE__ */ new Set();
|
|
3224
|
+
for (const idx of indicesIn) {
|
|
3225
|
+
if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
|
|
3226
|
+
errors.push(`Selector index out of range: ${idx}`);
|
|
3227
|
+
continue;
|
|
3228
|
+
}
|
|
3229
|
+
const rule = rules[idx];
|
|
3230
|
+
if (!rule || !("lineStartsAfter" in rule)) {
|
|
3231
|
+
errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
|
|
3232
|
+
continue;
|
|
3233
|
+
}
|
|
3234
|
+
indices.add(idx);
|
|
3235
|
+
}
|
|
3236
|
+
return {
|
|
3237
|
+
errors,
|
|
3238
|
+
indices,
|
|
3239
|
+
warnings: []
|
|
3240
|
+
};
|
|
3241
|
+
};
|
|
3242
|
+
const resolvePredicateSelector = (rules, predicate) => {
|
|
3243
|
+
const errors = [];
|
|
3244
|
+
const warnings = [];
|
|
3245
|
+
const indices = /* @__PURE__ */ new Set();
|
|
3246
|
+
rules.forEach((r, i) => {
|
|
3247
|
+
try {
|
|
3248
|
+
if (!predicate(r, i)) return;
|
|
3249
|
+
if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
|
|
3250
|
+
indices.add(i);
|
|
3251
|
+
return;
|
|
3252
|
+
}
|
|
3253
|
+
warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
|
|
3254
|
+
} catch (e) {
|
|
3255
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
3256
|
+
errors.push(`Predicate threw at rule ${i}: ${msg}`);
|
|
3257
|
+
}
|
|
3258
|
+
});
|
|
3259
|
+
if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
|
|
3260
|
+
return {
|
|
3261
|
+
errors,
|
|
3262
|
+
indices,
|
|
3263
|
+
warnings
|
|
3264
|
+
};
|
|
3265
|
+
};
|
|
3266
|
+
const resolvePatternsSelector = (rules, patterns, matchMode) => {
|
|
3267
|
+
const errors = [];
|
|
3268
|
+
const warnings = [];
|
|
3269
|
+
const indices = /* @__PURE__ */ new Set();
|
|
3270
|
+
const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
|
|
3271
|
+
const targets = patterns.map(normalizePattern);
|
|
3272
|
+
for (let pi = 0; pi < patterns.length; pi++) {
|
|
3273
|
+
const rawPattern = patterns[pi];
|
|
3274
|
+
const pat = targets[pi];
|
|
3275
|
+
const matched = [];
|
|
3276
|
+
for (let i = 0; i < rules.length; i++) {
|
|
3277
|
+
const r = rules[i];
|
|
3278
|
+
if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
|
|
3279
|
+
if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
|
|
3280
|
+
}
|
|
3281
|
+
if (matched.length === 0) {
|
|
3282
|
+
errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
|
|
3283
|
+
continue;
|
|
3284
|
+
}
|
|
3285
|
+
if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
|
|
3286
|
+
matched.forEach((i) => {
|
|
3287
|
+
indices.add(i);
|
|
3288
|
+
});
|
|
3289
|
+
}
|
|
3290
|
+
return {
|
|
3291
|
+
errors,
|
|
3292
|
+
indices,
|
|
3293
|
+
warnings
|
|
3294
|
+
};
|
|
3295
|
+
};
|
|
3296
|
+
const resolveSelectorToRuleIndices = (options, selector) => {
|
|
3297
|
+
const rules = options.rules ?? [];
|
|
3298
|
+
if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
|
|
3299
|
+
if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
|
|
3300
|
+
return resolvePatternsSelector(rules, selector.patterns, selector.match);
|
|
3301
|
+
};
|
|
3302
|
+
const longestCommonSuffixLength = (a, b) => {
|
|
3303
|
+
const max = Math.min(a.length, b.length);
|
|
3304
|
+
let i = 0;
|
|
3305
|
+
while (i < max) {
|
|
3306
|
+
if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
|
|
3307
|
+
i++;
|
|
3308
|
+
}
|
|
3309
|
+
return i;
|
|
3310
|
+
};
|
|
3311
|
+
const AMBIGUITY_SCORE_GAP = 5;
|
|
3312
|
+
const scoreCandidate = (orig, fixed, normalizeMode) => {
|
|
3313
|
+
if (fixed.content === orig.content) return {
|
|
3314
|
+
fixedIndex: -1,
|
|
3315
|
+
kind: "exact",
|
|
3316
|
+
score: 100
|
|
3317
|
+
};
|
|
3318
|
+
if (fixed.content.endsWith(orig.content)) {
|
|
3319
|
+
const markerLen = fixed.content.length - orig.content.length;
|
|
3320
|
+
return {
|
|
3321
|
+
fixedIndex: -1,
|
|
3322
|
+
kind: "exact_suffix",
|
|
3323
|
+
score: 90 + Math.min(30, markerLen)
|
|
3324
|
+
};
|
|
3325
|
+
}
|
|
3326
|
+
if (normalizeMode !== "none") {
|
|
3327
|
+
const normFixed = normalizeForCompare(fixed.content, normalizeMode);
|
|
3328
|
+
const normOrig = normalizeForCompare(orig.content, normalizeMode);
|
|
3329
|
+
if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
|
|
3330
|
+
const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
|
|
3331
|
+
return {
|
|
3332
|
+
fixedIndex: -1,
|
|
3333
|
+
kind: "normalized_suffix",
|
|
3334
|
+
score: 70 + Math.floor(overlap * 20)
|
|
3335
|
+
};
|
|
3336
|
+
}
|
|
3337
|
+
}
|
|
3338
|
+
return null;
|
|
3339
|
+
};
|
|
3340
|
+
const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
|
|
3341
|
+
const warnings = [...reportBase.warnings];
|
|
3342
|
+
warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
|
|
3343
|
+
const details = segments.map((s, i) => {
|
|
3344
|
+
const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
|
|
3345
|
+
return {
|
|
3346
|
+
from: s.from,
|
|
3347
|
+
notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
|
|
3348
|
+
originalStartPreview: preview(s.content),
|
|
3349
|
+
segmentIndex: i,
|
|
3350
|
+
status,
|
|
3351
|
+
strategy: "none",
|
|
3352
|
+
to: s.to
|
|
3353
|
+
};
|
|
3354
|
+
});
|
|
3355
|
+
return {
|
|
3356
|
+
report: {
|
|
3357
|
+
...reportBase,
|
|
3358
|
+
details,
|
|
3359
|
+
summary: {
|
|
3360
|
+
mode,
|
|
3361
|
+
recovered: 0,
|
|
3362
|
+
totalSegments: segments.length,
|
|
3363
|
+
unchanged: segments.length,
|
|
3364
|
+
unresolved: selectorErrors.length ? segments.length : 0
|
|
3365
|
+
},
|
|
3366
|
+
warnings
|
|
3367
|
+
},
|
|
3368
|
+
segments
|
|
3369
|
+
};
|
|
3370
|
+
};
|
|
3371
|
+
/**
 * Runs the stage-1 best-effort recovery pass when mode is
 * "best_effort_then_rerun"; otherwise returns empty maps.
 *
 * @param {Array} pages - Input pages (optionally pre-processed via `options.replace`).
 * @param {Array} segments - Segments to attempt recovery on.
 * @param {Object} options - Segmentation options (`replace`, `pageJoiner`, rules).
 * @param {Set<number>} selectedRuleIndices - Indices of rules treated as mistaken.
 * @param {string} mode - Recovery mode; anything other than
 *   "best_effort_then_rerun" disables this stage.
 * @returns {{recoveredAtIndex: Map, recoveredDetailAtIndex: Map}} Maps keyed by
 *   segment index holding recovered segments and their report details.
 */
const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
  const recoveredAtIndex = new Map();
  const recoveredDetailAtIndex = new Map();
  if (mode !== "best_effort_then_rerun") {
    return { recoveredAtIndex, recoveredDetailAtIndex };
  }
  const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
  const pageIdToIndex = buildPageIdToIndex(processedPages);
  const pageJoiner = options.pageJoiner ?? "space";
  const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
  segments.forEach((original, index) => {
    const attempt = tryBestEffortRecoverOneSegment(original, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
    if (attempt.kind !== "recovered") {
      return; // leave this segment for the rerun stage
    }
    const recoveredSegment = { ...original, content: attempt.recoveredContent };
    recoveredAtIndex.set(index, recoveredSegment);
    recoveredDetailAtIndex.set(index, {
      from: original.from,
      originalStartPreview: preview(original.content),
      recoveredPrefixPreview: preview(attempt.recoveredPrefix),
      recoveredStartPreview: preview(recoveredSegment.content),
      segmentIndex: index,
      status: "recovered",
      strategy: "stage1",
      to: original.to
    });
  });
  return { recoveredAtIndex, recoveredDetailAtIndex };
};
|
|
3407
|
+
/**
 * Groups fixed-segment indices by their (from, to) range key so alignment
 * candidates for an original segment can be looked up in O(1).
 *
 * @param {Array} fixedSegments - Segments produced by the rerun segmentation.
 * @returns {Map<string, number[]>} Range key -> indices into `fixedSegments`,
 *   in encounter order.
 */
const buildFixedBuckets = (fixedSegments) => {
  const buckets = new Map();
  fixedSegments.forEach((segment, index) => {
    const key = segmentRangeKey(segment);
    const existing = buckets.get(key);
    if (existing) {
      existing.push(index);
    } else {
      buckets.set(key, [index]);
    }
  });
  return buckets;
};
|
|
3417
|
+
/**
 * Picks the best-scoring unused fixed segment among the candidates for one
 * original segment.
 *
 * @param {Object} orig - Original segment being aligned.
 * @param {number[]} candidates - Indices into `fixedSegments` sharing the same range key.
 * @param {Array} fixedSegments - Rerun segmentation output.
 * @param {Set<number>} usedFixed - Indices already consumed by earlier matches.
 * @param {string} normalizeCompare - Normalization mode forwarded to `scoreCandidate`.
 * @returns {{kind: "none"}|{kind: "ambiguous"}|{kind: "match", fixedIdx: number}}
 *   "ambiguous" when the top two scores are within AMBIGUITY_SCORE_GAP and
 *   more than one candidate existed.
 */
const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
  let best = null;
  let runnerUpScore = -Infinity;
  for (const fixedIdx of candidates) {
    if (usedFixed.has(fixedIdx)) {
      continue; // each fixed segment may be claimed at most once
    }
    const scored = scoreCandidate(orig, fixedSegments[fixedIdx], normalizeCompare);
    if (!scored) {
      continue;
    }
    if (best === null || scored.score > best.score) {
      // The dethroned best becomes the runner-up.
      runnerUpScore = best === null ? -Infinity : best.score;
      best = { fixedIdx, score: scored.score };
    } else if (scored.score > runnerUpScore) {
      runnerUpScore = scored.score;
    }
  }
  if (best === null) {
    return { kind: "none" };
  }
  const tooClose = best.score - runnerUpScore < AMBIGUITY_SCORE_GAP;
  if (tooClose && candidates.length > 1) {
    return { kind: "ambiguous" };
  }
  return { fixedIdx: best.fixedIdx, kind: "match" };
};
|
|
3441
|
+
/**
 * Builds a report detail for a segment the rerun stage could not align.
 *
 * @param {Object} orig - Original segment (kept unchanged by the caller).
 * @param {number} segmentIndex - Index of the segment in the input list.
 * @param {string[]|undefined} notes - Human-readable reasons for the failure.
 * @returns {Object} Detail entry with status "unresolved_alignment".
 */
const detailUnresolved = (orig, segmentIndex, notes) => {
  return {
    from: orig.from,
    notes,
    originalStartPreview: preview(orig.content),
    segmentIndex,
    status: "unresolved_alignment",
    strategy: "rerun",
    to: orig.to
  };
};
|
|
3450
|
+
/**
 * Builds a report detail for a segment whose content already matched the
 * rerun output, so no change was needed.
 *
 * @param {Object} orig - Original segment (kept unchanged by the caller).
 * @param {number} segmentIndex - Index of the segment in the input list.
 * @param {string[]|undefined} notes - Human-readable explanation.
 * @returns {Object} Detail entry with status "skipped_idempotent".
 */
const detailSkippedIdempotent = (orig, segmentIndex, notes) => {
  return {
    from: orig.from,
    notes,
    originalStartPreview: preview(orig.content),
    segmentIndex,
    status: "skipped_idempotent",
    strategy: "rerun",
    to: orig.to
  };
};
|
|
3459
|
+
/**
 * Builds a report detail for a segment recovered by the rerun stage.
 * When the fixed content ends with the original content, the recovered
 * prefix (the text prepended by the fix) is previewed; otherwise the
 * prefix preview is left undefined.
 *
 * @param {Object} orig - Original segment.
 * @param {Object} fixed - Matching segment from the rerun output.
 * @param {number} segmentIndex - Index of the segment in the input list.
 * @returns {Object} Detail entry with status "recovered".
 */
const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
  const suffixMatches = fixed.content.endsWith(orig.content);
  const recoveredPrefixPreview = suffixMatches
    ? preview(fixed.content.slice(0, fixed.content.length - orig.content.length))
    : void 0;
  return {
    from: orig.from,
    originalStartPreview: preview(orig.content),
    recoveredPrefixPreview,
    recoveredStartPreview: preview(fixed.content),
    segmentIndex,
    status: "recovered",
    strategy: "rerun",
    to: orig.to
  };
};
|
|
3473
|
+
/**
 * Merges stage-1 recoveries with the rerun segmentation output, deciding per
 * segment whether it was recovered, left unchanged, or could not be aligned.
 *
 * @param {Object} params
 * @param {Map<string, number[]>} params.fixedBuckets - Range key -> fixed segment indices.
 * @param {Array} params.fixedSegments - Rerun segmentation output.
 * @param {string} params.normalizeCompare - Normalization mode for candidate scoring.
 * @param {Array} params.originalSegments - Segments from the original run.
 * @param {Map<number, Object>} params.stage1RecoveredAtIndex - Stage-1 recovered segments by index.
 * @param {Map<number, Object>} params.recoveredDetailAtIndex - Stage-1 report details by index.
 * @returns {{details: Array, segments: Array, summary: {recovered: number, unchanged: number, unresolved: number}}}
 */
const mergeWithRerun = (params) => {
  const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
  const usedFixed = new Set();
  const mergedSegments = [];
  const details = [];
  let recovered = 0;
  let unresolved = 0;
  let unchanged = 0;
  for (let index = 0; index < originalSegments.length; index++) {
    const original = originalSegments[index];
    const stage1Segment = stage1RecoveredAtIndex.get(index);
    if (stage1Segment) {
      // Stage 1 already recovered this segment; keep its result as-is.
      mergedSegments.push(stage1Segment);
      recovered++;
      details.push(recoveredDetailAtIndex.get(index) ?? {
        from: stage1Segment.from,
        originalStartPreview: preview(original.content),
        recoveredStartPreview: preview(stage1Segment.content),
        segmentIndex: index,
        status: "recovered",
        strategy: "stage1",
        to: stage1Segment.to
      });
      continue;
    }
    const candidates = fixedBuckets.get(segmentRangeKey(original)) ?? [];
    const match = findBestFixedMatch(original, candidates, fixedSegments, usedFixed, normalizeCompare);
    if (match.kind === "none") {
      mergedSegments.push(original);
      unresolved++;
      details.push(detailUnresolved(original, index, ["no alignment candidate in rerun output for same (from,to)"]));
      continue;
    }
    if (match.kind === "ambiguous") {
      mergedSegments.push(original);
      unresolved++;
      details.push(detailUnresolved(original, index, ["ambiguous alignment (score gap too small)"]));
      continue;
    }
    usedFixed.add(match.fixedIdx);
    const fixed = fixedSegments[match.fixedIdx];
    if (fixed.content === original.content) {
      // Rerun produced identical content; nothing to change.
      mergedSegments.push(original);
      unchanged++;
      details.push(detailSkippedIdempotent(original, index, ["content already matches rerun output"]));
      continue;
    }
    mergedSegments.push({ ...original, content: fixed.content });
    recovered++;
    details.push(detailRecoveredRerun(original, fixed, index));
  }
  return {
    details,
    segments: mergedSegments,
    summary: { recovered, unchanged, unresolved }
  };
};
|
|
3536
|
+
/**
 * Recovers segment content lost to mistakenly-applied lineStartsAfter rules.
 * Resolves the selector to rule indices, optionally runs a stage-1
 * best-effort pass (mode "best_effort_then_rerun"), re-segments the pages
 * with the selected rules treated as fixed, and merges both results.
 *
 * @param {Array} pages - Original input pages.
 * @param {Array} segments - Segments from the original (mistaken) run.
 * @param {Object} options - Original segmentation options.
 * @param {*} selector - Selector identifying which rules were mistaken.
 * @param {{mode?: string, normalizeCompare?: string}} [opts] - mode defaults to
 *   "rerun_only"; normalizeCompare defaults to "whitespace".
 * @returns {{report: Object, segments: Array}} Recovery report and (possibly
 *   updated) segments.
 */
function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
  const mode = opts?.mode ?? "rerun_only";
  const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
  const resolved = resolveSelectorToRuleIndices(options, selector);
  const reportBase = {
    byRun: void 0,
    errors: resolved.errors,
    warnings: resolved.warnings
  };
  if (resolved.indices.size === 0) {
    // Nothing selected: pass segments through with an explanatory report.
    return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
  }
  const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
  const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
  const merged = mergeWithRerun({
    fixedBuckets: buildFixedBuckets(fixedSegments),
    fixedSegments,
    normalizeCompare,
    originalSegments: segments,
    recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
    stage1RecoveredAtIndex: stage1.recoveredAtIndex
  });
  const summary = {
    mode,
    recovered: merged.summary.recovered,
    totalSegments: segments.length,
    unchanged: merged.summary.unchanged,
    unresolved: merged.summary.unresolved
  };
  return {
    report: { ...reportBase, details: merged.details, summary },
    segments: merged.segments
  };
}
|
|
3571
|
+
/**
 * Applies recoverMistakenLineStartsAfterMarkers to several runs and
 * aggregates segments, details (with segment indices offset into the
 * combined list), warnings, errors, and per-run/overall summaries.
 *
 * @param {Array<{pages: Array, segments: Array, options: Object, selector: *}>} runs
 * @param {{mode?: string, normalizeCompare?: string}} [opts] - Forwarded to each run.
 * @returns {{report: Object, segments: Array}} Combined report and segments.
 */
function recoverMistakenMarkersForRuns(runs, opts) {
  const allSegments = [];
  const byRun = [];
  const details = [];
  const warnings = [];
  const errors = [];
  let totalRecovered = 0;
  let totalUnchanged = 0;
  let totalUnresolved = 0;
  let offset = 0;
  runs.forEach((run, runIndex) => {
    const result = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
    allSegments.push(...result.segments);
    // Re-index details into the concatenated segment list.
    for (const detail of result.report.details) {
      details.push({ ...detail, segmentIndex: detail.segmentIndex + offset });
    }
    offset += run.segments.length;
    const { recovered, unchanged, unresolved } = result.report.summary;
    totalRecovered += recovered;
    totalUnchanged += unchanged;
    totalUnresolved += unresolved;
    warnings.push(...result.report.warnings);
    errors.push(...result.report.errors);
    byRun.push({
      recovered,
      runIndex,
      totalSegments: run.segments.length,
      unresolved
    });
  });
  return {
    report: {
      byRun,
      details,
      errors,
      summary: {
        mode: opts?.mode ?? "rerun_only",
        recovered: totalRecovered,
        totalSegments: offset,
        unchanged: totalUnchanged,
        unresolved: totalUnresolved
      },
      warnings
    },
    segments: allSegments
  };
}
|
|
3619
|
+
|
|
3620
|
+
//#endregion
|
|
3621
|
+
export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
|
|
2835
3622
|
//# sourceMappingURL=index.mjs.map
|