flappa-doormal 2.7.0 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +40 -0
- package/README.md +47 -0
- package/dist/index.d.mts +96 -57
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +968 -267
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
@@ -1351,7 +1351,7 @@ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx
 /**
  * Advances cursor position past any leading whitespace.
  */
-const skipWhitespace = (content, startPos) => {
+const skipWhitespace$1 = (content, startPos) => {
   let pos = startPos;
   while (pos < content.length && /\s/.test(content[pos])) pos++;
   return pos;
@@ -1406,7 +1406,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
     const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
     if (pieceSeg) result.push(pieceSeg);
   }
-  cursorPos = skipWhitespace(fullContent, breakPos);
+  cursorPos = skipWhitespace$1(fullContent, breakPos);
   currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
   isFirstPiece = false;
 }
@@ -1959,6 +1959,117 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
   return splitPointsByRule;
 };
 
+//#endregion
+//#region src/segmentation/split-point-helpers.ts
+/**
+ * Helper module for collectSplitPointsFromRules to reduce complexity.
+ * Handles combined regex matching and split point creation.
+ */
+const MAX_REGEX_ITERATIONS = 1e5;
+const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
+  const result = {};
+  if (!groups) return result;
+  for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
+  return result;
+};
+const buildContentOffsets = (match, ruleInfo) => {
+  if (!ruleInfo.usesLineStartsAfter) return {};
+  const captured = match.groups?.[`${ruleInfo.prefix}__content`];
+  if (captured === void 0) return {};
+  return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
+};
+const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
+const createSplitPointFromMatch = (match, rule, ruleInfo) => {
+  const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
+  const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
+  return {
+    capturedContent: void 0,
+    contentStartOffset,
+    index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
+    meta: rule.meta,
+    namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+  };
+};
+const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
+  const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+  const combinedRegex = new RegExp(combinedSource, "gm");
+  logger?.debug?.("[segmenter] combined regex built", {
+    combinableRuleCount: combinableRules.length,
+    combinedSourceLength: combinedSource.length
+  });
+  let m = combinedRegex.exec(matchContent);
+  let iterations = 0;
+  while (m !== null) {
+    iterations++;
+    if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
+    if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
+      iterations,
+      position: m.index
+    });
+    const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+    if (matchedIndex !== -1) {
+      const { rule, index: originalIndex } = combinableRules[matchedIndex];
+      const ruleInfo = ruleRegexes[matchedIndex];
+      if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
+        const sp = createSplitPointFromMatch(m, rule, ruleInfo);
+        if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+        splitPointsByRule.get(originalIndex).push(sp);
+      }
+    }
+    if (m[0].length === 0) combinedRegex.lastIndex++;
+    m = combinedRegex.exec(matchContent);
+  }
+};
+const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
+  const built = buildRuleRegex(rule, prefix);
+  return {
+    ...built,
+    prefix,
+    source: `(?<${prefix}>${built.regex.source})`
+  };
+});
+const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
+  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
+  const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
+    const isLSA = usesLineStartsAfter && m.captured !== void 0;
+    const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
+    return {
+      capturedContent: isLSA ? void 0 : m.captured,
+      contentStartOffset: isLSA ? markerLen : void 0,
+      index: (rule.split ?? "at") === "at" ? m.start : m.end,
+      meta: rule.meta,
+      namedCaptures: m.namedCaptures
+    };
+  });
+  if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+  splitPointsByRule.get(ruleIndex).push(...points);
+};
+const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
+  const matches = [];
+  let m = regex.exec(content);
+  while (m !== null) {
+    matches.push({
+      captured: usesCapture ? getLastPositionalCapture(m) : void 0,
+      end: m.index + m[0].length,
+      namedCaptures: extractNamedCaptures(m.groups, captureNames),
+      start: m.index
+    });
+    if (m[0].length === 0) regex.lastIndex++;
+    m = regex.exec(content);
+  }
+  return matches;
+};
+const applyOccurrenceFilter = (rules, splitPointsByRule) => {
+  const result = [];
+  rules.forEach((rule, index) => {
+    const points = splitPointsByRule.get(index);
+    if (!points?.length) return;
+    const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
+    result.push(...filtered);
+  });
+  return result;
+};
+
 //#endregion
 //#region src/segmentation/textUtils.ts
 /**
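The core trick in the new `split-point-helpers.ts` region: `buildRuleRegexes` wraps each rule's pattern in a uniquely named capture group, `processCombinedMatches` ORs them into one regex, and each match is attributed back to its rule by checking which named group is defined. A minimal standalone sketch of that dispatch (the two patterns and the `r0`/`r1` group names below are illustrative, not the library's rule templates):

```js
// Combine per-rule patterns into one regex and dispatch on which named
// group matched. Group names (r0, r1) are hypothetical placeholders.
const rules = [/^### /, /^\d+\./];
const combined = new RegExp(rules.map((r, i) => `(?<r${i}>${r.source})`).join("|"), "gm");
const text = "### intro\n1. first point\n";
let m;
while ((m = combined.exec(text)) !== null) {
  // Exactly one alternative matched, so exactly one named group is defined.
  const idx = rules.findIndex((_, i) => m.groups[`r${i}`] !== undefined);
  console.log(idx, JSON.stringify(m[0]));
  // Guard against zero-length matches, as the dist code also does.
  if (m[0].length === 0) combined.lastIndex++;
}
```

Scanning once with the combined regex keeps a single pass over the content regardless of how many combinable rules are configured.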
@@ -1985,7 +2096,6 @@ const normalizeLineEndings = (content) => {
  *
  * @module segmenter
  */
-const MAX_REGEX_ITERATIONS = 1e5;
 /**
  * Builds a concatenated content string and page mapping from input pages.
  *
@@ -2082,7 +2192,7 @@ const dedupeSplitPoints = (splitPoints) => {
 const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
   if (segments.length > 0 || pages.length === 0) return segments;
   const firstPage = pages[0];
-  const lastPage = pages
+  const lastPage = pages.at(-1);
   const joinChar = pageJoiner === "newline" ? "\n" : " ";
   const allContent = normalizedContent.join(joinChar).trim();
   if (!allContent) return segments;
@@ -2106,124 +2216,9 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
     standaloneCount: standaloneRules.length
   });
   const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
-  if (combinableRules.length > 0)
-
-
-      return {
-        prefix,
-        source: `(?<${prefix}>${built.regex.source})`,
-        ...built
-      };
-    });
-    const combinedSource = ruleRegexes.map((r) => r.source).join("|");
-    const combinedRegex = new RegExp(combinedSource, "gm");
-    logger?.debug?.("[segmenter] combined regex built", {
-      combinableRuleCount: combinableRules.length,
-      combinedSourceLength: combinedSource.length
-    });
-    combinedRegex.lastIndex = 0;
-    let m = combinedRegex.exec(matchContent);
-    let iterationCount = 0;
-    while (m !== null) {
-      iterationCount++;
-      if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
-      if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
-        iterationCount,
-        lastIndex: combinedRegex.lastIndex,
-        matchLength: m[0].length,
-        matchPosition: m.index
-      });
-      const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
-      if (matchedRuleIndex !== -1) {
-        const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
-        const ruleInfo = ruleRegexes[matchedRuleIndex];
-        const namedCaptures = {};
-        if (m.groups) {
-          for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
-            const cleanName = prefixedName.slice(prefix.length);
-            namedCaptures[cleanName] = m.groups[prefixedName];
-          }
-        }
-        let capturedContent;
-        let contentStartOffset;
-        if (ruleInfo.usesLineStartsAfter) {
-          capturedContent = m.groups?.[`${prefix}__content`];
-          if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
-        }
-        const start = m.index;
-        const end = m.index + m[0].length;
-        const pageId = pageMap.getId(start);
-        if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
-          const sp = {
-            capturedContent: void 0,
-            contentStartOffset,
-            index: (rule.split ?? "at") === "at" ? start : end,
-            meta: rule.meta,
-            namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
-          };
-          if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
-          splitPointsByRule.get(originalIndex).push(sp);
-        }
-      }
-      if (m[0].length === 0) combinedRegex.lastIndex++;
-      m = combinedRegex.exec(matchContent);
-    }
-  }
-  const collectSplitPointsFromRule = (rule, ruleIndex) => {
-    const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-    const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
-      const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
-      const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
-      return {
-        capturedContent: isLineStartsAfter ? void 0 : m.captured,
-        contentStartOffset: isLineStartsAfter ? markerLength : void 0,
-        index: (rule.split ?? "at") === "at" ? m.start : m.end,
-        meta: rule.meta,
-        namedCaptures: m.namedCaptures
-      };
-    });
-    if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
-    splitPointsByRule.get(ruleIndex).push(...points);
-  };
-  standaloneRules.forEach((rule) => {
-    collectSplitPointsFromRule(rule, rules.indexOf(rule));
-  });
-  const finalSplitPoints = [];
-  rules.forEach((rule, index) => {
-    const points = splitPointsByRule.get(index);
-    if (!points || points.length === 0) return;
-    let filtered = points;
-    if (rule.occurrence === "first") filtered = [points[0]];
-    else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
-    finalSplitPoints.push(...filtered);
-  });
-  return finalSplitPoints;
-};
-/**
- * Executes a regex against content and extracts match results with capture information.
- *
- * @param content - Full content string to search
- * @param regex - Compiled regex with 'g' flag
- * @param usesCapture - Whether to extract captured content
- * @param captureNames - Names of expected named capture groups
- * @returns Array of match results with positions and captures
- */
-const findMatches = (content, regex, usesCapture, captureNames) => {
-  const matches = [];
-  regex.lastIndex = 0;
-  let m = regex.exec(content);
-  while (m !== null) {
-    const result = {
-      end: m.index + m[0].length,
-      start: m.index
-    };
-    result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
-    if (usesCapture) result.captured = getLastPositionalCapture(m);
-    matches.push(result);
-    if (m[0].length === 0) regex.lastIndex++;
-    m = regex.exec(content);
-  }
-  return matches;
+  if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
+  for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+  return applyOccurrenceFilter(rules, splitPointsByRule);
 };
 /**
  * Finds page breaks within a given offset range using binary search.
@@ -2410,7 +2405,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
   const result = [];
   for (let i = 0; i < splitPoints.length; i++) {
     const sp = splitPoints[i];
-    const end =
+    const end = splitPoints[i + 1]?.index ?? content.length;
     const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
     if (s) result.push(s);
   }
@@ -2434,29 +2429,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 
 //#endregion
-//#region src/analysis.ts
-const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
-const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
-const computeSpecificity = (pattern) => {
-  const tokenCount = countTokenMarkers(pattern);
-  return {
-    literalLen: stripWhitespacePlaceholders(pattern).length,
-    tokenCount
-  };
-};
-const DEFAULT_OPTIONS = {
-  includeFirstWordFallback: true,
-  lineFilter: void 0,
-  maxExamples: 1,
-  minCount: 3,
-  minLineLength: 6,
-  normalizeArabicDiacritics: true,
-  prefixChars: 60,
-  prefixMatchers: [/^#+/u],
-  sortBy: "specificity",
-  topK: 40,
-  whitespace: "regex"
-};
+//#region src/analysis/shared.ts
 const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
 const TOKEN_PRIORITY_ORDER$1 = [
   "basmalah",
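`escapeSignatureLiteral` in the shared region above relies on `$&` in the replacement string, which stands for the whole matched substring, so each regex metacharacter is rewritten as backslash-plus-itself. A quick illustration of that replacement idiom (standalone, copied from the line above):

```js
// "$&" expands to the matched character, producing "\" + metacharacter.
const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
console.log(escapeSignatureLiteral("a.b*c"));              // a\.b\*c
console.log(new RegExp(escapeSignatureLiteral("a.b*c")).test("a.b*c")); // true
```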
@@ -2497,30 +2470,7 @@ const appendWs = (out, mode) => {
   if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
   return out.endsWith("\\s*") ? out : `${out}\\s*`;
 };
-const
-  let matchedAny = false;
-  let currentPos = pos;
-  let currentOut = out;
-  for (const re of prefixMatchers) {
-    if (currentPos >= s.length) break;
-    const m = re.exec(s.slice(currentPos));
-    if (!m || m.index !== 0 || !m[0]) continue;
-    currentOut += escapeSignatureLiteral(m[0]);
-    currentPos += m[0].length;
-    matchedAny = true;
-    const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
-    if (wsAfter) {
-      currentPos += wsAfter[0].length;
-      currentOut = appendWs(currentOut, whitespace);
-    }
-  }
-  return {
-    matchedAny,
-    out: currentOut,
-    pos: currentPos
-  };
-};
-const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
   let best = null;
   for (const { token, re } of compiled) {
     re.lastIndex = pos;
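The two shared predicates added in the next hunk use Unicode property escapes, which require the `u` flag. A character counts as an Arabic letter only if it is both in the Arabic script and a letter, which rules out Arabic-Indic digits and standalone diacritics. A quick check of that behavior (copied from the definition below):

```js
const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
console.log(isArabicLetter("ب")); // true  (Arabic letter beh)
console.log(isArabicLetter("٣")); // false (Arabic-Indic digit: in the script, not a letter)
console.log(isArabicLetter("x")); // false (a letter, but not in the Arabic script)
```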
@@ -2534,132 +2484,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
   if (best?.token === "rumuz") {
     const end = pos + best.text.length;
     const next = end < s.length ? s[end] : "";
-    if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+    if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
   }
   return best;
 };
-const
+const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
+const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+
+//#endregion
+//#region src/analysis/line-starts.ts
+const resolveOptions$1 = (options = {}) => ({
+  includeFirstWordFallback: options.includeFirstWordFallback ?? true,
+  lineFilter: options.lineFilter,
+  maxExamples: options.maxExamples ?? 1,
+  minCount: options.minCount ?? 3,
+  minLineLength: options.minLineLength ?? 6,
+  normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
+  prefixChars: options.prefixChars ?? 60,
+  prefixMatchers: options.prefixMatchers ?? [/^#+/u],
+  sortBy: options.sortBy ?? "specificity",
+  topK: options.topK ?? 40,
+  whitespace: options.whitespace ?? "regex"
+});
+const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+const computeSpecificity = (pattern) => ({
+  literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
+  tokenCount: countTokenMarkers(pattern)
+});
+const compareBySpecificity = (a, b) => {
+  const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
+  return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
+};
+const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
+/** Remove trailing whitespace placeholders */
+const trimTrailingWs = (out, mode) => {
+  const suffix = mode === "regex" ? "\\s*" : " ";
+  while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+  return out;
+};
+/** Try to extract first word for fallback */
+const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
+/** Consume prefix matchers at current position */
+const consumePrefixes = (s, pos, out, matchers, ws) => {
+  let matched = false;
+  for (const re of matchers) {
+    if (pos >= s.length) break;
+    const m = re.exec(s.slice(pos));
+    if (!m?.index && m?.[0]) {
+      out += escapeSignatureLiteral(m[0]);
+      pos += m[0].length;
+      matched = true;
+      const wsm = /^[ \t]+/u.exec(s.slice(pos));
+      if (wsm) {
+        pos += wsm[0].length;
+        out = appendWs(out, ws);
+      }
+    }
+  }
+  return {
+    matched,
+    out,
+    pos
+  };
+};
+/** Try to match a token at current position and append to signature */
+const tryMatchToken = (s, pos, out, compiled) => {
+  const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+  if (!best) return {
+    matched: false,
+    out,
+    pos
+  };
+  return {
+    matched: true,
+    out: `${out}{{${best.token}}}`,
+    pos: pos + best.text.length
+  };
+};
+/** Try to match a delimiter at current position */
+const tryMatchDelimiter = (s, pos, out) => {
+  const ch = s[pos];
+  if (!ch || !isCommonDelimiter(ch)) return {
+    matched: false,
+    out,
+    pos
+  };
+  return {
+    matched: true,
+    out: out + escapeSignatureLiteral(ch),
+    pos: pos + 1
+  };
+};
+/** Skip whitespace at position */
+const skipWhitespace = (s, pos, out, ws) => {
+  const m = /^[ \t]+/u.exec(s.slice(pos));
+  if (!m) return {
+    out,
+    pos,
+    skipped: false
+  };
+  return {
+    out: appendWs(out, ws),
+    pos: pos + m[0].length,
+    skipped: true
+  };
+};
 const tokenizeLineStart = (line, tokenNames, opts) => {
   const trimmed = collapseWhitespace(line);
   if (!trimmed) return null;
-  const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
-  let pos = 0;
-  let out = "";
-  let matchedAny = false;
-  let matchedToken = false;
+  const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
   const compiled = compileTokenRegexes(tokenNames);
-
-  const
-
-
-
-
-
-
-
-    const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
-    if (wsMatch) {
-      pos += wsMatch[0].length;
-      out = appendWs(out, whitespace);
+  let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
+  const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+  pos = prefix.pos;
+  out = prefix.out;
+  matchedAny = prefix.matched;
+  while (steps < 6 && pos < s.length) {
+    const ws = skipWhitespace(s, pos, out, opts.whitespace);
+    if (ws.skipped) {
+      pos = ws.pos;
+      out = ws.out;
       continue;
     }
-    const
-    if (
-
-      out
-      matchedAny = true;
-
-      pos += best.text.length;
-      tokenSteps++;
+    const tok = tryMatchToken(s, pos, out, compiled);
+    if (tok.matched) {
+      pos = tok.pos;
+      out = tok.out;
+      matchedAny = matchedToken = true;
+      steps++;
       continue;
     }
     if (matchedAny) {
-      const
-      if (
-
-
+      const delim = tryMatchDelimiter(s, pos, out);
+      if (delim.matched) {
+        pos = delim.pos;
+        out = delim.out;
        continue;
      }
    }
    if (matchedAny) {
-      if (includeFirstWordFallback && !matchedToken) {
-        const
-        if (
-
-
+      if (opts.includeFirstWordFallback && !matchedToken) {
+        const word$1 = extractFirstWord(s.slice(pos));
+        if (word$1) {
+          out += escapeSignatureLiteral(word$1);
+          steps++;
+        }
      }
      break;
    }
-    if (!includeFirstWordFallback) return null;
-    const
-    if (!
-
-
-
-
-
-
-    return
+    if (!opts.includeFirstWordFallback) return null;
+    const word = extractFirstWord(s.slice(pos));
+    if (!word) return null;
+    return escapeSignatureLiteral(word);
+  }
+  return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
+};
+const processLine = (line, pageId, tokenPriority, opts, acc) => {
+  const trimmed = collapseWhitespace(line);
+  if (trimmed.length < opts.minLineLength) return;
+  if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
+  const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
+  if (!sig) return;
+  const entry = acc.get(sig);
+  if (!entry) acc.set(sig, {
+    count: 1,
+    examples: [{
+      line: trimmed,
+      pageId
+    }]
+  });
+  else {
+    entry.count++;
+    if (entry.examples.length < opts.maxExamples) entry.examples.push({
+      line: trimmed,
+      pageId
+    });
+  }
+};
+const processPage = (page, tokenPriority, opts, acc) => {
+  for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
 };
 /**
  * Analyze pages and return the most common line-start patterns (top K).
- *
- * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
- * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
  */
 const analyzeCommonLineStarts = (pages, options = {}) => {
-  const
-    ...DEFAULT_OPTIONS,
-    ...options,
-    lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
-    prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
-    whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
-  };
+  const opts = resolveOptions$1(options);
   const tokenPriority = buildTokenPriority();
-  const
-  for (const page of pages)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  const acc = /* @__PURE__ */ new Map();
+  for (const page of pages) processPage(page, tokenPriority, opts, acc);
+  const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
+  return [...acc.entries()].map(([pattern, v]) => ({
+    count: v.count,
+    examples: v.examples,
+    pattern
+  })).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
+};
+
+//#endregion
+//#region src/analysis/repeating-sequences.ts
+const resolveOptions = (options) => {
+  const minElements = Math.max(1, options?.minElements ?? 1);
+  return {
+    contextChars: options?.contextChars ?? 50,
+    maxElements: Math.max(minElements, options?.maxElements ?? 3),
+    maxExamples: options?.maxExamples ?? 3,
+    maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
+    minCount: Math.max(1, options?.minCount ?? 3),
+    minElements,
+    normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
+    requireToken: options?.requireToken ?? true,
+    topK: Math.max(1, options?.topK ?? 20),
+    whitespace: options?.whitespace ?? "regex"
+  };
+};
+/** Creates a cursor that tracks position in both normalized and raw text */
+const createRawCursor = (text, normalize) => {
+  let rawPos = 0;
+  return {
+    advance(normalizedLen) {
+      if (!normalize) {
+        const chunk = text.slice(rawPos, rawPos + normalizedLen);
+        rawPos += normalizedLen;
+        return chunk;
+      }
+      const start = rawPos;
+      let matchedLen = 0;
+      while (matchedLen < normalizedLen && rawPos < text.length) {
+        if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
+        rawPos++;
      }
+      while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
+      return text.slice(start, rawPos);
+    },
+    get pos() {
+      return rawPos;
+    }
+  };
+};
+/** Scans text and produces a stream of tokens and literals. */
+const tokenizeContent = (text, normalize) => {
+  const normalized = normalize ? stripArabicDiacritics(text) : text;
+  const compiled = compileTokenRegexes(buildTokenPriority());
+  const cursor = createRawCursor(text, normalize);
+  const items = [];
+  let pos = 0;
+  while (pos < normalized.length) {
+    const ws = /^\s+/u.exec(normalized.slice(pos));
+    if (ws) {
+      pos += ws[0].length;
+      cursor.advance(ws[0].length);
+      continue;
+    }
+    const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
+    if (token) {
+      const raw = cursor.advance(token.text.length);
+      items.push({
+        end: cursor.pos,
+        raw,
+        start: cursor.pos - raw.length,
+        text: `{{${token.token}}}`,
+        type: "token"
+      });
+      pos += token.text.length;
+      continue;
+    }
+    if (isCommonDelimiter(normalized[pos])) {
+      const raw = cursor.advance(1);
+      items.push({
+        end: cursor.pos,
+        raw,
+        start: cursor.pos - 1,
+        text: escapeSignatureLiteral(normalized[pos]),
+        type: "literal"
+      });
+      pos++;
+      continue;
+    }
+    const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
+    if (word) {
+      const raw = cursor.advance(word[0].length);
+      items.push({
+        end: cursor.pos,
+        raw,
+        start: cursor.pos - raw.length,
+        text: escapeSignatureLiteral(word[0]),
+        type: "literal"
+      });
+      pos += word[0].length;
+      continue;
    }
+    cursor.advance(1);
+    pos++;
  }
-
-
-
-
-
-
+  return items;
+};
+/** Build pattern string from window items */
+const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
+/** Check if window contains at least one token */
+const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
+/** Compute token count and literal length for a window */
+const computeWindowStats = (window) => {
+  let tokenCount = 0, literalLen = 0;
+  for (const item of window) if (item.type === "token") tokenCount++;
+  else literalLen += item.text.length;
+  return {
+    literalLen,
+    tokenCount
  };
-
-
-
+};
+/** Build example from page content and window */
+const buildExample = (page, window, contextChars) => {
+  const start = window[0].start;
+  const end = window.at(-1).end;
+  const ctxStart = Math.max(0, start - contextChars);
+  const ctxEnd = Math.min(page.content.length, end + contextChars);
+  return {
+    context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
+    pageId: page.id,
+    startIndices: window.map((w) => w.start),
+    text: page.content.slice(start, end)
  };
-
-
-
+};
+/** Extract N-grams from a single page */
+const extractPageNgrams = (page, items, opts, stats) => {
+  for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
+    const window = items.slice(i, i + n);
+    if (opts.requireToken && !hasTokenInWindow(window)) continue;
+    const pattern = buildPattern(window, opts.whitespace);
+    if (!stats.has(pattern)) {
+      if (stats.size >= opts.maxUniquePatterns) continue;
+      stats.set(pattern, {
+        count: 0,
+        examples: [],
+        ...computeWindowStats(window)
+      });
+    }
+    const entry = stats.get(pattern);
+    entry.count++;
+    if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+  }
+};
+/**
+ * Analyze pages for commonly repeating word sequences.
+ *
+ * Use for continuous text without line breaks. For line-based analysis,
+ * use `analyzeCommonLineStarts()` instead.
+ */
+const analyzeRepeatingSequences = (pages, options) => {
+  const opts = resolveOptions(options);
+  const stats = /* @__PURE__ */ new Map();
+  for (const page of pages) {
+    if (!page.content) continue;
+    extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
+  }
+  return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
+    count: s.count,
+    examples: s.examples,
     pattern
-  }))
+  }));
 };
 
 //#endregion
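A minimal sketch of how the two analysis entry points might be called, assuming pages are `{ id, content }` records as the code above reads them (the sample content and option values are hypothetical):

```js
import { analyzeCommonLineStarts, analyzeRepeatingSequences } from "flappa-doormal";

// Hypothetical input; both helpers read `id` and `content` per page.
const pages = [
  { id: 1, content: "### باب الأول\nsome text\n### باب الثاني" },
  { id: 2, content: "### باب الثالث\nmore text" },
];

// Line-oriented: most common line-start signatures, top K.
const starts = analyzeCommonLineStarts(pages, { minCount: 2, topK: 10 });

// Continuous text: repeating token/literal n-grams.
const seqs = analyzeRepeatingSequences(pages, { minCount: 2, maxElements: 3 });

// Both return entries shaped like { pattern, count, examples }.
console.log(starts[0]?.pattern, seqs[0]?.count);
```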
@@ -2831,5 +3013,524 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-
+//#region src/recovery.ts
+const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
+const normalizeForCompare = (s, mode) => {
+  if (mode === "none") return s;
+  let out = s;
+  if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
+  out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
+  return out;
+};
+const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
+const buildFixedOptions = (options, selectedRuleIndices) => {
+  const fixedRules = (options.rules ?? []).map((r, idx) => {
+    if (!selectedRuleIndices.has(idx)) return r;
+    if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
+    const { lineStartsAfter, ...rest } = r;
+    return {
+      ...rest,
+      lineStartsWith: lineStartsAfter
+    };
+  });
+  return {
+    ...options,
+    rules: fixedRules
+  };
+};
+const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
+const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
+  const parts = [];
+  for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
+  const matchContent = parts.join("\n");
+  if (pageJoiner === "newline") return {
+    matchContent,
+    outputContent: matchContent
+  };
+  return {
+    matchContent,
+    outputContent: parts.join(" ")
+  };
+};
+const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
+  const rules = options.rules ?? [];
+  const compiled = [];
+  for (const idx of selectedRuleIndices) {
+    const r = rules[idx];
+    if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+    const { lineStartsAfter, ...rest } = r;
+    const built = buildRuleRegex({
+      ...rest,
+      lineStartsWith: lineStartsAfter
+    });
+    compiled.push({
+      ruleIndex: idx,
+      startsWithRegex: new RegExp(built.regex.source, "mu")
+    });
+  }
+  return compiled;
+};
+const findUniqueAnchorPos = (outputContent, segmentContent) => {
+  for (const len of [
+    80,
+    60,
+    40,
+    30,
+    20,
+    15
+  ]) {
+    const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
+    if (!needle.trim()) continue;
+    const first = outputContent.indexOf(needle);
+    if (first === -1) continue;
+    if (outputContent.indexOf(needle, first + 1) === -1) return first;
+  }
+  return null;
+};
+const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
+  const line = matchContent.slice(lineStart);
+  for (const mr of compiledMistaken) {
+    mr.startsWithRegex.lastIndex = 0;
+    const m = mr.startsWithRegex.exec(line);
+    if (!m || m.index !== 0) continue;
+    const markerMatch = m[0];
+    const markerEnd = lineStart + markerMatch.length;
+    if (anchorPos < markerEnd) continue;
+    const gap = matchContent.slice(markerEnd, anchorPos);
+    const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
+    if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
+    return { prefix: recoveredPrefix };
+  }
+  return { reason: "no selected marker pattern matched at anchored line start" };
+};
+const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
+  const fromIdx = pageIdToIndex.get(segment.from);
+  const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
+  if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
+    kind: "unresolved",
+    reason: "segment page range not found in pages"
+  };
+  const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
+  if (!segment.content) return {
+    kind: "unresolved",
+    reason: "empty segment content"
+  };
+  const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
+  if (anchorPos === null) return {
+    kind: "unresolved",
+    reason: "could not uniquely anchor segment content in page range"
+  };
+  const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
+  const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
+  if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
+    kind: "unresolved",
+    reason: found.reason
+  };
+  return {
+    kind: "recovered",
+    recoveredContent: `${found.prefix}${segment.content}`,
+    recoveredPrefix: found.prefix
+  };
+};
+const resolveRuleIndicesSelector = (rules, indicesIn) => {
+  const errors = [];
+  const indices = /* @__PURE__ */ new Set();
+  for (const idx of indicesIn) {
+    if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
+      errors.push(`Selector index out of range: ${idx}`);
+      continue;
+    }
+    const rule = rules[idx];
+    if (!rule || !("lineStartsAfter" in rule)) {
+      errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
+      continue;
+    }
+    indices.add(idx);
+  }
+  return {
+    errors,
+    indices,
+    warnings: []
+  };
+};
+const resolvePredicateSelector = (rules, predicate) => {
+  const errors = [];
+  const warnings = [];
+  const indices = /* @__PURE__ */ new Set();
+  rules.forEach((r, i) => {
+    try {
+      if (!predicate(r, i)) return;
+      if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
+        indices.add(i);
+        return;
+      }
+      warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      errors.push(`Predicate threw at rule ${i}: ${msg}`);
+    }
+  });
+  if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
+  return {
+    errors,
+    indices,
+    warnings
+  };
+};
+const resolvePatternsSelector = (rules, patterns, matchMode) => {
+  const errors = [];
+  const warnings = [];
+  const indices = /* @__PURE__ */ new Set();
+  const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
+  const targets = patterns.map(normalizePattern);
+  for (let pi = 0; pi < patterns.length; pi++) {
+    const rawPattern = patterns[pi];
+    const pat = targets[pi];
+    const matched = [];
+    for (let i = 0; i < rules.length; i++) {
+      const r = rules[i];
+      if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+      if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
+    }
+    if (matched.length === 0) {
+      errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
+      continue;
+    }
+    if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
+    matched.forEach((i) => {
+      indices.add(i);
+    });
+  }
+  return {
+    errors,
+    indices,
+    warnings
+  };
+};
+const resolveSelectorToRuleIndices = (options, selector) => {
+  const rules = options.rules ?? [];
+  if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
+  if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
+  return resolvePatternsSelector(rules, selector.patterns, selector.match);
+};
+const longestCommonSuffixLength = (a, b) => {
+  const max = Math.min(a.length, b.length);
+  let i = 0;
+  while (i < max) {
+    if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
+    i++;
+  }
+  return i;
+};
+const AMBIGUITY_SCORE_GAP = 5;
+const scoreCandidate = (orig, fixed, normalizeMode) => {
+  if (fixed.content === orig.content) return {
+    fixedIndex: -1,
+    kind: "exact",
+    score: 100
+  };
+  if (fixed.content.endsWith(orig.content)) {
+    const markerLen = fixed.content.length - orig.content.length;
+    return {
+      fixedIndex: -1,
+      kind: "exact_suffix",
+      score: 90 + Math.min(30, markerLen)
+    };
+  }
+  if (normalizeMode !== "none") {
+    const normFixed = normalizeForCompare(fixed.content, normalizeMode);
+    const normOrig = normalizeForCompare(orig.content, normalizeMode);
+    if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
+      const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
+      return {
+        fixedIndex: -1,
+        kind: "normalized_suffix",
+        score: 70 + Math.floor(overlap * 20)
+      };
+    }
+  }
+  return null;
+};
+const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
+  const warnings = [...reportBase.warnings];
+  warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
+  const details = segments.map((s, i) => {
+    const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
+    return {
+      from: s.from,
+      notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
+      originalStartPreview: preview(s.content),
+      segmentIndex: i,
+      status,
+      strategy: "none",
+      to: s.to
+    };
+  });
+  return {
+    report: {
+      ...reportBase,
+      details,
+      summary: {
+        mode,
+        recovered: 0,
+        totalSegments: segments.length,
+        unchanged: segments.length,
+        unresolved: selectorErrors.length ? segments.length : 0
+      },
+      warnings
+    },
+    segments
+  };
+};
+const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
+  const recoveredAtIndex = /* @__PURE__ */ new Map();
+  const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
+  if (mode !== "best_effort_then_rerun") return {
+    recoveredAtIndex,
+    recoveredDetailAtIndex
+  };
+  const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
+  const pageIdToIndex = buildPageIdToIndex(processedPages);
+  const pageJoiner = options.pageJoiner ?? "space";
+  const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
+  for (let i = 0; i < segments.length; i++) {
+    const orig = segments[i];
+    const r = tryBestEffortRecoverOneSegment(orig, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
+    if (r.kind !== "recovered") continue;
+    const seg = {
+      ...orig,
+      content: r.recoveredContent
+    };
+    recoveredAtIndex.set(i, seg);
+    recoveredDetailAtIndex.set(i, {
+      from: orig.from,
+      originalStartPreview: preview(orig.content),
+      recoveredPrefixPreview: preview(r.recoveredPrefix),
+      recoveredStartPreview: preview(seg.content),
+      segmentIndex: i,
+      status: "recovered",
+      strategy: "stage1",
+      to: orig.to
+    });
+  }
+  return {
+    recoveredAtIndex,
+    recoveredDetailAtIndex
+  };
+};
+const buildFixedBuckets = (fixedSegments) => {
+  const buckets = /* @__PURE__ */ new Map();
+  for (let i = 0; i < fixedSegments.length; i++) {
+    const k = segmentRangeKey(fixedSegments[i]);
+    const arr = buckets.get(k);
+    if (!arr) buckets.set(k, [i]);
+    else arr.push(i);
+  }
+  return buckets;
+};
+const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
+  let best = null;
+  let secondBestScore = -Infinity;
+  for (const fixedIdx of candidates) {
+    if (usedFixed.has(fixedIdx)) continue;
+    const fixed = fixedSegments[fixedIdx];
+    const scored = scoreCandidate(orig, fixed, normalizeCompare);
+    if (!scored) continue;
+    const candidateScore = scored.score;
+    if (!best || candidateScore > best.score) {
+      secondBestScore = best?.score ?? -Infinity;
+      best = {
+        fixedIdx,
+        score: candidateScore
+      };
+    } else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
+  }
+  if (!best) return { kind: "none" };
+  if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
+  return {
+    fixedIdx: best.fixedIdx,
+    kind: "match"
+  };
+};
+const detailUnresolved = (orig, segmentIndex, notes) => ({
+  from: orig.from,
+  notes,
+  originalStartPreview: preview(orig.content),
+  segmentIndex,
+  status: "unresolved_alignment",
+  strategy: "rerun",
+  to: orig.to
+});
+const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
+  from: orig.from,
+  notes,
+  originalStartPreview: preview(orig.content),
+  segmentIndex,
+  status: "skipped_idempotent",
+  strategy: "rerun",
+  to: orig.to
+});
+const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
+  let recoveredPrefixPreview;
+  if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
+  return {
+    from: orig.from,
+    originalStartPreview: preview(orig.content),
+    recoveredPrefixPreview,
+    recoveredStartPreview: preview(fixed.content),
+    segmentIndex,
+    status: "recovered",
+    strategy: "rerun",
+    to: orig.to
+  };
+};
+const mergeWithRerun = (params) => {
+  const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
+  const usedFixed = /* @__PURE__ */ new Set();
+  const out = [];
+  const details = [];
+  let recovered = 0;
+  let unresolved = 0;
+  let unchanged = 0;
+  for (let i = 0; i < originalSegments.length; i++) {
+    const stage1Recovered = stage1RecoveredAtIndex.get(i);
+    if (stage1Recovered) {
+      out.push(stage1Recovered);
+      recovered++;
+      details.push(recoveredDetailAtIndex.get(i) ?? {
+        from: stage1Recovered.from,
+        originalStartPreview: preview(originalSegments[i].content),
+        recoveredStartPreview: preview(stage1Recovered.content),
+        segmentIndex: i,
+        status: "recovered",
+        strategy: "stage1",
+        to: stage1Recovered.to
+      });
+      continue;
+    }
+    const orig = originalSegments[i];
+    const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
+    if (best.kind === "none") {
+      out.push(orig);
+      unresolved++;
+      details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
+      continue;
+    }
+    if (best.kind === "ambiguous") {
+      out.push(orig);
+      unresolved++;
+      details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
+      continue;
+    }
+    usedFixed.add(best.fixedIdx);
+    const fixed = fixedSegments[best.fixedIdx];
+    if (fixed.content === orig.content) {
+      out.push(orig);
+      unchanged++;
+      details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
+      continue;
+    }
+    out.push({
+      ...orig,
+      content: fixed.content
+    });
+    recovered++;
+    details.push(detailRecoveredRerun(orig, fixed, i));
+  }
+  return {
+    details,
+    segments: out,
+    summary: {
+      recovered,
+      unchanged,
+      unresolved
+    }
+  };
+};
+function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
+  const mode = opts?.mode ?? "rerun_only";
+  const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
+  const resolved = resolveSelectorToRuleIndices(options, selector);
+  const reportBase = {
+    byRun: void 0,
+    errors: resolved.errors,
+    warnings: resolved.warnings
+  };
+  if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
+  const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
+  const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
+  const merged = mergeWithRerun({
+    fixedBuckets: buildFixedBuckets(fixedSegments),
+    fixedSegments,
+    normalizeCompare,
+    originalSegments: segments,
+    recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
+    stage1RecoveredAtIndex: stage1.recoveredAtIndex
+  });
+  return {
+    report: {
+      ...reportBase,
+      details: merged.details,
+      summary: {
+        mode,
+        recovered: merged.summary.recovered,
+        totalSegments: segments.length,
+        unchanged: merged.summary.unchanged,
+        unresolved: merged.summary.unresolved
+      }
+    },
+    segments: merged.segments
+  };
+}
+function recoverMistakenMarkersForRuns(runs, opts) {
+  const allSegments = [];
+  const byRun = [];
+  const details = [];
+  const warnings = [];
+  const errors = [];
+  let recovered = 0;
+  let unchanged = 0;
+  let unresolved = 0;
+  let offset = 0;
+  for (let i = 0; i < runs.length; i++) {
+    const run = runs[i];
+    const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
+    allSegments.push(...res.segments);
+    for (const d of res.report.details) details.push({
+      ...d,
+      segmentIndex: d.segmentIndex + offset
+    });
+    offset += run.segments.length;
+    recovered += res.report.summary.recovered;
+    unchanged += res.report.summary.unchanged;
+    unresolved += res.report.summary.unresolved;
+    warnings.push(...res.report.warnings);
+    errors.push(...res.report.errors);
+    byRun.push({
+      recovered: res.report.summary.recovered,
+      runIndex: i,
+      totalSegments: run.segments.length,
+      unresolved: res.report.summary.unresolved
+    });
+  }
+  return {
+    report: {
+      byRun,
+      details,
+      errors,
+      summary: {
+        mode: opts?.mode ?? "rerun_only",
+        recovered,
+        totalSegments: offset,
+        unchanged,
+        unresolved
+      },
+      warnings
+    },
+    segments: allSegments
+  };
+}
+
+//#endregion
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map