flappa-doormal 2.20.0 → 2.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +23 -0
- package/LICENSE.md +1 -1
- package/README.md +94 -3
- package/dist/index.d.mts +150 -67
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +2374 -1760
- package/dist/index.mjs.map +1 -1
- package/package.json +13 -8
package/dist/index.mjs
CHANGED
|
@@ -710,7 +710,7 @@ const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch
|
|
|
710
710
|
/** Reports whether `ch` contains at least one character from the delimiter/punctuation class. */
const isCommonDelimiter = (ch) => {
    const delimiterClass = /[::\-–—ـ،؛.?!؟()[\]{}]/u;
    return delimiterClass.test(ch);
};
|
|
711
711
|
//#endregion
|
|
712
712
|
//#region src/analysis/line-starts.ts
|
|
713
|
-
const resolveOptions$
|
|
713
|
+
const resolveOptions$2 = (options = {}) => ({
|
|
714
714
|
includeFirstWordFallback: options.includeFirstWordFallback ?? true,
|
|
715
715
|
lineFilter: options.lineFilter,
|
|
716
716
|
maxExamples: options.maxExamples ?? 1,
|
|
@@ -939,7 +939,7 @@ const processPage = (page, tokenPriority, opts, acc) => {
|
|
|
939
939
|
* Analyze pages and return the most common line-start patterns (top K).
|
|
940
940
|
*/
|
|
941
941
|
const analyzeCommonLineStarts = (pages, options = {}) => {
|
|
942
|
-
const opts = resolveOptions$
|
|
942
|
+
const opts = resolveOptions$2(options);
|
|
943
943
|
const tokenPriority = buildTokenPriority();
|
|
944
944
|
const acc = /* @__PURE__ */ new Map();
|
|
945
945
|
for (const page of pages) processPage(page, tokenPriority, opts, acc);
|
|
@@ -952,7 +952,7 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
|
|
|
952
952
|
};
|
|
953
953
|
//#endregion
|
|
954
954
|
//#region src/analysis/repeating-sequences.ts
|
|
955
|
-
const resolveOptions = (options) => {
|
|
955
|
+
const resolveOptions$1 = (options) => {
|
|
956
956
|
const minElements = Math.max(1, options?.minElements ?? 1);
|
|
957
957
|
return {
|
|
958
958
|
contextChars: options?.contextChars ?? 50,
|
|
@@ -1106,7 +1106,7 @@ const extractPageNgrams = (page, items, opts, stats) => {
|
|
|
1106
1106
|
* use `analyzeCommonLineStarts()` instead.
|
|
1107
1107
|
*/
|
|
1108
1108
|
const analyzeRepeatingSequences = (pages, options) => {
|
|
1109
|
-
const opts = resolveOptions(options);
|
|
1109
|
+
const opts = resolveOptions$1(options);
|
|
1110
1110
|
const stats = /* @__PURE__ */ new Map();
|
|
1111
1111
|
for (const page of pages) {
|
|
1112
1112
|
if (!page.content) continue;
|
|
@@ -1119,636 +1119,821 @@ const analyzeRepeatingSequences = (pages, options) => {
|
|
|
1119
1119
|
}));
|
|
1120
1120
|
};
|
|
1121
1121
|
//#endregion
|
|
1122
|
-
//#region src/
|
|
1122
|
+
//#region src/types/rules.ts
|
|
1123
1123
|
/**
|
|
1124
|
-
*
|
|
1125
|
-
* Example: 'raqms' before 'raqm' so "٣٤" matches 'raqms' not just the first digit.
|
|
1124
|
+
* Pattern type key names for split rules.
|
|
1126
1125
|
*
|
|
1127
|
-
*
|
|
1126
|
+
* Use this array to dynamically iterate over pattern types in UIs,
|
|
1127
|
+
* or use the `PatternTypeKey` type for type-safe string unions.
|
|
1128
|
+
*
|
|
1129
|
+
* @example
|
|
1130
|
+
* // Build a dropdown/select in UI
|
|
1131
|
+
* PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
|
|
1132
|
+
*
|
|
1133
|
+
* @example
|
|
1134
|
+
* // Type-safe pattern key validation
|
|
1135
|
+
* const validateKey = (k: string): k is PatternTypeKey =>
|
|
1136
|
+
* (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
|
|
1128
1137
|
*/
|
|
1129
|
-
const
|
|
1130
|
-
"
|
|
1131
|
-
"
|
|
1132
|
-
"
|
|
1133
|
-
"
|
|
1134
|
-
"
|
|
1135
|
-
"
|
|
1136
|
-
"numbered",
|
|
1137
|
-
"raqms",
|
|
1138
|
-
"raqm",
|
|
1139
|
-
"tarqim",
|
|
1140
|
-
"bullet",
|
|
1141
|
-
"dash",
|
|
1142
|
-
"harf"
|
|
1138
|
+
// Order matters: lookups that scan this list return the first key present on a rule.
const PATTERN_TYPE_KEYS = ["lineStartsWith", "lineStartsAfter", "lineEndsWith", "template", "regex", "dictionaryEntry"];
|
|
1146
|
+
//#endregion
|
|
1147
|
+
//#region src/optimization/optimize-rules.ts
|
|
1148
|
+
// Pattern keys treated as mergeable by the rule optimizer (their values are read as arrays).
const MERGEABLE_KEYS = new Set(["lineStartsWith", "lineStartsAfter", "lineEndsWith"]);
|
|
1144
1153
|
/**
|
|
1145
|
-
*
|
|
1146
|
-
* Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
|
|
1154
|
+
* Get the pattern type key for a rule.
|
|
1147
1155
|
*/
|
|
1148
|
-
const
|
|
1149
|
-
|
|
1150
|
-
const
|
|
1151
|
-
|
|
1152
|
-
return [...prioritized, ...remaining];
|
|
1156
|
+
const getPatternKey = (rule) => {
    // First key (in PATTERN_TYPE_KEYS order) that exists on the rule wins.
    for (const candidate of PATTERN_TYPE_KEYS) {
        if (candidate in rule) return candidate;
    }
    // No recognised pattern key: fall back to "regex".
    return "regex";
};
|
|
1157
|
+
/** Returns `rule[key]` when it is an array; any other value yields a fresh empty array. */
const getPatternArray = (rule, key) => {
    const candidate = rule[key];
    if (!Array.isArray(candidate)) return [];
    return candidate;
};
|
|
1154
|
-
const
|
|
1155
|
-
const
|
|
1156
|
-
|
|
1157
|
-
const isWhitespace = (ch) => !!ch && /\s/u.test(ch);
|
|
1158
|
-
const isOpenBracket = (ch) => !!ch && /[([{]/u.test(ch);
|
|
1159
|
-
const isRightDelimiter = (ch) => !!ch && /[::\-–—ـ،؛.?!؟)\]}]/u.test(ch);
|
|
1160
|
-
const isArabicWordy = (ch) => !!ch && /[\u0600-\u06FF]/u.test(ch);
|
|
1161
|
-
const leftOk = !before || isWhitespace(before) || isOpenBracket(before) || !isArabicWordy(before);
|
|
1162
|
-
const rightOk = !after || isWhitespace(after) || isRightDelimiter(after) || !isArabicWordy(after);
|
|
1163
|
-
return leftOk && rightOk;
|
|
1161
|
+
/**
 * Renders `rule[key]` as a single string: strings pass through, arrays are joined
 * with newlines, other truthy values are JSON-serialized, and falsy values become "".
 */
const getPatternString = (rule, key) => {
    const raw = rule[key];
    if (typeof raw === "string") return raw;
    if (Array.isArray(raw)) return raw.join("\n");
    if (!raw) return "";
    return JSON.stringify(raw);
};
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
const startIndex = match.index;
|
|
1195
|
-
const endIndex = startIndex + match[0].length;
|
|
1196
|
-
if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
|
|
1197
|
-
if (isPositionCovered(startIndex, endIndex)) continue;
|
|
1198
|
-
results.push({
|
|
1199
|
-
endIndex,
|
|
1200
|
-
index: startIndex,
|
|
1201
|
-
match: match[0],
|
|
1202
|
-
token: tokenName
|
|
1203
|
-
});
|
|
1204
|
-
coveredRanges.push([startIndex, endIndex]);
|
|
1205
|
-
}
|
|
1206
|
-
} catch {}
|
|
1207
|
-
}
|
|
1208
|
-
return results.sort((a, b) => a.index - b.index);
|
|
1165
|
+
/** De-duplicates patterns and orders them longest first, breaking ties with `localeCompare`. */
const normalizePatterns = (patterns) => {
    const unique = Array.from(new Set(patterns));
    unique.sort((left, right) => right.length - left.length || left.localeCompare(right));
    return unique;
};
|
|
1166
|
+
/**
 * Serializes a non-object value to a string. `undefined`, non-finite numbers,
 * bigints and symbols get explicit textual forms; everything else goes through
 * `JSON.stringify` unchanged.
 */
const serializePrimitive = (value) => {
    switch (typeof value) {
        case "undefined":
            return "undefined";
        case "number":
            // NaN/Infinity are emitted as quoted strings rather than passed to JSON.stringify directly.
            return JSON.stringify(Number.isFinite(value) ? value : String(value));
        case "bigint":
            return JSON.stringify(`${value}n`);
        case "symbol":
            return JSON.stringify(value.toString());
        default:
            return JSON.stringify(value);
    }
};
|
|
1173
|
+
/** Serializes each element with `stableSerializeValue` and wraps the comma-joined result in brackets. */
const stableSerializeArray = (values, seen) => {
    const parts = values.map((item) => stableSerializeValue(item, seen));
    return "[" + parts.join(",") + "]";
};
|
|
1174
|
+
/**
 * Serializes a plain object with its keys sorted, skipping `undefined` values.
 *
 * @param value - Object to serialize
 * @param seen - Set of objects currently being serialized (cycle guard)
 * @throws TypeError when `value` is already present in `seen`
 */
const stableSerializeObject = (value, seen) => {
    if (seen.has(value)) throw new TypeError("Cannot optimize rules with circular option values");
    seen.add(value);
    const definedEntries = Object.entries(value).filter((entry) => entry[1] !== void 0);
    definedEntries.sort((a, b) => a[0].localeCompare(b[0]));
    const fields = definedEntries.map(([entryKey, entryValue]) => `${JSON.stringify(entryKey)}:${stableSerializeValue(entryValue, seen)}`);
    // Remove the object again so it only counts as "seen" while it is being serialized.
    seen.delete(value);
    return `{${fields.join(",")}}`;
};
|
|
1181
|
+
/**
 * Dispatches serialization by value kind: functions, primitives (including null),
 * arrays, Date, RegExp, and finally generic objects.
 */
const stableSerializeValue = (value, seen) => {
    if (typeof value === "function") return JSON.stringify(`[Function:${value.name || "anonymous"}]`);
    const isObjectLike = Boolean(value) && typeof value === "object";
    if (!isObjectLike) return serializePrimitive(value);
    if (Array.isArray(value)) return stableSerializeArray(value, seen);
    if (value instanceof Date) return JSON.stringify(value.toISOString());
    if (value instanceof RegExp) return JSON.stringify(value.toString());
    return stableSerializeObject(value, seen);
};
|
|
1189
|
+
/** Entry point for stable serialization; starts with a fresh cycle-tracking set. */
const stableSerialize = (value) => {
    const visiting = /* @__PURE__ */ new WeakSet();
    return stableSerializeValue(value, visiting);
};
|
|
1190
|
+
/**
 * Computes a specificity score for a `dictionaryEntry` rule.
 *
 * Stricter options score higher: each permissive flag that is left off adds a
 * fixed bonus, `minLetters`/`maxLetters` contribute numerically, and the number
 * of stop words adds up to 25 points.
 *
 * @param rule - Rule to score
 * @returns 0 when the rule has no (truthy) `dictionaryEntry`, otherwise the score
 */
const getDictionaryEntrySpecificityScore = (rule) => {
    if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
    const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
    // Scoring can run on rules that were never validated, so a missing or
    // non-array `stopWords` must count as zero instead of throwing on `.length`.
    const stopWordCount = Array.isArray(stopWords) ? stopWords.length : 0;
    return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWordCount, 25);
};
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
* @param text - Original text
|
|
1215
|
-
* @param detected - Array of detected patterns from detectTokenPatterns
|
|
1216
|
-
* @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
|
|
1217
|
-
*
|
|
1218
|
-
* @example
|
|
1219
|
-
* const detected = detectTokenPatterns("٣٤ - ");
|
|
1220
|
-
* generateTemplateFromText("٣٤ - ", detected);
|
|
1221
|
-
* // Returns: "{{raqms}} {{dash}} "
|
|
1222
|
-
*/
|
|
1223
|
-
const generateTemplateFromText = (text, detected) => {
|
|
1224
|
-
if (!text || detected.length === 0) return text;
|
|
1225
|
-
let template = text;
|
|
1226
|
-
const sortedByIndexDesc = [...detected].sort((a, b) => b.index - a.index);
|
|
1227
|
-
for (const d of sortedByIndexDesc) template = `${template.slice(0, d.index)}{{${d.token}}}${template.slice(d.endIndex)}`;
|
|
1228
|
-
return template;
|
|
1195
|
+
/**
 * Scores a rule for ordering: dictionary-entry rules use their dedicated score,
 * mergeable rules use the length of their longest pattern, and all other rules
 * use the length of their pattern rendered as a string.
 */
const getSpecificityScore = (rule) => {
    const patternKey = getPatternKey(rule);
    if (patternKey === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
    if (!MERGEABLE_KEYS.has(patternKey)) return getPatternString(rule, patternKey).length;
    let longest = 0;
    for (const pattern of getPatternArray(rule, patternKey)) longest = Math.max(longest, pattern.length);
    return longest;
};
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
const
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
fuzzy: false,
|
|
1259
|
-
metaType: "hadith",
|
|
1260
|
-
patternType: "lineStartsAfter"
|
|
1261
|
-
};
|
|
1200
|
+
/**
 * Builds the grouping key for a rule: its pattern key plus a stable serialization
 * of every other field, so rules merge only when all remaining options match.
 */
const createMergeKey = (rule) => {
    const patternKey = getPatternKey(rule);
    const otherFields = Object.entries(rule).filter((entry) => entry[0] !== patternKey);
    const serializedOptions = stableSerialize(Object.fromEntries(otherFields));
    return `${patternKey}|${serializedOptions}`;
};
|
|
1204
|
+
/**
 * Merges rules that share a mergeable pattern key and identical remaining options,
 * then orders the result by descending specificity score.
 *
 * @param rules - Rules to optimize (input rule objects are not modified; merged
 *   rules are shallow copies, non-mergeable rules are passed through by reference)
 * @returns `{ mergedCount, rules }` where `mergedCount` is the number of rules folded into an earlier one
 */
const optimizeRules = (rules) => {
    const collected = [];
    const slotByMergeKey = /* @__PURE__ */ new Map();
    let mergedCount = 0;
    for (const rule of rules) {
        const patternKey = getPatternKey(rule);
        if (MERGEABLE_KEYS.has(patternKey)) {
            const mergeKey = createMergeKey(rule);
            if (slotByMergeKey.has(mergeKey)) {
                // Fold this rule's patterns into the copy stored for the first matching rule.
                const target = collected[slotByMergeKey.get(mergeKey)];
                target[patternKey] = normalizePatterns(getPatternArray(target, patternKey).concat(getPatternArray(rule, patternKey)));
                mergedCount += 1;
            } else {
                slotByMergeKey.set(mergeKey, collected.length);
                collected.push({
                    ...rule,
                    [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
                });
            }
        } else {
            collected.push(rule);
        }
    }
    collected.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
    return {
        mergedCount,
        rules: collected
    };
};
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1233
|
+
//#endregion
|
|
1234
|
+
//#region src/segmentation/pattern-validator.ts
|
|
1235
|
+
// Set of token names built once from whatever getAvailableTokens() returns at module load.
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
1236
|
+
// Matches `{{name}}` or `{{name:suffix}}`; capture group 1 is `name`. Global flag: the instance carries `lastIndex` state.
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
1237
|
+
// Matches a known token name (optionally with a `:suffix`) that is not immediately
// preceded by `{{` and not immediately followed by `}}`. Longer names are listed
// first so the alternation prefers them over shorter names that are their prefixes.
const BARE_TOKEN_REGEX = (() => {
    const alternation = Array.from(KNOWN_TOKENS).sort((a, b) => b.length - a.length).join("|");
    return new RegExp("(?<!\\{\\{)(" + alternation + ")(?::\\w+)?(?!\\}\\})", "g");
})();
|
|
1241
|
+
/**
 * Builds a `missing_braces` issue for a token that lacks braces on one side.
 *
 * @param tokenLiteral - Token text as found (may include a `:suffix`, may be empty)
 * @param side - Which braces are missing; interpolated verbatim into the message
 * @returns Issue object; `suggestion` and `token` are `undefined` when `tokenLiteral` is empty
 */
const createMalformedTokenIssue = (tokenLiteral, side) => {
    const [baseName] = tokenLiteral.split(":", 1);
    const displayName = tokenLiteral || "unknown";
    return {
        message: `Token "${displayName}" appears to be missing ${side} braces.`,
        suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
        token: baseName || void 0,
        type: "missing_braces"
    };
};
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
const normalized = normalizeArabicForComparison(word);
|
|
1289
|
-
if (!normalized || seen.has(normalized)) continue;
|
|
1290
|
-
seen.add(normalized);
|
|
1291
|
-
result.push(word);
|
|
1250
|
+
/**
 * Finds the first `{{` that is not properly closed.
 *
 * An opener counts as unclosed when no `}}` follows it, or when another `{{`
 * appears before the next `}}` (for example `{{raqms {{dash}}`), since that
 * closer then belongs to the later opener.
 *
 * @param pattern - Pattern text to scan
 * @returns A `missing_braces` issue for the unclosed token, or `undefined` when every `{{` is closed
 */
const detectMalformedLeftToken = (pattern) => {
    let openAt = pattern.indexOf("{{");
    while (openAt !== -1) {
        const closeAt = pattern.indexOf("}}", openAt + 2);
        const nextOpenAt = pattern.indexOf("{{", openAt + 2);
        const closedBeforeNextOpen = closeAt !== -1 && (nextOpenAt === -1 || closeAt < nextOpenAt);
        if (!closedBeforeNextOpen) {
            // Report the word-like text right after the opener as the offending token.
            const literal = pattern.slice(openAt + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "";
            return createMalformedTokenIssue(literal, "closing");
        }
        // Continue scanning after this token's closing braces.
        openAt = pattern.indexOf("{{", closeAt + 2);
    }
    return undefined;
};
|
|
1300
|
-
const
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1258
|
+
/**
 * Finds the first `}}` that has no matching `{{`.
 *
 * Each closer must have an opener located after the previous closer; this also
 * catches a stray `}}` that follows an already-complete token (for example
 * `{{raqms}} dash}}`), not only a `}}` with no `{{` anywhere before it.
 *
 * @param pattern - Pattern text to scan
 * @returns A `missing_braces` issue for the unopened token, or `undefined` when every `}}` is opened
 */
const detectMalformedRightToken = (pattern) => {
    // Index just past the previous `}}`; an opener before this point is already consumed.
    let searchFrom = 0;
    for (;;) {
        const closeAt = pattern.indexOf("}}", searchFrom);
        if (closeAt === -1) return undefined;
        const openAt = pattern.lastIndexOf("{{", closeAt);
        if (openAt < searchFrom) {
            // Covers both "no opener at all" (-1) and "opener belongs to an earlier token".
            const literal = pattern.slice(searchFrom, closeAt).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "";
            return createMalformedTokenIssue(literal, "opening");
        }
        searchFrom = closeAt + 2;
    }
};
|
|
1305
|
-
const
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1264
|
+
/** Returns the unclosed-`{{` issue if there is one, otherwise the result of the unopened-`}}` check. */
const detectMalformedToken = (pattern) => {
    const leftIssue = detectMalformedLeftToken(pattern);
    if (leftIssue !== null && leftIssue !== void 0) return leftIssue;
    return detectMalformedRightToken(pattern);
};
|
|
1265
|
+
/**
|
|
1266
|
+
* Validates a single pattern for common issues.
|
|
1267
|
+
*/
|
|
1268
|
+
const validatePattern = (pattern, seenPatterns) => {
    if (pattern.trim() === "") {
        return {
            message: "Empty pattern is not allowed",
            type: "empty_pattern"
        };
    }
    if (seenPatterns.has(pattern)) {
        return {
            message: `Duplicate pattern: "${pattern}"`,
            pattern,
            type: "duplicate"
        };
    }
    // Record the pattern so a later identical one is reported as a duplicate.
    seenPatterns.add(pattern);
    // Reset the shared global regex's cursor before scanning.
    TOKEN_INSIDE_BRACES.lastIndex = 0;
    const bracedNames = Array.from(pattern.matchAll(TOKEN_INSIDE_BRACES), (found) => found[1]);
    const unknownName = bracedNames.find((candidate) => candidate && !KNOWN_TOKENS.has(candidate));
    if (unknownName) {
        return {
            message: `Unknown token: {{${unknownName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
            suggestion: "Check spelling or use a known token",
            token: unknownName,
            type: "unknown_token"
        };
    }
    const malformed = detectMalformedToken(pattern);
    if (malformed) return malformed;
    for (const bare of pattern.matchAll(BARE_TOKEN_REGEX)) {
        const [full, name] = bare;
        const start = bare.index;
        const end = start + full.length;
        const hasOpening = pattern.slice(Math.max(0, start - 2), start) === "{{";
        const hasClosing = pattern.slice(end, end + 2) === "}}";
        if (hasOpening && hasClosing) continue;
        return {
            message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
            suggestion: `{{${full}}}`,
            token: name,
            type: "missing_braces"
        };
    }
    return undefined;
};
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1302
|
+
/**
|
|
1303
|
+
* Validates an array of patterns, returning parallel array of issues.
|
|
1304
|
+
*/
|
|
1305
|
+
const validatePatternArray = (patterns) => {
    // One shared set across the array so repeats within it are flagged as duplicates.
    const alreadySeen = /* @__PURE__ */ new Set();
    const perPattern = patterns.map((entry) => validatePattern(entry, alreadySeen));
    const hasAnyIssue = perPattern.some((issue) => Boolean(issue));
    if (!hasAnyIssue) return void 0;
    return perPattern;
};
|
|
1316
|
-
const
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
const
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
})
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1310
|
+
/**
 * Validates `patterns` (when provided) and stores any issues under `result[key]`.
 *
 * @returns `true` when issues were recorded, `false` when there were no patterns or no issues
 */
const applyRulePatternValidation = (result, key, patterns) => {
    const issues = patterns ? validatePatternArray(patterns) : void 0;
    if (!issues) return false;
    result[key] = issues;
    return true;
};
|
|
1317
|
+
/**
 * Validates a rule's `template` (if the rule has one) and stores the issue on `result.template`.
 *
 * @returns `true` when an issue was recorded, otherwise `false`
 */
const validateTemplateRule = (rule, result) => {
    // A fresh set per call: a single template cannot duplicate anything.
    const issue = "template" in rule ? validatePattern(rule.template, /* @__PURE__ */ new Set()) : void 0;
    if (!issue) return false;
    result.template = issue;
    return true;
};
|
|
1324
|
+
/**
 * Validates a rule's `regex` (if the rule has one): it must be non-blank and must
 * compile with the `u` flag. Any issue is stored on `result.regex`.
 *
 * @returns `true` when an issue was recorded, otherwise `false`
 */
const validateRegexRule = (rule, result) => {
    if (!("regex" in rule)) return false;
    if (rule.regex.trim() === "") {
        result.regex = {
            message: "Empty pattern is not allowed",
            type: "empty_pattern"
        };
        return true;
    }
    try {
        // Compiled only to surface syntax errors; the instance itself is discarded.
        new RegExp(rule.regex, "u");
    } catch (error) {
        result.regex = {
            message: error instanceof Error ? error.message : String(error),
            pattern: rule.regex,
            type: "invalid_regex"
        };
        return true;
    }
    return false;
};
|
|
1345
|
+
/** Wraps a message in an `invalid_option` issue object. */
const invalidDictionaryEntryIssue = (message) => {
    return { message, type: "invalid_option" };
};
|
|
1349
|
+
/** Records an issue under `issues[key]` when `value` is provided but is not a boolean. */
const addBooleanDictionaryEntryIssue = (issues, key, value) => {
    const acceptable = value === void 0 || typeof value === "boolean";
    if (acceptable) return;
    issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
};
|
|
1352
|
+
/** Records an issue when `captureName` is provided but is not an identifier-like name. */
const addCaptureNameIssue = (issues, captureName) => {
    if (captureName === void 0) return;
    const identifierLike = /^[A-Za-z_]\w*$/.test(captureName);
    if (identifierLike) return;
    issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
};
|
|
1355
|
+
/** Records an issue when `minLetters` is provided but is not an integer of at least 1. */
const addMinLettersIssue = (issues, minLetters) => {
    if (minLetters === void 0) return;
    const valid = Number.isInteger(minLetters) && !(minLetters < 1);
    if (valid) return;
    issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
};
|
|
1358
|
+
/**
 * Records an issue when `maxLetters` is provided but is not an integer that is at
 * least the lower bound (`minLetters`, or 2 when `minLetters` is null/undefined).
 */
const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
    const lowerBound = minLetters ?? 2;
    if (maxLetters === void 0) return;
    const valid = Number.isInteger(maxLetters) && !(maxLetters < lowerBound);
    if (valid) return;
    issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${lowerBound}`);
};
|
|
1362
|
+
/**
 * Validates the options of a `dictionaryEntry` rule and stores per-field issues
 * on `result.dictionaryEntry`.
 *
 * @returns `true` when at least one issue was recorded; `false` when the rule has
 *   no (truthy) `dictionaryEntry` or all checked options are valid
 */
const validateDictionaryEntryRule = (rule, result) => {
    if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
    const options = rule.dictionaryEntry;
    const { captureName, maxLetters, minLetters, stopWords } = options;
    const issues = {};
    const stopWordsValid = Array.isArray(stopWords) && !stopWords.some((word) => typeof word !== "string" || !word.trim());
    if (!stopWordsValid) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
    const booleanFields = ["allowCommaSeparated", "allowParenthesized", "allowWhitespaceBeforeColon", "midLineSubentries"];
    for (const field of booleanFields) addBooleanDictionaryEntryIssue(issues, field, options[field]);
    addCaptureNameIssue(issues, captureName);
    addMinLettersIssue(issues, minLetters);
    addMaxLettersIssue(issues, maxLetters, minLetters);
    if (Object.keys(issues).length === 0) return false;
    result.dictionaryEntry = issues;
    return true;
};
|
|
1378
|
+
/**
 * Formats one validation issue as a single line prefixed with its location.
 *
 * @param _type - Unused
 * @param issue - Issue to format; falsy values yield `null`
 * @param loc - Location prefix placed before the colon
 * @returns The formatted line, or `null` when there is no issue
 */
const formatValidationIssue = (_type, issue, loc) => {
    if (!issue) return null;
    switch (issue.type) {
        case "missing_braces":
            return `${loc}: Missing {{}} around token "${issue.token}"`;
        case "unknown_token":
            return `${loc}: Unknown token "{{${issue.token}}}"`;
        case "duplicate":
            return `${loc}: Duplicate pattern "${issue.pattern}"`;
        case "invalid_regex":
            return `${loc}: Invalid regex (${issue.message})`;
        default:
            // Any other issue type: show its message, or the type name if no message is set.
            return `${loc}: ${issue.message || issue.type}`;
    }
};
|
|
1348
1386
|
/**
|
|
1349
|
-
*
|
|
1387
|
+
* Validates split rules for common pattern issues.
|
|
1350
1388
|
*
|
|
1351
|
-
*
|
|
1352
|
-
* `{
|
|
1353
|
-
*
|
|
1389
|
+
* Checks for:
|
|
1390
|
+
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
|
|
1391
|
+
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
|
|
1392
|
+
* - Duplicate patterns within the same rule
|
|
1354
1393
|
*
|
|
1355
|
-
* @
|
|
1356
|
-
*
|
|
1357
|
-
* stopWords: ['وقيل', 'ويقال', 'قال'],
|
|
1358
|
-
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
1359
|
-
* })
|
|
1394
|
+
* @param rules - Array of split rules to validate
|
|
1395
|
+
* @returns Array parallel to input with validation results (undefined if no issues)
|
|
1360
1396
|
*
|
|
1361
1397
|
* @example
|
|
1362
|
-
*
|
|
1363
|
-
*
|
|
1364
|
-
*
|
|
1365
|
-
*
|
|
1366
|
-
*
|
|
1367
|
-
*
|
|
1398
|
+
* const issues = validateRules([
|
|
1399
|
+
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
|
|
1400
|
+
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
|
|
1401
|
+
* ]);
|
|
1402
|
+
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
|
|
1403
|
+
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
|
|
1368
1404
|
*/
|
|
1405
|
+
const validateRules = (rules) => rules.map((rule) => {
    const result = {};
    const patternsFor = (key) => (key in rule ? rule[key] : void 0);
    // Every validator runs (no short-circuit) so `result` collects issues from all of them.
    const outcomes = [
        applyRulePatternValidation(result, "lineStartsWith", patternsFor("lineStartsWith")),
        applyRulePatternValidation(result, "lineStartsAfter", patternsFor("lineStartsAfter")),
        applyRulePatternValidation(result, "lineEndsWith", patternsFor("lineEndsWith")),
        validateTemplateRule(rule, result),
        validateRegexRule(rule, result),
        validateDictionaryEntryRule(rule, result)
    ];
    return outcomes.some(Boolean) ? result : void 0;
});
|
|
1369
1415
|
/**
|
|
1370
|
-
*
|
|
1371
|
-
*
|
|
1372
|
-
*
|
|
1416
|
+
* Formats a validation result array into a list of human-readable error messages.
|
|
1417
|
+
*
|
|
1418
|
+
* Useful for displaying validation errors in UIs.
|
|
1419
|
+
*
|
|
1420
|
+
* @param results - The result array from `validateRules()`
|
|
1421
|
+
* @returns Array of formatted error strings
|
|
1422
|
+
*
|
|
1423
|
+
* @example
|
|
1424
|
+
* const issues = validateRules(rules);
|
|
1425
|
+
* const errors = formatValidationReport(issues);
|
|
1426
|
+
* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
|
|
1373
1427
|
*/
|
|
1374
|
-
const
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
});
|
|
1380
|
-
return {
|
|
1381
|
-
dictionaryEntry: {
|
|
1382
|
-
allowCommaSeparated,
|
|
1383
|
-
allowParenthesized,
|
|
1384
|
-
allowWhitespaceBeforeColon,
|
|
1385
|
-
captureName,
|
|
1386
|
-
maxLetters,
|
|
1387
|
-
midLineSubentries,
|
|
1388
|
-
minLetters,
|
|
1389
|
-
stopWords: uniqueCanonicalWords(stopWords)
|
|
1390
|
-
},
|
|
1391
|
-
meta,
|
|
1392
|
-
pageStartPrevWordStoplist,
|
|
1393
|
-
samePagePrevWordStoplist
|
|
1394
|
-
};
|
|
1428
|
+
const formatValidationReport = (results) => {
    const lines = [];
    results.forEach((result, position) => {
        if (!result) return;
        // Rule numbers in messages are 1-based.
        for (const [type, issues] of Object.entries(result)) {
            lines.push(...formatValidationIssues(type, issues, position + 1));
        }
    });
    return lines;
};
|
|
1432
|
+
/**
 * Formats all issues recorded for one rule field.
 *
 * For `dictionaryEntry`, `issues` is treated as a field-to-issue map and each
 * location gets a `.field` suffix; otherwise `issues` is a single issue or an
 * array of issues. Entries that format to `null` are dropped.
 */
const formatValidationIssues = (type, issues, ruleNumber) => {
    const isFieldMap = type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues);
    let messages;
    if (isFieldMap) {
        messages = Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`));
    } else {
        const issueList = Array.isArray(issues) ? issues : [issues];
        messages = issueList.map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`));
    }
    return messages.filter((msg) => msg !== null);
};
|
|
1396
1436
|
//#endregion
|
|
1397
|
-
//#region src/
|
|
1398
|
-
|
|
1399
|
-
const
|
|
1400
|
-
|
|
1401
|
-
const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
|
|
1402
|
-
const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
|
|
1403
|
-
const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN$1})(?:[)\\]])?$`, "u");
|
|
1404
|
-
const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
|
|
1405
|
-
const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
|
|
1406
|
-
const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
|
|
1407
|
-
const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
|
|
1408
|
-
const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
|
|
1409
|
-
const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
|
|
1410
|
-
const COLON_NOISE_RE = /^.+:\s*.+$/u;
|
|
1411
|
-
const CHAPTER_TERMS = [
|
|
1412
|
-
"باب",
|
|
1413
|
-
"فصل",
|
|
1414
|
-
"كتاب",
|
|
1415
|
-
"حرف",
|
|
1416
|
-
"أبواب"
|
|
1417
|
-
];
|
|
1418
|
-
const MARKER_PREFIXES = [
|
|
1419
|
-
"بسم الله",
|
|
1420
|
-
"توكلت على الله",
|
|
1421
|
-
"آخر كتاب",
|
|
1422
|
-
"ويتلوه"
|
|
1423
|
-
];
|
|
1424
|
-
const NOISE_TOKENS = [
|
|
1425
|
-
"قال",
|
|
1426
|
-
"وقيل",
|
|
1427
|
-
"ويقال",
|
|
1428
|
-
"وفي",
|
|
1429
|
-
"يعني",
|
|
1430
|
-
"فإذا"
|
|
1431
|
-
];
|
|
1432
|
-
const emptyCounts = () => ({
|
|
1433
|
-
chapter: 0,
|
|
1434
|
-
cluster: 0,
|
|
1435
|
-
codeLine: 0,
|
|
1436
|
-
entry: 0,
|
|
1437
|
-
inlineSubentry: 0,
|
|
1438
|
-
lineEntry: 0,
|
|
1439
|
-
marker: 0,
|
|
1440
|
-
noise: 0,
|
|
1441
|
-
pairedForms: 0
|
|
1442
|
-
});
|
|
1443
|
-
const extractWrappedLemma = (lemma) => lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim();
|
|
1444
|
-
const stripLeadingWrappers = (text) => text.replace(/^[[{(]+\s*/u, "").trim();
|
|
1445
|
-
const isDelimitedPrefixMatch$1 = (text, prefix) => {
|
|
1446
|
-
if (text === prefix) return true;
|
|
1447
|
-
if (!text.startsWith(prefix)) return false;
|
|
1448
|
-
const nextChar = text[prefix.length];
|
|
1449
|
-
return nextChar === void 0 || /[\s:،؛()[\]{}\-–—]/u.test(nextChar);
|
|
1437
|
+
//#region src/preprocessing/transforms.ts
|
|
1438
|
+
/** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
|
|
1439
|
+
const assertNever$2 = (x) => {
    // Always throws; the offending value is JSON-serialized into the message.
    const description = JSON.stringify(x);
    throw new Error(`Unknown preprocess transform type: ${description}`);
};
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1442
|
+
/** Check if a character is whitespace (space, newline, tab, etc.) */
|
|
1443
|
+
const isWhitespace = (char) => {
    const whitespacePattern = /\s/;
    return whitespacePattern.test(char);
};
|
|
1444
|
+
/**
|
|
1445
|
+
* Check if a character code is a zero-width control character.
|
|
1446
|
+
*
|
|
1447
|
+
* Covers:
|
|
1448
|
+
* - U+200B–U+200F (Zero Width Space, Joiners, Direction Marks)
|
|
1449
|
+
* - U+202A–U+202E (Bidirectional Formatting)
|
|
1450
|
+
* - U+2060–U+2064 (Word Joiner, Invisible Operators)
|
|
1451
|
+
* - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
|
|
1452
|
+
*/
|
|
1453
|
+
const isZeroWidth = (code) => {
    if (code >= 0x200b && code <= 0x200f) return true;
    if (code >= 0x202a && code <= 0x202e) return true;
    if (code >= 0x2060 && code <= 0x2064) return true;
    return code === 0xfeff;
};
|
|
1454
|
+
/**
|
|
1455
|
+
* Remove zero-width control characters from text.
|
|
1456
|
+
*
|
|
1457
|
+
* @param text - Input text
|
|
1458
|
+
* @param mode - 'strip' (default) removes entirely, 'space' replaces with space
|
|
1459
|
+
* @returns Text with zero-width characters removed or replaced
|
|
1460
|
+
*/
|
|
1461
|
+
const removeZeroWidth = (text, mode = "strip") => {
    // Any mode other than "space" simply deletes the characters.
    if (mode !== "space") return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
    let output = "";
    // Starts true so a zero-width character at the very beginning adds no space.
    let previousWasWhitespace = true;
    for (let position = 0; position < text.length; position += 1) {
        if (!isZeroWidth(text.charCodeAt(position))) {
            const char = text[position];
            output += char;
            previousWasWhitespace = isWhitespace(char);
            continue;
        }
        // Emit at most one space per run, and none right after existing whitespace.
        if (!previousWasWhitespace && output.length > 0) {
            output += " ";
            previousWasWhitespace = true;
        }
    }
    return output;
};
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1479
|
+
/**
 * Collapse every run of two or more consecutive periods into a single
 * ellipsis character (…).
 *
 * Prevents `{{tarqim}}` from false-matching inside an ellipsis, since the `.`
 * in tarqim matches individual periods.
 *
 * @param text - Input text
 * @returns Text with period sequences replaced by ellipsis
 */
const condenseEllipsis = (text) => {
    const periodRun = /\.{2,}/g;
    return text.replace(periodRun, "…");
};
|
|
1489
|
+
/**
 * Join a free-standing و (waw) to the word that follows it.
 *
 * Fixes OCR/digitization artifacts: every ' و ' becomes ' و', so the waw
 * keeps its leading space and loses the trailing one.
 *
 * @param text - Input text
 * @returns Text with trailing waw joined to following word
 */
const fixTrailingWaw = (text) => text.split(" و ").join(" و");
|
|
1498
|
+
/**
 * Test a page ID against the optional inclusive `min` / `max` bounds of a
 * constraint. A bound that is undefined is not enforced.
 */
const isInRange = (pageId, constraint) => {
    const { min, max } = constraint;
    const belowMin = min !== undefined && pageId < min;
    const aboveMax = max !== undefined && pageId > max;
    return !(belowMin || aboveMax);
};
|
|
1506
|
+
/**
 * Bring a transform into object form: a bare string name becomes
 * `{ type: name }`; anything else is returned as-is.
 */
const normalizeTransform = (transform) => (typeof transform === "string" ? { type: transform } : transform);
|
|
1513
|
+
/**
 * Run the preprocessing transforms over one page's content.
 *
 * Transforms are applied in array order, each receiving the previous one's
 * output. A transform whose `min`/`max` constraint excludes `pageId` is
 * skipped.
 *
 * @param content - Page content to transform
 * @param pageId - Page ID for constraint checking
 * @param transforms - Array of transforms to apply
 * @returns Transformed content
 */
const applyPreprocessToPage = (content, pageId, transforms) => {
    let text = content;
    for (const entry of transforms) {
        const rule = normalizeTransform(entry);
        if (isInRange(pageId, rule)) {
            if (rule.type === "removeZeroWidth") {
                text = removeZeroWidth(text, rule.mode ?? "strip");
            } else if (rule.type === "condenseEllipsis") {
                text = condenseEllipsis(text);
            } else if (rule.type === "fixTrailingWaw") {
                text = fixTrailingWaw(text);
            } else {
                // Unknown transform type: delegate to the exhaustiveness helper.
                assertNever$2(rule.type);
            }
        }
    }
    return text;
};
|
|
1544
|
+
//#endregion
|
|
1545
|
+
//#region src/validation/validate-segments.ts
|
|
1463
1546
|
/**
 * Creates a short, single-line preview of text content for error reporting.
 *
 * Whitespace runs are collapsed to one space and both ends are trimmed.
 * Content longer than 140 UTF-16 code units is cut at 140 and suffixed with
 * `...`. If that cut would land between the two halves of a surrogate pair,
 * the dangling high surrogate is dropped as well, so the preview never ends
 * in half of a character.
 *
 * @param text - Content to preview
 * @returns The normalized (and possibly truncated) preview string
 */
const buildPreview = (text) => {
    const limit = 140;
    const normalized = text.replace(/\s+/g, " ").trim();
    if (normalized.length <= limit) return normalized;
    const lastKept = normalized.charCodeAt(limit - 1);
    const firstDropped = normalized.charCodeAt(limit);
    // A high surrogate kept while its low surrogate is dropped would be a broken character.
    const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    return `${normalized.slice(0, splitsPair ? limit - 1 : limit)}...`;
};
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => ({
|
|
1486
|
-
kind,
|
|
1487
|
-
lemma,
|
|
1488
|
-
line: lineNumber,
|
|
1489
|
-
pageId: page.id,
|
|
1490
|
-
text
|
|
1555
|
+
/**
 * Produces a lightweight snapshot of a segment (content preview plus its
 * `from`/`to` page IDs) for embedding in validation issues.
 */
const buildSegmentSnapshot = (segment) => {
    const { content, from, to } = segment;
    return {
        contentPreview: buildPreview(content),
        from,
        to
    };
};
|
|
1492
|
-
const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
|
|
1493
|
-
if (!rawLine.startsWith(HEADING_PREFIX$1)) return false;
|
|
1494
|
-
const kind = classifyDictionaryHeading(rawLine);
|
|
1495
|
-
matches.push(createHeadingMatch(kind, page, rawLine, lineNumber));
|
|
1496
|
-
return true;
|
|
1497
|
-
};
|
|
1498
|
-
const scanLineEntry = (page, rawLine, lineNumber, matches) => {
|
|
1499
|
-
const lineEntry = rawLine.match(PLAIN_ENTRY_RE);
|
|
1500
|
-
if (!lineEntry?.groups?.lemma) return;
|
|
1501
|
-
matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lineEntry.groups.lemma)));
|
|
1502
|
-
};
|
|
1503
|
-
const scanPairedForms = (page, rawLine, lineNumber, matches) => {
|
|
1504
|
-
const pairedForms = rawLine.match(PAIRED_FORMS_RE);
|
|
1505
|
-
if (!pairedForms?.groups?.forms) return;
|
|
1506
|
-
matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, pairedForms.groups.forms));
|
|
1507
|
-
};
|
|
1508
|
-
const scanCodeLine = (page, rawLine, lineNumber, matches) => {
|
|
1509
|
-
const codeLine = rawLine.match(CODE_LINE_RE);
|
|
1510
|
-
if (!codeLine?.groups?.codes) return;
|
|
1511
|
-
matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codeLine.groups.codes));
|
|
1512
|
-
};
|
|
1513
|
-
const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
|
|
1514
|
-
for (const match of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
|
|
1515
|
-
if (!match.groups?.lemma) continue;
|
|
1516
|
-
matches.push(createSurfaceMatch("inlineSubentry", page, match.groups.lemma, lineNumber, match.groups.lemma));
|
|
1517
|
-
}
|
|
1518
|
-
};
|
|
1519
1563
|
/**
 * Normalizes page content: applies the `options.preprocess` transforms (when
 * any are configured) and then standardizes line endings. Returns new
 * `{ content, id }` objects, one per input page.
 */
const normalizePages = (pages, options) => {
    const transforms = options.preprocess ?? [];
    // Skip the preprocessing pass entirely when no transforms are configured.
    const preprocess = transforms.length
        ? (page) => applyPreprocessToPage(page.content, page.id, transforms)
        : (page) => page.content;
    return pages.map((page) => ({
        content: normalizeLineEndings(preprocess(page)),
        id: page.id
    }));
};
|
|
1536
1575
|
/**
 * Joins all page content into a single string with boundary tracking.
 *
 * Each boundary records the page `id`, the `start` offset of its content in
 * the joined string, and `end = start + content.length` (the offset just past
 * its last character). Consecutive pages are separated by `joiner`.
 *
 * @returns `{ boundaries, joined }`
 */
const buildJoinedContent = (pages, joiner) => {
    const boundaries = [];
    let cursor = 0;
    for (const [index, page] of pages.entries()) {
        const start = cursor;
        const end = start + page.content.length;
        boundaries.push({
            end,
            id: page.id,
            start
        });
        // The joiner follows every page except the last one.
        const isLast = index === pages.length - 1;
        cursor = isLast ? end : end + joiner.length;
    }
    return {
        boundaries,
        joined: pages.map((page) => page.content).join(joiner)
    };
};
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1558
|
-
const
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
const
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
allowSingleLetter: family.allowSingleLetter ?? false
|
|
1568
|
-
};
|
|
1569
|
-
case "lineEntry": return {
|
|
1570
|
-
...family,
|
|
1571
|
-
allowMultiWord: family.allowMultiWord ?? false,
|
|
1572
|
-
allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false,
|
|
1573
|
-
wrappers: family.wrappers ?? "none"
|
|
1574
|
-
};
|
|
1575
|
-
case "inlineSubentry": return {
|
|
1576
|
-
...family,
|
|
1577
|
-
prefixes: family.prefixes ?? ["و"],
|
|
1578
|
-
stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true
|
|
1579
|
-
};
|
|
1580
|
-
case "codeLine": return {
|
|
1581
|
-
...family,
|
|
1582
|
-
wrappers: family.wrappers ?? "either"
|
|
1583
|
-
};
|
|
1584
|
-
case "pairedForms": return {
|
|
1585
|
-
...family,
|
|
1586
|
-
requireStatusTail: family.requireStatusTail ?? false,
|
|
1587
|
-
separator: family.separator ?? "comma"
|
|
1588
|
-
};
|
|
1589
|
-
default: return assertNever$2(family);
|
|
1599
|
+
/**
 * Binary search for the page whose boundary contains a character offset of
 * the joined content.
 *
 * An offset matches a boundary when `start <= offset <= end` (both ends
 * inclusive). When no boundary matches, an offset greater than the last
 * boundary's `end` resolves to the last page's id; every other miss (before
 * the first boundary, in a gap between two boundaries, or an empty boundary
 * list) returns undefined.
 */
const findBoundaryIdForOffset = (offset, boundaries) => {
    let low = 0;
    let high = boundaries.length - 1;
    while (low <= high) {
        const middle = (low + high) >>> 1;
        const { end, id, start } = boundaries[middle];
        if (offset < start) {
            high = middle - 1;
            continue;
        }
        if (offset > end) {
            low = middle + 1;
            continue;
        }
        return id;
    }
    if (boundaries.length === 0) return undefined;
    const last = boundaries[boundaries.length - 1];
    return offset > last.end ? last.id : undefined;
};
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1617
|
+
/**
 * Helper to construct a standardized validation issue object.
 *
 * Every issue carries `actual` (the segment's own `from`/`to`), a `segment`
 * snapshot, `segmentIndex`, `severity: "error"` and `type`. Fields in
 * `overrides` (other than `matchIndex`) are spread over those defaults, and
 * each case supplies a default `evidence` / `hint` when the caller gave none.
 *
 * @param type - Issue type; unknown types produce a bare issue with no evidence/hint defaults
 * @param segment - Segment the issue is about
 * @param segmentIndex - Index of the segment in the validated list
 * @param overrides - Optional field overrides; `matchIndex` is only surfaced
 *   inside `pageContext` of a `page_attribution_mismatch`
 * @param pageMap - Optional map from page id to page, used to attach `pageContext`
 * @returns The issue object
 */
const createIssue$1 = (type, segment, segmentIndex, overrides = {}, pageMap) => {
    // matchIndex is not an issue field of its own; keep it out of the spread.
    const { matchIndex, ...restOverrides } = overrides;
    const base = {
        actual: {
            from: segment.from,
            to: segment.to
        },
        segment: buildSegmentSnapshot(segment),
        segmentIndex,
        ...restOverrides
    };
    switch (type) {
        case "page_not_found": return {
            ...base,
            evidence: overrides.evidence ?? `Segment.from=${segment.from} does not exist in input pages.`,
            // Honour a caller-supplied hint, as every other case does.
            hint: overrides.hint ?? "Check page IDs passed into segmentPages() and validateSegments().",
            severity: "error",
            type
        };
        case "content_not_found": {
            const page = pageMap?.get(segment.from);
            return {
                ...base,
                evidence: overrides.evidence ?? "Segment content not found in any page content.",
                hint: overrides.hint ?? "Check preprocessing options, joiner settings, or whitespace normalization.",
                pageContext: page ? {
                    pageId: page.id,
                    pagePreview: buildPreview(page.content)
                } : void 0,
                severity: "error",
                type
            };
        }
        case "page_attribution_mismatch": {
            // Page where the content was actually located, if the caller reported one.
            const matchedFromId = overrides.expected?.from ?? overrides.actual?.from ?? segment.from;
            const actualPage = pageMap?.get(matchedFromId);
            return {
                ...base,
                evidence: overrides.evidence ?? `Content found in joined content at page ${matchedFromId}, but segment.from=${segment.from}.`,
                hint: overrides.hint ?? "Check duplicate content handling and boundary detection rules.",
                pageContext: actualPage ? {
                    matchIndex: matchIndex ?? -1,
                    pageId: actualPage.id,
                    pagePreview: buildPreview(actualPage.content)
                } : void 0,
                severity: "error",
                type
            };
        }
        case "max_pages_violation": {
            // Fall back to the segment's own span so the default message never prints "undefined".
            const spanEnd = overrides.actual?.to ?? segment.to ?? segment.from;
            return {
                ...base,
                evidence: overrides.evidence ?? `Segment spans pages ${segment.from}-${spanEnd}.`,
                hint: overrides.hint ?? "Check maxPages windowing in breakpoint-processor.ts and page constraints.",
                severity: "error",
                type
            };
        }
        default: return {
            ...base,
            severity: "error",
            type
        };
    }
};
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1683
|
+
/**
 * Collects occurrences of `content` inside `joined`.
 *
 * Only matches that START in `[searchStart, searchEnd)` are reported (a match
 * may extend past `searchEnd`). Overlapping occurrences are included. At most
 * `limit` matches are returned, which keeps highly repetitive content from
 * becoming a performance cliff.
 *
 * @returns Array of `{ end, start }` with `end` being the offset of the
 *   match's last character (inclusive)
 */
const findJoinedMatches = (content, joined, searchStart, searchEnd, limit = Infinity) => {
    const found = [];
    if (!content || searchStart >= searchEnd) return found;
    for (
        let at = joined.indexOf(content, searchStart);
        at >= 0 && at < searchEnd && found.length < limit;
        at = joined.indexOf(content, at + 1)
    ) {
        found.push({
            end: at + content.length - 1,
            start: at
        });
    }
    return found;
};
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1703
|
+
/**
 * Verifies that a matched segment stays within the allowed page span.
 *
 * The page on which the match ends is resolved from `matchEnd`; if it cannot
 * be resolved no issue is raised. Checks, in order:
 * 1. `maxPages === 0`: the match must end on `segment.from`.
 * 2. `segment.to` is set: the match must not end after `segment.to`.
 * 3. otherwise, when `maxPages` is set: the implicit span
 *    (`end page - segment.from`) must not exceed `maxPages`.
 *
 * @param _expectedBoundaryEnd - Unused
 * @returns An array holding one `max_pages_violation` issue, or an empty array
 */
const checkMaxPagesViolation = (segment, segmentIndex, maxPages, matchEnd, _expectedBoundaryEnd, boundaries) => {
    const actualToId = findBoundaryIdForOffset(matchEnd, boundaries);
    if (actualToId === undefined) return [];
    // All three violations share the same shape; only the expected end page and the evidence differ.
    const violation = (expectedTo, evidence) => [createIssue$1("max_pages_violation", segment, segmentIndex, {
        actual: {
            from: segment.from,
            to: actualToId
        },
        evidence,
        expected: {
            from: segment.from,
            to: expectedTo
        }
    })];
    if (maxPages === 0 && actualToId !== segment.from) {
        return violation(segment.from, `Segment spans pages ${segment.from}-${actualToId} in joined content (maxPages=0).`);
    }
    if (segment.to !== undefined) {
        // An explicit segment.to claim takes precedence over the implicit span check.
        return actualToId > segment.to
            ? violation(segment.to, `Segment content ends on page ${actualToId} but segment.to is ${segment.to}.`)
            : [];
    }
    if (maxPages !== undefined) {
        const span = actualToId - segment.from;
        if (span > maxPages) {
            return violation(segment.from + maxPages, `Segment spans ${span} pages (maxPages=${maxPages}).`);
        }
    }
    return [];
};
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
if (
|
|
1692
|
-
|
|
1693
|
-
const
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1751
|
+
/**
 * Handles validation when `segment.from` has no boundary in the joined content.
 *
 * Searches the whole joined string for the first occurrence of the segment
 * content. With no occurrence a `content_not_found` issue is returned;
 * otherwise a `page_attribution_mismatch` issue reporting the pages where the
 * content was actually located.
 */
const handleMissingBoundary = (segment, segmentIndex, joined, boundaries, pageMap) => {
    const [firstMatch] = findJoinedMatches(segment.content, joined, 0, joined.length, 1);
    if (firstMatch === undefined) {
        return [createIssue$1("content_not_found", segment, segmentIndex, { evidence: "Segment content not found in any page content." }, pageMap)];
    }
    const locatedFromId = findBoundaryIdForOffset(firstMatch.start, boundaries);
    const locatedToId = findBoundaryIdForOffset(firstMatch.end, boundaries);
    const overrides = {
        actual: {
            from: segment.from,
            to: segment.to
        },
        evidence: `Content found in joined content at page ${locatedFromId}, but segment.from=${segment.from}.`,
        expected: {
            from: locatedFromId,
            to: locatedToId
        },
        matchIndex: firstMatch.start
    };
    return [createIssue$1("page_attribution_mismatch", segment, segmentIndex, overrides, pageMap)];
};
|
|
1698
1774
|
/**
 * Performs a widened search when the direct check fails.
 *
 * 1. Looks for up to 5 matches in the expected window widened by 1000
 *    characters on each side. A match starting inside the expected boundary
 *    is accepted and only checked for page-limit violations; otherwise the
 *    first nearby match is reported as a `page_attribution_mismatch`.
 * 2. With no nearby match, and only when the content is shorter than
 *    `validationOptions.fullSearchThreshold` (default 500), the whole document
 *    is searched (up to 50 matches). A match starting on `segment.from` is
 *    accepted; failing that, the first match is reported as a mismatch.
 * 3. Otherwise a `content_not_found` issue is returned.
 */
const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions) => {
    const { content } = segment;
    const margin = 1e3;
    // Issue for content that exists, but on a different page than the segment claims.
    const reportMismatch = (match) => {
        const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
        const actualToId = findBoundaryIdForOffset(match.end, boundaries);
        return [createIssue$1("page_attribution_mismatch", segment, segmentIndex, {
            actual: {
                from: segment.from,
                to: segment.to
            },
            evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
            expected: {
                from: actualFromId,
                to: actualToId
            },
            matchIndex: match.start
        }, pageMap)];
    };
    // A correctly attributed match still has to respect the page-span limits.
    const checkLimits = (match) => checkMaxPagesViolation(segment, segmentIndex, maxPages, match.end, expectedBoundary.end, boundaries);

    const nearby = findJoinedMatches(content, joined, Math.max(0, searchStart - margin), Math.min(joined.length, searchEnd + margin), 5);
    if (nearby.length > 0) {
        const aligned = nearby.find((m) => m.start >= expectedBoundary.start && m.start <= expectedBoundary.end);
        return aligned ? checkLimits(aligned) : reportMismatch(nearby[0]);
    }

    const threshold = validationOptions?.fullSearchThreshold ?? 500;
    if (content.length < threshold) {
        const everywhere = findJoinedMatches(content, joined, 0, joined.length, 50);
        const onClaimedPage = everywhere.find((m) => findBoundaryIdForOffset(m.start, boundaries) === segment.from);
        if (onClaimedPage) return checkLimits(onClaimedPage);
        if (everywhere.length > 0) return reportMismatch(everywhere[0]);
    }
    return [createIssue$1("content_not_found", segment, segmentIndex, {
        evidence: `Segment content (${content.length} chars) not found in expected window.`,
        hint: "Check page boundary attribution in segmenter.ts."
    }, pageMap)];
};
|
|
1712
1834
|
/**
 * Calculates the exclusive end index of the search window for a segment.
 *
 * - No `segment.to`: one past the end of the `segment.from` boundary.
 * - `segment.to` with a known boundary: one past that boundary's end.
 * - `segment.to` without a boundary: 50,000 characters past the expected
 *   boundary's end, capped at `joinedLength`.
 */
const getSearchRange = (segment, expectedBoundary, boundaryMap, joinedLength) => {
    if (segment.to === undefined) return expectedBoundary.end + 1;
    const endBoundary = boundaryMap.get(segment.to);
    return endBoundary
        ? endBoundary.end + 1
        : Math.min(joinedLength, expectedBoundary.end + 5e4);
};
|
|
1727
|
-
//#endregion
|
|
1728
|
-
//#region src/types/rules.ts
|
|
1729
1846
|
/**
 * Validates attribution for a single segment by searching for its content in
 * the joined text.
 *
 * Empty content yields `content_not_found`. A segment whose `from` page has no
 * boundary is handed to `handleMissingBoundary`. Otherwise the content is
 * looked up starting at the `from` page's start offset: a hit before the end
 * of the search window is checked for page-limit violations, anything else
 * goes through the widened fallback search.
 *
 * @returns The issues found for this segment (possibly empty)
 */
const getAttributionIssues = (segment, segmentIndex, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions) => {
    const { content } = segment;
    if (!content) {
        return [createIssue$1("content_not_found", segment, segmentIndex, { evidence: "Segment content is empty." }, pageMap)];
    }
    const expectedBoundary = boundaryMap.get(segment.from);
    if (!expectedBoundary) {
        return handleMissingBoundary(segment, segmentIndex, joined, boundaries, pageMap);
    }
    const searchEnd = getSearchRange(segment, expectedBoundary, boundaryMap, joined.length);
    const searchStart = expectedBoundary.start;
    const matchStart = joined.indexOf(content, searchStart);
    const foundInWindow = matchStart !== -1 && matchStart < searchEnd;
    if (!foundInWindow) {
        return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
    }
    const matchEnd = matchStart + content.length - 1;
    return checkMaxPagesViolation(segment, segmentIndex, maxPages, matchEnd, expectedBoundary.end, boundaries);
};
|
|
1860
|
+
/**
 * Static check of the segment's declared page span (`to - from`) against
 * `maxPages`, done on metadata alone before any content searching.
 *
 * @returns A `max_pages_violation` issue, or null when `maxPages` or
 *   `segment.to` is undefined or the declared span is within the limit
 */
const checkStaticMaxPages = (segment, index, maxPages) => {
    if (maxPages === undefined || segment.to === undefined) return null;
    const singlePageOnly = maxPages === 0;
    const span = segment.to - segment.from;
    // maxPages=0 demands from === to exactly; otherwise only the span size matters.
    const violated = singlePageOnly ? segment.to !== segment.from : span > maxPages;
    if (!violated) return null;
    const details = singlePageOnly
        ? {
            evidence: "maxPages=0 requires all segments to stay within one page.",
            expected: {
                from: segment.from,
                to: segment.from
            },
            hint: "Check boundary detection in breakpoint-utils.ts."
        }
        : {
            evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
            expected: {
                from: segment.from,
                to: segment.from + maxPages
            },
            hint: "Check breakpoint windowing and page attribution in breakpoint-processor.ts."
        };
    return createIssue$1("max_pages_violation", segment, index, details);
};
|
|
1887
|
+
/**
 * Validates a list of segments against the source pages.
 *
 * Checks for:
 * - Page existence (invalid IDs)
 * - Content fidelity (content must exist in pages)
 * - Page attribution (from/to must match content location)
 * - Page constraints (maxPages violations)
 *
 * A segment whose `from` page does not exist gets a single `page_not_found`
 * issue and is not checked further.
 *
 * @param pages Input pages used for segmentation
 * @param options Options used during segmentation (`preprocess`, `pageJoiner`
 *   and `maxPages` are read here so validation sees the same text)
 * @param segments The output segments to validate
 * @param validationOptions Optional settings for validation behavior
 * @returns A report with the issue list, an `ok` flag (true when there are no
 *   issues) and summary counts
 */
const validateSegments = (pages, options, segments, validationOptions) => {
    const normalizedPages = normalizePages(pages, options);
    const joiner = options.pageJoiner === "newline" ? "\n" : " ";
    const { boundaries, joined } = buildJoinedContent(normalizedPages, joiner);
    const boundaryMap = new Map(boundaries.map((boundary) => [boundary.id, boundary]));
    const pageMap = new Map(normalizedPages.map((page) => [page.id, page]));
    const { maxPages } = options;
    const issues = [];
    for (const [position, segment] of segments.entries()) {
        // Nothing else can be checked when the starting page is unknown.
        if (!pageMap.has(segment.from)) {
            issues.push(createIssue$1("page_not_found", segment, position));
            continue;
        }
        if (segment.to !== undefined && !pageMap.has(segment.to)) {
            issues.push(createIssue$1("page_not_found", segment, position, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
        }
        const staticIssue = checkStaticMaxPages(segment, position, maxPages);
        if (staticIssue) issues.push(staticIssue);
        issues.push(...getAttributionIssues(segment, position, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions));
    }
    let errors = 0;
    let warnings = 0;
    for (const { severity } of issues) {
        if (severity === "error") errors++;
        else if (severity === "warn") warnings++;
    }
    return {
        issues,
        ok: issues.length === 0,
        summary: {
            errors,
            issues: issues.length,
            pageCount: pages.length,
            segmentCount: segments.length,
            warnings
        }
    };
};
|
|
1752
1937
|
//#endregion
|
|
1753
1938
|
//#region src/segmentation/debug-meta.ts
|
|
1754
1939
|
const resolveDebugConfig = (debug) => {
|
|
@@ -1843,7 +2028,14 @@ const getSegmentDebugReason = (segment, options) => {
|
|
|
1843
2028
|
return getDebugReason(segment.meta, options);
|
|
1844
2029
|
};
|
|
1845
2030
|
//#endregion
|
|
1846
|
-
//#region src/dictionary/
|
|
2031
|
+
//#region src/dictionary/constants.ts
|
|
2032
|
+
/**
|
|
2033
|
+
* Shared constants used by the dictionary runtime: phrase lists, regex patterns,
|
|
2034
|
+
* keyword sets, and structural-leak detection data.
|
|
2035
|
+
*
|
|
2036
|
+
* Keeping these here allows both runtime.ts and heading-classifier.ts to import
|
|
2037
|
+
* from a single source of truth without circular dependencies.
|
|
2038
|
+
*/
|
|
1847
2039
|
const INTRO_PHRASES = [
|
|
1848
2040
|
"وقال",
|
|
1849
2041
|
"قال",
|
|
@@ -1994,7 +2186,16 @@ const CONTINUATION_PREV_WORDS = [
|
|
|
1994
2186
|
"ثم",
|
|
1995
2187
|
"وجل"
|
|
1996
2188
|
];
|
|
1997
|
-
const
|
|
2189
|
+
const NORMALIZED_AUTHORITY_INTRO_PATTERN = [
|
|
2190
|
+
"أبو",
|
|
2191
|
+
"ابن",
|
|
2192
|
+
"ثعلب",
|
|
2193
|
+
"الليث",
|
|
2194
|
+
"الأزهري",
|
|
2195
|
+
"الجوهري",
|
|
2196
|
+
"الفراء"
|
|
2197
|
+
].map((term) => escapeRegex(normalizeArabicForComparison(term))).join("|");
|
|
2198
|
+
const NORMALIZED_AUTHORITY_RE = new RegExp(`^(?:(?:و)?قال\\s+(?:${NORMALIZED_AUTHORITY_INTRO_PATTERN})(?=$|[\\s:،؛,.])|(?:${NORMALIZED_AUTHORITY_INTRO_PATTERN})\\s+\\S+)`, "u");
|
|
1998
2199
|
const AUTHORITY_HEAD_WORDS = [
|
|
1999
2200
|
"الأزهري",
|
|
2000
2201
|
"الأصمعي",
|
|
@@ -2015,13 +2216,22 @@ const AUTHORITY_HEAD_WORDS = [
|
|
|
2015
2216
|
"ثعلب",
|
|
2016
2217
|
"شمر"
|
|
2017
2218
|
];
|
|
2219
|
+
/** Aggressive-precision authority terms (subset used for fast startsWith checks). */
|
|
2220
|
+
const AUTHORITY_AGGRESSIVE_TERMS = [
|
|
2221
|
+
"الليث",
|
|
2222
|
+
"الأزهري",
|
|
2223
|
+
"الأصمعي",
|
|
2224
|
+
"الجوهري",
|
|
2225
|
+
"الفراء",
|
|
2226
|
+
"ثعلب",
|
|
2227
|
+
"شمر"
|
|
2228
|
+
];
|
|
2018
2229
|
const STRONG_SENTENCE_TERMINATORS$1 = /[.!?؟؛۔…]$/u;
|
|
2019
|
-
const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»"
|
|
2020
|
-
const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»"
|
|
2230
|
+
const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»""'''()[\]{}<>]+$/u;
|
|
2231
|
+
/**
 * Matches the trailing run of characters that can follow the last word of a
 * text: everything in the page-wrap noise set (whitespace, ASCII and
 * Arabic-Indic digits, guillemets, straight and curly quotes, brackets) plus
 * Latin and Arabic sentence punctuation.
 *
 * The curly quotes are spelled as `\u` escapes (U+201C, U+201D, U+2018,
 * U+2019) so they cannot be silently flattened into duplicate straight quotes,
 * which would leave a typographic closing quote glued to the last word.
 */
const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»"'\u201C\u201D\u2018\u2019()[\]{}<>.,!?؟؛،:]+$/u;
|
|
2021
2232
|
const ARABIC_WORD_REGEX$1 = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
|
|
2022
|
-
const
|
|
2023
|
-
const
|
|
2024
|
-
const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN})$`, "u");
|
|
2233
|
+
const CODE_LINE_PATTERN$1 = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
|
|
2234
|
+
const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1})$`, "u");
|
|
2025
2235
|
const STATUS_TAIL_PATTERN = "(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)";
|
|
2026
2236
|
const GATE_TOKEN_MAP = {
|
|
2027
2237
|
bab: "باب",
|
|
@@ -2029,18 +2239,38 @@ const GATE_TOKEN_MAP = {
|
|
|
2029
2239
|
kitab: "كتاب"
|
|
2030
2240
|
};
|
|
2031
2241
|
const GATE_DELIMITER_RE = /[\s:،؛()[\]{}\-–—]/u;
|
|
2032
|
-
const
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
const
|
|
2037
|
-
|
|
2242
|
+
/**
 * Produces the comparison form of a stop-lemma word: the text is passed
 * through `normalizeArabicForComparison`, then any leading and trailing run of
 * whitespace, punctuation, brackets, guillemets, or quote characters is
 * removed, and the result is trimmed.
 */
const normalizeStopLemmaWord = (text) => {
	const comparable = normalizeArabicForComparison(text);
	const withoutLeadingNoise = comparable.replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "");
	const withoutTrailingNoise = withoutLeadingNoise.replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "");
	return withoutTrailingNoise.trim();
};
|
|
2243
|
+
/** Pre-normalized intro phrases for startsWith / endsWith checks. */
|
|
2244
|
+
const NORMALIZED_INTRO_PHRASES = INTRO_PHRASES.map(normalizeArabicForComparison);
|
|
2245
|
+
/** Pre-normalized intro tail phrases for endsWith checks. */
|
|
2246
|
+
const NORMALIZED_INTRO_TAIL_PHRASES = INTRO_TAIL_PHRASES.map(normalizeArabicForComparison);
|
|
2247
|
+
/** Pre-normalized authority head words as a Set for O(1) lookup. */
|
|
2248
|
+
const NORMALIZED_AUTHORITY_HEAD_WORDS_SET = new Set(AUTHORITY_HEAD_WORDS.map(normalizeStopLemmaWord));
|
|
2249
|
+
/** Pre-normalized aggressive authority terms for startsWith checks. */
|
|
2250
|
+
const NORMALIZED_AUTHORITY_AGGRESSIVE_TERMS = AUTHORITY_AGGRESSIVE_TERMS.map(normalizeArabicForComparison);
|
|
2251
|
+
/** Pre-normalized qualifier tail prefixes for startsWith checks. */
|
|
2252
|
+
const NORMALIZED_QUALIFIER_TAIL_PREFIXES = QUALIFIER_TAIL_PREFIXES.map(normalizeArabicForComparison);
|
|
2253
|
+
/** Pre-normalized structural lemma prefixes for startsWith checks. */
|
|
2254
|
+
const NORMALIZED_STRUCTURAL_LEMMA_PREFIXES = STRUCTURAL_LEMMA_PREFIXES.map(normalizeArabicForComparison);
|
|
2255
|
+
/** Pre-normalized structural line keywords for includes checks. */
|
|
2256
|
+
const NORMALIZED_STRUCTURAL_LINE_KEYWORDS = STRUCTURAL_LINE_KEYWORDS.map(normalizeArabicForComparison);
|
|
2257
|
+
/** Pre-normalized continuation prev words as a Set for O(1) lookup. */
|
|
2258
|
+
const NORMALIZED_CONTINUATION_PREV_WORDS_SET = new Set(CONTINUATION_PREV_WORDS.map(normalizeArabicForComparison));
|
|
2259
|
+
/** Pre-normalized 'ولل' prefix. */
|
|
2260
|
+
const NORMALIZED_WLAL_PREFIX = normalizeArabicForComparison("ولل");
|
|
2261
|
+
//#endregion
|
|
2262
|
+
//#region src/dictionary/dictionary-blockers.ts
|
|
2263
|
+
/**
|
|
2264
|
+
* Limit backwards scans to a small suffix; dictionary blockers only need the
|
|
2265
|
+
* immediate local context rather than an unbounded full-page search.
|
|
2266
|
+
*/
|
|
2267
|
+
const LAST_ARABIC_WORD_LOOKBACK_CHARS = 256;
|
|
2268
|
+
const MAX_INTRO_CONTEXT_CHARS = 240;
|
|
2269
|
+
/**
 * Matches a single character that is skipped when scanning backwards for the
 * previous meaningful character: any whitespace, the zero-width space/joiners
 * (U+200B, U+200C, U+200D), and the directional marks (U+200E, U+200F, U+061C).
 */
const IGNORABLE_BOUNDARY_CHAR_RE = /(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\u061C)/u;
|
|
2038
2270
|
const trimTrailingPageWrapNoise$1 = (text) => text.trimEnd().replace(TRAILING_PAGE_WRAP_NOISE$1, "");
|
|
2039
|
-
const endsWithStrongSentenceTerminator$1 = (pageContent) =>
|
|
2040
|
-
return STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
|
|
2041
|
-
};
|
|
2271
|
+
const endsWithStrongSentenceTerminator$1 = (pageContent) => STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
|
|
2042
2272
|
const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
|
|
2043
|
-
const windowStart = Math.max(0, endExclusive -
|
|
2273
|
+
const windowStart = Math.max(0, endExclusive - LAST_ARABIC_WORD_LOOKBACK_CHARS);
|
|
2044
2274
|
const withoutTrailingDelimiters = trimTrailingPageWrapNoise$1(text.slice(windowStart, endExclusive)).replace(TRAILING_WORD_DELIMITERS$1, "");
|
|
2045
2275
|
let lastMatch = "";
|
|
2046
2276
|
ARABIC_WORD_REGEX$1.lastIndex = 0;
|
|
@@ -2050,105 +2280,339 @@ const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
|
|
|
2050
2280
|
/**
 * Scans `text` backwards, starting just before `endExclusive`, and returns the
 * nearest character that does not match `IGNORABLE_BOUNDARY_CHAR_RE`.
 * Returns "" when every preceding character is ignorable (or there is none).
 */
const previousNonWhitespaceChar = (text, endExclusive = text.length) => {
	let cursor = endExclusive;
	while (cursor > 0) {
		cursor -= 1;
		const current = text[cursor];
		// Out-of-range positions yield undefined and are simply skipped.
		if (!current) continue;
		if (IGNORABLE_BOUNDARY_CHAR_RE.test(current)) continue;
		return current;
	}
	return "";
};
|
|
2057
|
-
const
|
|
2058
|
-
|
|
2059
|
-
const
|
|
2060
|
-
|
|
2061
|
-
|
|
2287
|
+
/**
 * Reports whether position `endExclusive` is effectively the start of the
 * page: true when every character before it matches
 * `IGNORABLE_BOUNDARY_CHAR_RE` (or there are no characters before it).
 */
const isAtPageStart = (text, endExclusive) => {
	let cursor = endExclusive - 1;
	while (cursor >= 0) {
		const current = text[cursor];
		// The first meaningful character found means we are not at the start.
		if (current && !IGNORABLE_BOUNDARY_CHAR_RE.test(current)) return false;
		cursor -= 1;
	}
	return true;
};
|
|
2294
|
+
const normalizeStopLemma = normalizeStopLemmaWord;
|
|
2295
|
+
/**
 * Returns at most `maxChars` characters of `text` that end at `endExclusive`
 * (the window start is clamped to the beginning of the string).
 */
const getTrailingContext = (text, endExclusive, maxChars = MAX_INTRO_CONTEXT_CHARS) => {
	const windowStart = Math.max(0, endExclusive - maxChars);
	return text.slice(windowStart, endExclusive);
};
|
|
2296
|
+
/**
 * Normalizes text for intro/authority phrase matching:
 *  1. applies `normalizeArabicForComparison`,
 *  2. replaces runs of slashes/backslashes with a space,
 *  3. replaces runs of guillemets, straight and curly quotes, and brackets
 *     with a space,
 *  4. collapses whitespace and trims.
 *
 * The curly quotes are spelled as `\u` escapes (U+201C, U+201D, U+2018,
 * U+2019). The previous class listed straight quotes several times instead,
 * so a phrase wrapped in typographic quotes kept its quote character and
 * failed the `startsWith` / `endsWith` phrase checks. This matches the quote
 * set stripped by `normalizeStopLemmaWord`.
 */
const normalizeIntroContextText = (text) => normalizeArabicForComparison(text)
	.replace(/[/\\]+/gu, " ")
	.replace(/[«»"'\u201C\u201D\u2018\u2019()[\]{}]+/gu, " ")
	.replace(/\s+/gu, " ")
	.trim();
|
|
2297
|
+
const normalizeForIntroTailCheck = (text) => normalizeIntroContextText(text).replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
|
|
2298
|
+
const isIntroCandidate = (text) => {
|
|
2299
|
+
const normalized = normalizeIntroContextText(text);
|
|
2300
|
+
return NORMALIZED_INTRO_PHRASES.some((phrase) => normalized.startsWith(phrase));
|
|
2301
|
+
};
|
|
2302
|
+
const endsWithIntroContext = (text) => {
|
|
2303
|
+
const trimmed = text.trimEnd();
|
|
2304
|
+
if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
|
|
2305
|
+
const normalized = normalizeForIntroTailCheck(trimmed);
|
|
2306
|
+
if (!normalized) return false;
|
|
2307
|
+
if (NORMALIZED_INTRO_PHRASES.some((phrase) => normalized.endsWith(phrase))) return true;
|
|
2308
|
+
if (NORMALIZED_INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(phrase))) return true;
|
|
2309
|
+
return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
|
|
2310
|
+
};
|
|
2311
|
+
const isAuthorityCandidate = (text, precision) => {
|
|
2312
|
+
const head = normalizeStopLemma(text.split(":", 1)[0] ?? text);
|
|
2313
|
+
if (head && NORMALIZED_AUTHORITY_HEAD_WORDS_SET.has(head)) return true;
|
|
2314
|
+
const normalized = normalizeIntroContextText(text);
|
|
2315
|
+
if (NORMALIZED_AUTHORITY_RE.test(normalized)) return true;
|
|
2316
|
+
if (precision === "aggressive") return NORMALIZED_AUTHORITY_AGGRESSIVE_TERMS.some((term) => normalized.startsWith(term));
|
|
2317
|
+
return false;
|
|
2318
|
+
};
|
|
2319
|
+
const hasBlockedQualifierTail = (lemma) => {
|
|
2320
|
+
const parts = lemma.split(/[،,]/u).map((part) => part.trim()).filter(Boolean);
|
|
2321
|
+
if (parts.length < 2) return false;
|
|
2322
|
+
const tail = normalizeArabicForComparison(parts.slice(1).join(" "));
|
|
2323
|
+
return NORMALIZED_QUALIFIER_TAIL_PREFIXES.some((prefix) => tail.startsWith(prefix));
|
|
2324
|
+
};
|
|
2325
|
+
const looksLikeStructuralLeak = (candidate) => {
|
|
2326
|
+
if (!candidate.lemma) return false;
|
|
2327
|
+
const normalizedLemma = normalizeArabicForComparison(candidate.lemma);
|
|
2328
|
+
if (candidate.kind === "entry" && (/^[^\p{Script=Arabic}\d]+/u.test(candidate.lemma) || candidate.lemma.includes("{") || candidate.lemma.includes("}") || candidate.lemma.includes("##"))) return true;
|
|
2329
|
+
if (candidate.kind === "entry" && BARE_CODE_LEMMA_RE.test(candidate.lemma) && (candidate.text === candidate.lemma || candidate.text === `## ${candidate.lemma}` || candidate.text.startsWith(`## ${candidate.lemma}`) || candidate.text.startsWith(`${candidate.lemma}\n## `))) return true;
|
|
2330
|
+
if (candidate.family !== "pairedForms" && candidate.lemma.split(/\s+/u).filter(Boolean).length > 4) return true;
|
|
2331
|
+
if (NORMALIZED_STRUCTURAL_LEMMA_PREFIXES.some((prefix) => normalizedLemma.startsWith(prefix))) return true;
|
|
2332
|
+
if (normalizedLemma.startsWith(NORMALIZED_WLAL_PREFIX)) return true;
|
|
2333
|
+
const structuralText = candidate.text.startsWith("## ") ? candidate.text.slice(3).trim() : candidate.text;
|
|
2334
|
+
if (/^[\d\u0660-\u0669]+\s*-\s*\([^)]+\)(?:\s+##.*)?$/u.test(structuralText)) return true;
|
|
2335
|
+
const normalizedText = normalizeArabicForComparison(structuralText);
|
|
2336
|
+
if (STRUCTURAL_LINE_PATTERNS.some((pattern) => pattern.test(structuralText))) return NORMALIZED_STRUCTURAL_LINE_KEYWORDS.some((keyword) => normalizedText.includes(keyword));
|
|
2337
|
+
return false;
|
|
2338
|
+
};
|
|
2339
|
+
/**
 * Tells whether a blocker is relevant for a candidate family. A blocker with
 * no `appliesTo` list applies to every family; otherwise the family must be
 * listed in it.
 */
const blockerApplies = (blocker, family) => {
	const allowedFamilies = blocker.appliesTo;
	if (!allowedFamilies) return true;
	return allowedFamilies.includes(family);
};
|
|
2340
|
+
const rejectsViaIntroBlocker = (candidate, blocker, localBeforeCandidate) => {
|
|
2341
|
+
if (blocker.use !== "intro") return false;
|
|
2342
|
+
return isIntroCandidate(candidate.probeText) || endsWithIntroContext(localBeforeCandidate);
|
|
2343
|
+
};
|
|
2344
|
+
const rejectsViaAuthorityBlocker = (candidate, blocker) => blocker.use === "authorityIntro" && isAuthorityCandidate(candidate.probeText, blocker.precision);
|
|
2345
|
+
const rejectsViaStopLemmaBlocker = (candidate, blocker) => {
|
|
2346
|
+
if (blocker.use !== "stopLemma" || !candidate.lemma) return false;
|
|
2347
|
+
const normalizedLemma = normalizeStopLemma(candidate.lemma);
|
|
2348
|
+
return !!normalizedLemma && blocker.normalizedWords.has(normalizedLemma);
|
|
2349
|
+
};
|
|
2350
|
+
const previousWordIsBlocked = (blocker, word) => !!word && blocker.normalizedWords.has(normalizeArabicForComparison(word));
|
|
2351
|
+
const rejectsViaPageStartPreviousWord = (blocker, pageIndex, pages) => {
|
|
2352
|
+
if (pageIndex === 0) return false;
|
|
2353
|
+
const previousPage = pages[pageIndex - 1];
|
|
2354
|
+
if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
|
|
2355
|
+
return previousWordIsBlocked(blocker, extractLastArabicWord$1(previousPage.content));
|
|
2356
|
+
};
|
|
2357
|
+
const rejectsViaPreviousWordBlocker = (pageContent, localIndex, blocker, pageIndex, pages) => {
|
|
2358
|
+
if (blocker.use !== "previousWord") return false;
|
|
2359
|
+
if (isAtPageStart(pageContent, localIndex)) {
|
|
2360
|
+
if (blocker.scope === "pageStart") return rejectsViaPageStartPreviousWord(blocker, pageIndex, pages);
|
|
2361
|
+
if (blocker.scope === "any" && rejectsViaPageStartPreviousWord(blocker, pageIndex, pages)) return true;
|
|
2362
|
+
}
|
|
2363
|
+
if (blocker.scope === "pageStart") return false;
|
|
2364
|
+
return previousWordIsBlocked(blocker, extractLastArabicWord$1(pageContent, localIndex));
|
|
2365
|
+
};
|
|
2366
|
+
const rejectsViaPreviousCharBlocker = (pageContent, localIndex, blocker) => {
|
|
2367
|
+
if (blocker.use !== "previousChar") return false;
|
|
2368
|
+
const previousChar = previousNonWhitespaceChar(pageContent, localIndex);
|
|
2369
|
+
return !!previousChar && blocker.charSet.has(previousChar);
|
|
2370
|
+
};
|
|
2371
|
+
/**
 * Applies a `pageContinuation` blocker: rejects a candidate that sits at the
 * start of a page when the previous page appears to run on into it.
 *
 * Returns false unless the blocker is a `pageContinuation` blocker, the
 * candidate is at the page start, a previous page exists, and that page does
 * not end with a strong sentence terminator. Otherwise returns true when any
 * of the following holds:
 *  - the previous page's last Arabic word is a known continuation word,
 *  - the tail of the previous page ends with intro context,
 *  - the candidate's probe text starts with an intro phrase,
 *  - the candidate's probe text looks like an authority introduction.
 *
 * Only the trailing `MAX_INTRO_CONTEXT_CHARS` window of the previous page is
 * handed to `endsWithIntroContext`, the same bound `getCandidateRejection`
 * applies to in-page context, so the whole previous page is not re-normalized
 * for every page-start candidate.
 */
const rejectsViaPageContinuationBlocker = (candidate, blocker, pageContent, pageIndex, pages) => {
	if (blocker.use !== "pageContinuation") return false;
	if (!isAtPageStart(pageContent, candidate.localIndex) || pageIndex === 0) return false;
	const previousPage = pages[pageIndex - 1];
	if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
	const previousWord = extractLastArabicWord$1(previousPage.content);
	if (previousWord && NORMALIZED_CONTINUATION_PREV_WORDS_SET.has(normalizeArabicForComparison(previousWord))) return true;
	// Trim first so trailing whitespace does not eat into the bounded window.
	const previousTail = previousPage.content.trimEnd();
	if (endsWithIntroContext(getTrailingContext(previousTail, previousTail.length))) return true;
	return isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, blocker.authorityPrecision);
};
|
|
2379
|
+
const getBlockerRejectionReason = (blocker, candidate, localBeforeCandidate, pageContent, pageIndex, pages) => {
|
|
2380
|
+
if (rejectsViaIntroBlocker(candidate, blocker, localBeforeCandidate)) return "intro";
|
|
2381
|
+
if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
|
|
2382
|
+
if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
|
|
2383
|
+
if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker, pageIndex, pages)) return "previousWord";
|
|
2384
|
+
if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
|
|
2385
|
+
if (rejectsViaPageContinuationBlocker(candidate, blocker, pageContent, pageIndex, pages)) return "pageContinuation";
|
|
2386
|
+
return null;
|
|
2387
|
+
};
|
|
2388
|
+
/**
|
|
2389
|
+
* Evaluates candidate rejection in two phases:
|
|
2390
|
+
*
|
|
2391
|
+
* Phase 1: global safety checks (not configurable per profile)
|
|
2392
|
+
* - `qualifierTail`: rejects comma-tail qualifier fragments such as "أي" and "قال"
|
|
2393
|
+
* - `structuralLeak`: rejects markdown artifacts, structural headings, and other non-lexeme leaks
|
|
2394
|
+
*
|
|
2395
|
+
* These are hard safety invariants for the Shamela-style dictionary surface,
|
|
2396
|
+
* so diagnostics report them alongside configurable blocker reasons.
|
|
2397
|
+
*
|
|
2398
|
+
* Phase 2: zone blockers (configurable per zone)
|
|
2399
|
+
* - iterates `zone.blockers` in declaration order
|
|
2400
|
+
* - returns the first matching rejection reason
|
|
2401
|
+
*/
|
|
2402
|
+
const getCandidateRejection = (candidate, zone, pageContext, pages) => {
	// Phase 1: global safety checks; the qualifier-tail reason takes precedence.
	if (hasBlockedQualifierTail(candidate.lemma ?? "")) return { reason: "qualifierTail" };
	if (looksLikeStructuralLeak(candidate)) return { reason: "structuralLeak" };
	// Phase 2: zone blockers, evaluated in declaration order; first hit wins.
	const localBeforeCandidate = getTrailingContext(pageContext.content, candidate.localIndex);
	for (const blocker of zone.blockers) {
		if (blockerApplies(blocker, candidate.family)) {
			const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
			if (reason) return { reason };
		}
	}
	return null;
};
|
|
2413
|
+
/**
|
|
2414
|
+
* Returns `true` when the candidate should be dropped (i.e. any rejection
|
|
2415
|
+
* reason exists). Convenience wrapper over `getCandidateRejection`.
|
|
2416
|
+
*/
|
|
2417
|
+
const shouldRejectCandidate = (candidate, zone, pageContext, pages) => getCandidateRejection(candidate, zone, pageContext, pages) !== null;
|
|
2418
|
+
//#endregion
|
|
2419
|
+
//#region src/dictionary/heading-classifier.ts
|
|
2420
|
+
const HEADING_PREFIX = "## ";
|
|
2421
|
+
const CODE_LINE_PATTERN = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
|
|
2422
|
+
const ARABIC_WORD_PATTERN = ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN;
|
|
2423
|
+
const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
|
|
2424
|
+
const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
|
|
2425
|
+
const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN})(?:[)\\]])?$`, "u");
|
|
2426
|
+
const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
|
|
2427
|
+
const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
|
|
2428
|
+
const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
|
|
2429
|
+
const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
|
|
2430
|
+
const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
|
|
2431
|
+
const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
|
|
2432
|
+
const COLON_NOISE_RE = /^.+:\s*.+$/u;
|
|
2433
|
+
const CHAPTER_TERMS = [
|
|
2434
|
+
"باب",
|
|
2435
|
+
"فصل",
|
|
2436
|
+
"كتاب",
|
|
2437
|
+
"حرف",
|
|
2438
|
+
"أبواب"
|
|
2439
|
+
];
|
|
2440
|
+
const MARKER_PREFIXES = [
|
|
2441
|
+
"بسم الله",
|
|
2442
|
+
"توكلت على الله",
|
|
2443
|
+
"آخر كتاب",
|
|
2444
|
+
"ويتلوه"
|
|
2445
|
+
];
|
|
2446
|
+
const NOISE_TOKENS = [
|
|
2447
|
+
"قال",
|
|
2448
|
+
"وقيل",
|
|
2449
|
+
"ويقال",
|
|
2450
|
+
"وفي",
|
|
2451
|
+
"يعني",
|
|
2452
|
+
"فإذا"
|
|
2453
|
+
];
|
|
2454
|
+
const NORMALIZED_CHAPTER_TERMS = CHAPTER_TERMS.map(normalizeArabicForComparison);
|
|
2455
|
+
const NORMALIZED_MARKER_PREFIXES = MARKER_PREFIXES.map(normalizeArabicForComparison);
|
|
2456
|
+
const NORMALIZED_NOISE_TOKENS = NOISE_TOKENS.map(normalizeArabicForComparison);
|
|
2457
|
+
/**
 * Builds a fresh counter object with one zeroed entry per dictionary surface
 * kind. A new object is returned on every call so callers can mutate it.
 */
const emptyCounts = () => {
	const surfaceKinds = [
		"chapter",
		"cluster",
		"codeLine",
		"entry",
		"inlineSubentry",
		"lineEntry",
		"marker",
		"noise",
		"pairedForms"
	];
	return Object.fromEntries(surfaceKinds.map((kind) => [kind, 0]));
};
|
|
2468
|
+
/**
 * Strips a leading run of opening brackets (`[`, `{`, `(`) and a trailing run
 * of closing brackets (`]`, `)`, `}`) from a lemma, then trims whitespace.
 */
const extractWrappedLemma = (lemma) => {
	const unwrapped = lemma.replace(/^[[{(]+|[\])}]+$/gu, "");
	return unwrapped.trim();
};
|
|
2469
|
+
/**
 * Removes a leading run of opening brackets (`[`, `{`, `(`) together with any
 * whitespace that follows it, then trims the remainder.
 */
const stripLeadingWrappers = (text) => {
	const withoutOpeners = text.replace(/^[[{(]+\s*/u, "");
	return withoutOpeners.trim();
};
|
|
2470
|
+
/**
 * Checks whether `text` begins with `prefix` as a whole token: either the two
 * are equal, or the character right after the prefix is whitespace, a colon,
 * Arabic comma/semicolon, a bracket, or a dash.
 */
const isDelimitedPrefixMatch$1 = (text, prefix) => {
	if (text === prefix) return true;
	if (text.startsWith(prefix)) {
		const following = text[prefix.length];
		return following === undefined ? true : /[\s:،؛()[\]{}\-–—]/u.test(following);
	}
	return false;
};
|
|
2476
|
+
const isCodeHeading = (text) => {
|
|
2477
|
+
if (CODE_LINE_RE.test(text)) return true;
|
|
2478
|
+
const words = text.trim().split(/\s+/u).filter(Boolean);
|
|
2479
|
+
return words.length === 1 && (words[0]?.length ?? 0) === 1;
|
|
2480
|
+
};
|
|
2481
|
+
const looksLikeNoiseHeading = (text, normalizedText) => {
|
|
2482
|
+
const wordCount = text.trim().split(/\s+/u).filter(Boolean).length;
|
|
2483
|
+
if (/(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\s])/u.test(text)) return false;
|
|
2484
|
+
if (wordCount >= 8 && COLON_NOISE_RE.test(text)) return true;
|
|
2485
|
+
return NORMALIZED_NOISE_TOKENS.some((token) => normalizedText.includes(token)) && wordCount >= 4;
|
|
2486
|
+
};
|
|
2487
|
+
/**
|
|
2488
|
+
* Classifies a markdown heading line produced by `convertContentToMarkdown()`.
|
|
2489
|
+
*/
|
|
2490
|
+
const classifyDictionaryHeading = (line) => {
|
|
2491
|
+
const text = line.startsWith(HEADING_PREFIX) ? line.slice(3).trim() : line.trim();
|
|
2492
|
+
const unwrapped = stripLeadingWrappers(text);
|
|
2493
|
+
const normalizedText = normalizeArabicForComparison(text);
|
|
2494
|
+
const normalizedUnwrapped = normalizeArabicForComparison(unwrapped);
|
|
2495
|
+
if (!text) return "noise";
|
|
2496
|
+
if (CHAPTER_HEADING_RE.test(text) || NORMALIZED_CHAPTER_TERMS.some((term) => isDelimitedPrefixMatch$1(normalizedUnwrapped, term))) return "chapter";
|
|
2497
|
+
if (looksLikeNoiseHeading(text, normalizedText)) return "noise";
|
|
2498
|
+
if (isCodeHeading(text)) return "marker";
|
|
2499
|
+
if (NORMALIZED_MARKER_PREFIXES.some((token) => normalizedUnwrapped.startsWith(token))) return "marker";
|
|
2500
|
+
if (STATUS_HEADING_RE.test(text) || CODE_NOTE_HEADING_RE.test(text)) return "marker";
|
|
2501
|
+
if (CLUSTER_HEADING_RE.test(text)) return "cluster";
|
|
2502
|
+
return "entry";
|
|
2503
|
+
};
|
|
2504
|
+
/**
 * Builds the match record for a markdown heading line. Only `entry` headings
 * carry a lemma, taken from the line after its three-character heading prefix;
 * every other kind gets `lemma: undefined`.
 */
const createHeadingMatch = (kind, page, rawLine, lineNumber) => {
	const lemma = kind === "entry" ? rawLine.slice(3).trim() : undefined;
	return {
		kind,
		lemma,
		line: lineNumber,
		pageId: page.id,
		text: rawLine
	};
};
|
|
2511
|
+
/**
 * Builds the match record for a non-heading dictionary surface (line entry,
 * paired forms, code line, inline subentry) found on `page` at `lineNumber`.
 */
const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => {
	const pageId = page.id;
	return {
		kind,
		lemma,
		line: lineNumber,
		pageId,
		text
	};
};
|
|
2518
|
+
const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
|
|
2519
|
+
if (!rawLine.startsWith(HEADING_PREFIX)) return false;
|
|
2520
|
+
const kind = classifyDictionaryHeading(rawLine);
|
|
2521
|
+
matches.push(createHeadingMatch(kind, page, rawLine, lineNumber));
|
|
2522
|
+
return true;
|
|
2523
|
+
};
|
|
2524
|
+
const scanLineEntry = (page, rawLine, lineNumber, matches) => {
|
|
2525
|
+
const lineEntry = rawLine.match(PLAIN_ENTRY_RE);
|
|
2526
|
+
if (!lineEntry?.groups?.lemma) return;
|
|
2527
|
+
matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lineEntry.groups.lemma)));
|
|
2528
|
+
};
|
|
2529
|
+
const scanPairedForms = (page, rawLine, lineNumber, matches) => {
|
|
2530
|
+
const pairedForms = rawLine.match(PAIRED_FORMS_RE);
|
|
2531
|
+
if (!pairedForms?.groups?.forms) return;
|
|
2532
|
+
matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, pairedForms.groups.forms));
|
|
2066
2533
|
};
|
|
2067
|
-
const
|
|
2068
|
-
|
|
2069
|
-
if (
|
|
2070
|
-
|
|
2071
|
-
for (let index = 0; index < pages.length; index++) {
|
|
2072
|
-
const page = pages[index];
|
|
2073
|
-
const boundary = pageMap.boundaries[index];
|
|
2074
|
-
if (!page || !boundary) throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
|
|
2075
|
-
const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
|
|
2076
|
-
contexts.push({
|
|
2077
|
-
boundary,
|
|
2078
|
-
content,
|
|
2079
|
-
index,
|
|
2080
|
-
lines: buildPageLines(content),
|
|
2081
|
-
page
|
|
2082
|
-
});
|
|
2083
|
-
}
|
|
2084
|
-
return contexts;
|
|
2534
|
+
const scanCodeLine = (page, rawLine, lineNumber, matches) => {
|
|
2535
|
+
const codeLine = rawLine.match(CODE_LINE_RE);
|
|
2536
|
+
if (!codeLine?.groups?.codes) return;
|
|
2537
|
+
matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codeLine.groups.codes));
|
|
2085
2538
|
};
|
|
2086
|
-
const
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
const lines = [];
|
|
2091
|
-
let offset = 0;
|
|
2092
|
-
for (let index = 0; index < parts.length; index++) {
|
|
2093
|
-
const text = parts[index] ?? "";
|
|
2094
|
-
lines.push({
|
|
2095
|
-
lineNumber: index + 1,
|
|
2096
|
-
start: offset,
|
|
2097
|
-
text
|
|
2098
|
-
});
|
|
2099
|
-
offset += text.length + 1;
|
|
2539
|
+
const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
|
|
2540
|
+
for (const match of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
|
|
2541
|
+
if (!match.groups?.lemma) continue;
|
|
2542
|
+
matches.push(createSurfaceMatch("inlineSubentry", page, match.groups.lemma, lineNumber, match.groups.lemma));
|
|
2100
2543
|
}
|
|
2101
|
-
return lines;
|
|
2102
2544
|
};
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2545
|
+
/**
|
|
2546
|
+
* Extracts dictionary surface matches from a markdown page.
|
|
2547
|
+
*/
|
|
2548
|
+
const scanDictionaryMarkdownPage = (page) => {
|
|
2549
|
+
const lines = page.content.split(/\n/u);
|
|
2550
|
+
const matches = [];
|
|
2551
|
+
for (let index = 0; index < lines.length; index++) {
|
|
2552
|
+
const rawLine = lines[index]?.trim() ?? "";
|
|
2553
|
+
if (!rawLine) continue;
|
|
2554
|
+
if (scanHeadingLine(page, rawLine, index + 1, matches)) continue;
|
|
2555
|
+
scanLineEntry(page, rawLine, index + 1, matches);
|
|
2556
|
+
scanPairedForms(page, rawLine, index + 1, matches);
|
|
2557
|
+
scanCodeLine(page, rawLine, index + 1, matches);
|
|
2558
|
+
scanInlineSubentries(page, rawLine, index + 1, matches);
|
|
2109
2559
|
}
|
|
2110
|
-
return
|
|
2111
|
-
};
|
|
2112
|
-
const pageMatchesAnyGate = (page, gates) => page.lines.some((line) => {
|
|
2113
|
-
const trimmed = line.text.trim();
|
|
2114
|
-
if (!trimmed.startsWith(HEADING_PREFIX)) return false;
|
|
2115
|
-
const headingText = trimmed.replace(/^##\s+/u, "").trim();
|
|
2116
|
-
return gates.some((gate) => headingMatchesGate(headingText, gate));
|
|
2117
|
-
});
|
|
2118
|
-
const pageWithinZoneBounds = (zone, pageId) => {
|
|
2119
|
-
if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
|
|
2120
|
-
if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
|
|
2121
|
-
return true;
|
|
2560
|
+
return matches;
|
|
2122
2561
|
};
|
|
2123
|
-
|
|
2562
|
+
/**
|
|
2563
|
+
* Aggregates dictionary surface counts across markdown pages.
|
|
2564
|
+
*/
|
|
2565
|
+
const analyzeDictionaryMarkdownPages = (pages) => {
|
|
2566
|
+
const counts = emptyCounts();
|
|
2567
|
+
const matches = [];
|
|
2124
2568
|
for (const page of pages) {
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
};
|
|
2130
|
-
const createZoneActivationMap = (profile, pages) => {
|
|
2131
|
-
const activation = /* @__PURE__ */ new Map();
|
|
2132
|
-
for (const zone of profile.zones) {
|
|
2133
|
-
if (!zone.when?.activateAfter?.length) {
|
|
2134
|
-
activation.set(zone.name, null);
|
|
2135
|
-
continue;
|
|
2569
|
+
const pageMatches = scanDictionaryMarkdownPage(page);
|
|
2570
|
+
for (const match of pageMatches) {
|
|
2571
|
+
counts[match.kind] += 1;
|
|
2572
|
+
matches.push(match);
|
|
2136
2573
|
}
|
|
2137
|
-
activation.set(zone.name, findActivationPageId(zone, pages));
|
|
2138
2574
|
}
|
|
2139
|
-
return
|
|
2575
|
+
return {
|
|
2576
|
+
counts,
|
|
2577
|
+
matches
|
|
2578
|
+
};
|
|
2140
2579
|
};
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2580
|
+
//#endregion
|
|
2581
|
+
//#region src/dictionary/dictionary-candidates.ts
|
|
2582
|
+
const lineEntryRegexCache = /* @__PURE__ */ new WeakMap();
|
|
2583
|
+
const inlineSubentryRegexCache = /* @__PURE__ */ new WeakMap();
|
|
2584
|
+
const pairedFormsRegexCache = /* @__PURE__ */ new WeakMap();
|
|
2585
|
+
const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
|
|
2586
|
+
const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN$1}$`, "u");
|
|
2587
|
+
const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
|
|
2588
|
+
const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
|
|
2589
|
+
const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
|
|
2590
|
+
const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
|
|
2591
|
+
const createLineEntryRegex = (family) => {
|
|
2592
|
+
const cached = lineEntryRegexCache.get(family);
|
|
2593
|
+
if (cached) return cached;
|
|
2594
|
+
const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
|
|
2595
|
+
const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
2596
|
+
const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
|
|
2597
|
+
lineEntryRegexCache.set(family, regex);
|
|
2598
|
+
return regex;
|
|
2147
2599
|
};
|
|
2148
|
-
const
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
return
|
|
2600
|
+
/**
 * Parses text that is wrapped in one opening `(`/`[` and one closing `)`/`]`.
 * Returns the opener, the closer, the trimmed inner text, and a `paired` flag
 * that is true only when the closer is the counterpart of the opener.
 * Returns null when the text is not wrapped this way or has no inner content.
 */
const parseWrappedCode = (text) => {
	const groups = text.match(/^(?<open>[[(])(?<inner>.+)(?<close>[)\]])$/u)?.groups;
	if (!groups) return null;
	const { open, inner, close } = groups;
	if (!inner || !open || !close) return null;
	// The regex only admits "(" or "[" as opener, so two cases suffice.
	const expectedClose = open === "(" ? ")" : "]";
	return {
		close,
		inner: inner.trim(),
		open,
		paired: close === expectedClose
	};
};
|
|
2610
|
+
const collectHeadingCandidates = (pageStartOffset, line, nextLine, family, trimmed) => {
|
|
2611
|
+
if (!trimmed.startsWith("## ")) return [];
|
|
2612
|
+
const headingClass = classifyDictionaryHeading(trimmed);
|
|
2613
|
+
if (headingClass === "noise") return [];
|
|
2614
|
+
const candidate = createHeadingCandidate(pageStartOffset, line, nextLine, family, headingClass);
|
|
2615
|
+
return candidate ? [candidate] : [];
|
|
2152
2616
|
};
|
|
2153
2617
|
const createHeadingCandidate = (pageStartOffset, line, nextLine, family, headingClass) => {
|
|
2154
2618
|
if (!family.classes.includes(headingClass)) return null;
|
|
@@ -2168,19 +2632,6 @@ const createHeadingCandidate = (pageStartOffset, line, nextLine, family, heading
|
|
|
2168
2632
|
text: line.text.trim()
|
|
2169
2633
|
};
|
|
2170
2634
|
};
|
|
2171
|
-
const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
|
|
2172
|
-
const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
|
|
2173
|
-
const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
|
|
2174
|
-
const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
|
|
2175
|
-
const createLineEntryRegex = (family) => {
|
|
2176
|
-
const cached = lineEntryRegexCache.get(family);
|
|
2177
|
-
if (cached) return cached;
|
|
2178
|
-
const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
|
|
2179
|
-
const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
2180
|
-
const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
|
|
2181
|
-
lineEntryRegexCache.set(family, regex);
|
|
2182
|
-
return regex;
|
|
2183
|
-
};
|
|
2184
2635
|
const collectLineEntryCandidates = (pageStartOffset, line, family) => {
|
|
2185
2636
|
const trimmed = line.text.trim();
|
|
2186
2637
|
if (STATUS_LINE_RE.test(trimmed)) return [];
|
|
@@ -2198,17 +2649,22 @@ const collectLineEntryCandidates = (pageStartOffset, line, family) => {
|
|
|
2198
2649
|
}];
|
|
2199
2650
|
};
|
|
2200
2651
|
const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2652
|
+
let cached = inlineSubentryRegexCache.get(family);
|
|
2653
|
+
if (!cached) {
|
|
2654
|
+
const prefixes = family.prefixes.length > 0 ? family.prefixes.map(escapeRegex).join("|") : escapeRegex("و");
|
|
2655
|
+
cached = {
|
|
2656
|
+
matchRegex: new RegExp(`(^|[\\s،؛,:.])(?<lemma>(?:${prefixes})${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})\\s*:`, "gu"),
|
|
2657
|
+
stripPrefixRegex: new RegExp(`^(?:${prefixes})`, "u")
|
|
2658
|
+
};
|
|
2659
|
+
inlineSubentryRegexCache.set(family, cached);
|
|
2660
|
+
}
|
|
2205
2661
|
const candidates = [];
|
|
2206
|
-
for (const match of line.text.matchAll(
|
|
2662
|
+
for (const match of line.text.matchAll(cached.matchRegex)) {
|
|
2207
2663
|
if (!match.groups?.lemma || match.index === void 0) continue;
|
|
2208
2664
|
const lemmaIndex = match[0].indexOf(match.groups.lemma);
|
|
2209
2665
|
if (lemmaIndex < 0) continue;
|
|
2210
2666
|
const candidateStart = match.index + lemmaIndex;
|
|
2211
|
-
const lemma = family.stripPrefixesFromLemma ? match.groups.lemma.replace(
|
|
2667
|
+
const lemma = family.stripPrefixesFromLemma ? match.groups.lemma.replace(cached.stripPrefixRegex, "") : match.groups.lemma;
|
|
2212
2668
|
candidates.push({
|
|
2213
2669
|
absoluteIndex: pageStartOffset + line.start + candidateStart,
|
|
2214
2670
|
family: "inlineSubentry",
|
|
@@ -2222,18 +2678,6 @@ const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
|
|
|
2222
2678
|
}
|
|
2223
2679
|
return candidates;
|
|
2224
2680
|
};
|
|
2225
|
-
/** Matches a string that consists of a code-line pattern and nothing else. */
const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN}$`, "u");
|
|
2226
|
-
/**
 * Matches an optional trailing status suffix: optional whitespace, an optional
 * colon, optional whitespace, the status tail and everything after it up to
 * the end of the string. `collectCodeLineCandidates` strips it with `replace`.
 */
const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
|
|
2227
|
-
/**
 * Splits a string that is fully enclosed by an opening `(`/`[` and a closing
 * `)`/`]` into its parts.
 *
 * @param text Candidate text; it must start with the opener and end with the closer.
 * @returns `null` when the text is not wrapped or has no inner content,
 *   otherwise the opener, closer, trimmed inner text, and `paired`, which is
 *   true only when opener and closer are the same kind of bracket.
 */
const parseWrappedCode = (text) => {
	const groups = /^(?<open>[[(])(?<inner>.+)(?<close>[\])])$/u.exec(text)?.groups;
	if (!groups || !groups.inner || !groups.open || !groups.close) return null;
	const { open, inner, close } = groups;
	const roundPair = open === "(" && close === ")";
	const squarePair = open === "[" && close === "]";
	return {
		close,
		inner: inner.trim(),
		open,
		paired: roundPair || squarePair
	};
};
|
|
2237
2681
|
const collectCodeLineCandidates = (pageStartOffset, line, family) => {
|
|
2238
2682
|
const trimmed = line.text.trim();
|
|
2239
2683
|
const bare = trimmed.replace(STATUS_SUFFIX_RE, "").trim();
|
|
@@ -2271,255 +2715,470 @@ const collectPairedFormsCandidates = (pageStartOffset, line, family) => {
|
|
|
2271
2715
|
text: line.text.trim()
|
|
2272
2716
|
}];
|
|
2273
2717
|
};
|
|
2274
|
-
const
|
|
2275
|
-
|
|
2276
|
-
const normalized = normalizeIntroContextText(text);
|
|
2277
|
-
return INTRO_PHRASES.some((phrase) => normalized.startsWith(normalizeArabicForComparison(phrase)));
|
|
2718
|
+
/**
 * Exhaustiveness guard for candidate-family switches.
 *
 * @param value The family value that no branch handled.
 * @throws {Error} Always; the message embeds the JSON form of `value`.
 */
const assertNever$1 = (value) => {
	const serialized = JSON.stringify(value);
	throw new Error(`Unhandled dictionary candidate family: ${serialized}`);
};
|
|
2279
|
-
const
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2721
|
+
/**
 * Routes one line to the collector that matches `family.use`.
 *
 * Only the heading collector receives `nextLine` and the pre-trimmed text;
 * the other collectors take the page offset, the line and the family.
 *
 * @returns Whatever the selected collector returns.
 * @throws {Error} Via `assertNever$1` when `family.use` is not a known family.
 */
const collectCandidatesForFamily = (pageStartOffset, line, nextLine, family, trimmed) => {
	const kind = family.use;
	if (kind === "heading") return collectHeadingCandidates(pageStartOffset, line, nextLine, family, trimmed);
	if (kind === "lineEntry") return collectLineEntryCandidates(pageStartOffset, line, family);
	if (kind === "inlineSubentry") return collectInlineSubentryCandidates(pageStartOffset, line, family);
	if (kind === "codeLine") return collectCodeLineCandidates(pageStartOffset, line, family);
	if (kind === "pairedForms") return collectPairedFormsCandidates(pageStartOffset, line, family);
	return assertNever$1(family);
};
|
|
2731
|
+
/**
 * Cheap pre-filter run before the family collectors.
 *
 * - heading: the line must start with `"## "`.
 * - lineEntry / inlineSubentry / pairedForms: the line must contain a colon.
 * - codeLine: the line must start with an Arabic-script character, optionally
 *   preceded by `[` or `(`.
 *
 * @param family Family descriptor; only `use` is inspected.
 * @param trimmed The already-trimmed line text.
 * @throws {Error} Via `assertNever$1` when `family.use` is not a known family.
 */
const familyMayMatchLine = (family, trimmed) => {
	const kind = family.use;
	if (kind === "heading") return trimmed.startsWith("## ");
	if (kind === "lineEntry" || kind === "inlineSubentry" || kind === "pairedForms") return trimmed.includes(":");
	if (kind === "codeLine") return /^(?:[[(])?\p{Script=Arabic}/u.test(trimmed);
	return assertNever$1(family);
};
|
|
2741
|
+
/**
 * Gathers the candidates produced by every family of `zone` for one line.
 *
 * Lines that are empty after trimming yield no candidates. Families are tried
 * in declaration order and skipped when `familyMayMatchLine` rules them out.
 */
const collectCandidatesForLine = (pageStartOffset, line, nextLine, zone) => {
	const trimmed = line.text.trim();
	if (trimmed.length === 0) return [];
	const collected = [];
	for (const family of zone.families) {
		if (familyMayMatchLine(family, trimmed)) {
			const produced = collectCandidatesForFamily(pageStartOffset, line, nextLine, family, trimmed);
			for (const candidate of produced) collected.push(candidate);
		}
	}
	return collected;
};
|
|
2754
|
+
//#endregion
|
|
2755
|
+
//#region src/dictionary/dictionary-zones.ts
|
|
2756
|
+
/**
 * Prefix test performed after passing both strings through
 * `normalizeArabicForComparison`.
 */
const normalizedStartsWith = (text, prefix) => {
	const haystack = normalizeArabicForComparison(text);
	const needle = normalizeArabicForComparison(prefix);
	return haystack.startsWith(needle);
};
|
|
2757
|
+
/**
 * Reports whether `text` is exactly `prefix`, or starts with `prefix` and the
 * character right after it satisfies `GATE_DELIMITER_RE`.
 */
const isDelimitedPrefixMatch = (text, prefix) => {
	if (text === prefix) return true;
	if (!text.startsWith(prefix)) return false;
	const following = text[prefix.length];
	if (following === void 0) return true;
	return GATE_DELIMITER_RE.test(following);
};
|
|
2763
|
+
/**
 * Picks the comparison string for a `headingText` gate.
 *
 * A precomputed `normalizedMatch` / `trimmedMatch` property on the gate is used
 * when present; otherwise the value is derived from `gate.match` on the fly.
 *
 * @param gate Heading-text gate.
 * @param useFuzzy Selects the normalized form when truthy, the trimmed form otherwise.
 */
const getHeadingTextGateMatch = (gate, useFuzzy) => {
	if (!useFuzzy) {
		if ("trimmedMatch" in gate) return gate.trimmedMatch;
		return gate.match.trim();
	}
	if ("normalizedMatch" in gate) return gate.normalizedMatch;
	return normalizeArabicForComparison(gate.match);
};
|
|
2767
|
+
/**
 * Splits page content on `"\n"` into line records.
 *
 * @param content Page text.
 * @returns One record per line with a 1-based `lineNumber`, the `start` offset
 *   of the line within `content`, and the line `text` without its newline.
 */
const buildPageLines = (content) => {
	let cursor = 0;
	return content.split("\n").map((text, index) => {
		const record = {
			lineNumber: index + 1,
			start: cursor,
			text
		};
		// Skip the line and the "\n" that ended it.
		cursor += text.length + 1;
		return record;
	});
};
|
|
2782
|
+
/**
 * Tests a heading's text against one activation gate.
 *
 * Gates other than `headingText` compare the normalized heading against the
 * `GATE_TOKEN_MAP` entry for `gate.token`. `headingText` gates require a
 * delimited prefix match, using normalized text when `gate.fuzzy` is set and
 * trimmed text otherwise; an empty match string never matches.
 */
const headingMatchesGate = (headingText, gate) => {
	if (gate.use !== "headingText") return normalizedStartsWith(headingText, GATE_TOKEN_MAP[gate.token]);
	const useFuzzy = gate.fuzzy ?? false;
	const source = useFuzzy ? normalizeArabicForComparison(headingText) : headingText.trim();
	const match = getHeadingTextGateMatch(gate, useFuzzy);
	if (!match) return false;
	return isDelimitedPrefixMatch(source, match);
};
|
|
2791
|
+
/**
 * Bundles a page with its boundary, normalized content and index.
 *
 * `lines` is an enumerable, configurable accessor: the content is split by
 * `buildPageLines` on first read and the result is reused afterwards.
 */
const createPageContext = (page, boundary, content, index) => {
	let memoizedLines;
	return {
		boundary,
		content,
		index,
		page,
		get lines() {
			if (memoizedLines === void 0 || memoizedLines === null) memoizedLines = buildPageLines(content);
			return memoizedLines;
		}
	};
};
|
|
2809
|
+
/**
 * Reports whether any `"## "` heading line on the page satisfies at least one
 * of the given gates. The heading text is what follows the marker, trimmed.
 */
const pageMatchesAnyGate = (page, gates) => {
	const headingMarker = "## ";
	return page.lines.some(({ text }) => {
		const candidate = text.trim();
		if (!candidate.startsWith(headingMarker)) return false;
		const headingText = candidate.slice(headingMarker.length).trim();
		return gates.some((gate) => headingMatchesGate(headingText, gate));
	});
};
|
|
2815
|
+
/**
 * Checks a page id against the zone's optional `when.minPageId` /
 * `when.maxPageId` limits (both inclusive). A missing limit never excludes.
 */
const pageWithinZoneBounds = (zone, pageId) => {
	const lower = zone.when?.minPageId;
	const upper = zone.when?.maxPageId;
	const belowRange = lower !== void 0 && pageId < lower;
	const aboveRange = upper !== void 0 && pageId > upper;
	return !(belowRange || aboveRange);
};
|
|
2820
|
+
/**
 * Finds the first page, in iteration order, that lies inside the zone's page
 * bounds and matches one of its `activateAfter` gates.
 *
 * @returns That page's id, or `null` when no page qualifies.
 */
const findActivationPageId = (zone, pages) => {
	for (const candidate of pages) {
		const id = candidate.page.id;
		if (pageWithinZoneBounds(zone, id) && pageMatchesAnyGate(candidate, zone.when?.activateAfter ?? [])) return id;
	}
	return null;
};
|
|
2827
|
+
/**
 * Computes the activation page id for each zone, keyed by zone name.
 *
 * Zones without `activateAfter` gates map to `null`; gated zones map to the
 * result of `findActivationPageId`, which is also `null` when nothing matched.
 */
const createZoneActivationMap = (profile, pages) => {
	const activation = /* @__PURE__ */ new Map();
	for (const zone of profile.zones) {
		const hasGates = Boolean(zone.when?.activateAfter?.length);
		activation.set(zone.name, hasGates ? findActivationPageId(zone, pages) : null);
	}
	return activation;
};
|
|
2838
|
+
const pageMatchesZone = (zone, activationMap, pageId) => {
|
|
2839
|
+
if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
|
|
2840
|
+
if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
|
|
2841
|
+
if (!zone.when?.activateAfter?.length) return true;
|
|
2842
|
+
const activatedAt = activationMap.get(zone.name);
|
|
2843
|
+
return activatedAt !== null && activatedAt !== void 0 && pageId >= activatedAt;
|
|
2284
2844
|
};
|
|
2285
|
-
const
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
if (!normalized) return false;
|
|
2290
|
-
if (INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
|
|
2291
|
-
if (INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
|
|
2292
|
-
return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
|
|
2845
|
+
/**
 * Selects the zone that governs a page: the last zone in profile order that
 * matches, so later zones override earlier ones.
 *
 * @returns The winning zone, or `null` when no zone matches.
 */
const resolveActiveZone = (profile, activationMap, pageId) => {
	let winner = null;
	for (const zone of profile.zones) {
		if (!pageMatchesZone(zone, activationMap, pageId)) continue;
		winner = zone;
	}
	return winner;
};
|
|
2294
|
-
const
|
|
2295
|
-
|
|
2296
|
-
if (
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
const
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
"الجوهري",
|
|
2305
|
-
"الفراء",
|
|
2306
|
-
"ثعلب",
|
|
2307
|
-
"شمر"
|
|
2308
|
-
].some((term) => normalized.startsWith(normalizeArabicForComparison(term)));
|
|
2850
|
+
/**
 * Creates one page context per page, pairing each page with the boundary at
 * the same index.
 *
 * @param pages Source pages.
 * @param pageMap Page map whose `boundaries` line up with `pages`.
 * @param normalizedPages Optional pre-normalized contents; when an entry is
 *   absent the page content is passed through `normalizeLineEndings`.
 * @throws {Error} When the normalized pages or boundaries differ in count from
 *   `pages`, or when a page or boundary slot is empty.
 */
const createPageContexts = (pages, pageMap, normalizedPages) => {
	const expected = pages.length;
	if (normalizedPages && normalizedPages.length !== expected) {
		throw new Error(`Dictionary runtime expected ${pages.length} normalized pages, received ${normalizedPages.length}`);
	}
	if (pageMap.boundaries.length !== expected) {
		throw new Error(`Dictionary runtime expected ${pages.length} page boundaries, received ${pageMap.boundaries.length}`);
	}
	const contexts = [];
	for (let index = 0; index < pages.length; index++) {
		const page = pages[index];
		const boundary = pageMap.boundaries[index];
		if (!page || !boundary) {
			throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
		}
		const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
		contexts.push(createPageContext(page, boundary, content, index));
	}
	return contexts;
};
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2863
|
+
//#endregion
|
|
2864
|
+
//#region src/dictionary/profile.ts
|
|
2865
|
+
// Normalized profiles keyed by the original profile object, so a profile is
// validated and normalized at most once while it stays reachable.
const normalizedProfileCache = /* @__PURE__ */ new WeakMap();
// `scope` values accepted on a previousWord blocker.
const PREVIOUS_WORD_SCOPES = ["samePage", "pageStart", "any"];
// Precision values accepted on authorityIntro / pageContinuation blockers.
const BLOCKER_PRECISIONS = ["high", "aggressive"];
|
|
2872
|
+
const uniqueNormalizedSet = (values, normalize) => new Set(values.map(normalize).filter(Boolean));
|
|
2873
|
+
/**
 * Exhaustiveness guard for the profile variant switches in this region.
 *
 * @param value The variant that no branch handled.
 * @throws {Error} Always; the message embeds the JSON form of `value`.
 */
const assertNever = (value) => {
	const serialized = JSON.stringify(value);
	throw new Error(`Unhandled dictionary profile variant: ${serialized}`);
};
|
|
2317
|
-
const
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2876
|
+
/**
 * Returns a copy of a family with its optional settings filled in.
 *
 * Defaults applied per `use`:
 * - heading: `allowNextLineColon` false, `allowSingleLetter` false
 * - lineEntry: `allowMultiWord` false, `allowWhitespaceBeforeColon` false, `wrappers` "none"
 * - inlineSubentry: `prefixes` ["و"], `stripPrefixesFromLemma` true
 * - codeLine: `wrappers` "either"
 * - pairedForms: `requireStatusTail` false, `separator` "comma"
 *
 * @throws {Error} Via `assertNever` when `family.use` is not a known family.
 */
const normalizeFamily = (family) => {
	const kind = family.use;
	if (kind === "heading") {
		return {
			...family,
			allowNextLineColon: family.allowNextLineColon ?? false,
			allowSingleLetter: family.allowSingleLetter ?? false
		};
	}
	if (kind === "lineEntry") {
		return {
			...family,
			allowMultiWord: family.allowMultiWord ?? false,
			allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false,
			wrappers: family.wrappers ?? "none"
		};
	}
	if (kind === "inlineSubentry") {
		return {
			...family,
			prefixes: family.prefixes ?? ["و"],
			stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true
		};
	}
	if (kind === "codeLine") {
		return {
			...family,
			wrappers: family.wrappers ?? "either"
		};
	}
	if (kind === "pairedForms") {
		return {
			...family,
			requireStatusTail: family.requireStatusTail ?? false,
			separator: family.separator ?? "comma"
		};
	}
	return assertNever(family);
};
|
|
2331
|
-
const
|
|
2332
|
-
|
|
2333
|
-
|
|
2906
|
+
/**
 * Returns a runtime-ready copy of a blocker.
 *
 * - authorityIntro: `precision` defaults to "high".
 * - stopLemma: adds `normalizedWords`, built with `normalizeStopLemmaWord`.
 * - previousWord: adds `normalizedWords`, built with
 *   `normalizeArabicForComparison`; `scope` defaults to "samePage".
 * - previousChar: adds `charSet`, a Set built from `chars`.
 * - intro: returned unchanged (same object).
 * - pageContinuation: `authorityPrecision` defaults to "high".
 *
 * @throws {Error} Via `assertNever` when `blocker.use` is not a known blocker.
 */
const normalizeBlocker = (blocker) => {
	const kind = blocker.use;
	if (kind === "intro") return blocker;
	if (kind === "authorityIntro") {
		return {
			...blocker,
			precision: blocker.precision ?? "high"
		};
	}
	if (kind === "stopLemma") {
		return {
			...blocker,
			normalizedWords: uniqueNormalizedSet(blocker.words, normalizeStopLemmaWord)
		};
	}
	if (kind === "previousWord") {
		return {
			...blocker,
			normalizedWords: uniqueNormalizedSet(blocker.words, normalizeArabicForComparison),
			scope: blocker.scope ?? "samePage"
		};
	}
	if (kind === "previousChar") {
		return {
			...blocker,
			charSet: new Set(blocker.chars)
		};
	}
	if (kind === "pageContinuation") {
		return {
			...blocker,
			authorityPrecision: blocker.authorityPrecision ?? "high"
		};
	}
	return assertNever(blocker);
};
|
|
2335
|
-
const
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2933
|
+
/**
 * Precomputes the comparison strings of an activation gate.
 *
 * `headingToken` gates are returned unchanged (same object). Any other gate is
 * copied with `trimmedMatch` (the trimmed `match`) and `normalizedMatch` (the
 * trimmed match passed through `normalizeArabicForComparison`).
 */
const normalizeGate = (gate) => {
	if (gate.use !== "headingToken") {
		const trimmedMatch = gate.match.trim();
		return {
			...gate,
			normalizedMatch: normalizeArabicForComparison(trimmedMatch),
			trimmedMatch
		};
	}
	return gate;
};
|
|
2942
|
+
const normalizeZone = (zone) => ({
|
|
2943
|
+
blockers: (zone.blockers ?? []).map(normalizeBlocker),
|
|
2944
|
+
families: zone.families.map(normalizeFamily),
|
|
2945
|
+
name: zone.name,
|
|
2946
|
+
when: zone.when ? {
|
|
2947
|
+
activateAfter: zone.when.activateAfter?.map(normalizeGate),
|
|
2948
|
+
maxPageId: zone.when.maxPageId,
|
|
2949
|
+
minPageId: zone.when.minPageId
|
|
2950
|
+
} : void 0
|
|
2349
2951
|
});
|
|
2350
|
-
const
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
}
|
|
2355
|
-
heading: {
|
|
2356
|
-
accepted: 0,
|
|
2357
|
-
rejected: 0
|
|
2358
|
-
},
|
|
2359
|
-
inlineSubentry: {
|
|
2360
|
-
accepted: 0,
|
|
2361
|
-
rejected: 0
|
|
2362
|
-
},
|
|
2363
|
-
lineEntry: {
|
|
2364
|
-
accepted: 0,
|
|
2365
|
-
rejected: 0
|
|
2366
|
-
},
|
|
2367
|
-
pairedForms: {
|
|
2368
|
-
accepted: 0,
|
|
2369
|
-
rejected: 0
|
|
2370
|
-
}
|
|
2952
|
+
/**
 * Builds a validation issue record.
 *
 * @param code Machine-readable issue code.
 * @param path Location of the offending value inside the profile.
 * @param message Human-readable description.
 * @param zoneName Zone the issue belongs to; the `zoneName` key is omitted
 *   from the result when this is falsy.
 */
const createIssue = (code, path, message, zoneName) => {
	const base = {
		code,
		message,
		path
	};
	return zoneName ? { ...base, zoneName } : base;
};
|
|
2372
|
-
const
|
|
2373
|
-
|
|
2374
|
-
|
|
2958
|
+
/**
 * True when the list is empty or contains a string that is empty or
 * whitespace-only.
 */
const hasBlankString = (values) => {
	if (values.length === 0) return true;
	return values.some((value) => value.trim() === "");
};
|
|
2959
|
+
/**
 * Appends a blocker-related issue, built by `createIssue`, to `issues`.
 */
const pushBlockerIssue = (issues, code, path, message, zoneName) => {
	const issue = createIssue(code, path, message, zoneName);
	issues.push(issue);
};
|
|
2376
|
-
const
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
if (blocker.use !== "previousWord") return false;
|
|
2380
|
-
const lastWord = extractLastArabicWord$1(pageContent, localIndex);
|
|
2381
|
-
return !!lastWord && blocker.normalizedWords.has(normalizeArabicForComparison(lastWord));
|
|
2962
|
+
const validateAuthorityPrecision = (issues, blockerPath, zoneName, code, fieldName, value, blockerUse) => {
|
|
2963
|
+
if (value === void 0 || BLOCKER_PRECISIONS.includes(value)) return;
|
|
2964
|
+
pushBlockerIssue(issues, code, `${blockerPath}.${fieldName}`, `${blockerUse} blocker in zone "${zoneName}" must use ${fieldName} "high" or "aggressive"`, zoneName);
|
|
2382
2965
|
};
|
|
2383
|
-
const
|
|
2384
|
-
if (blocker.
|
|
2385
|
-
|
|
2386
|
-
return !!previousChar && blocker.charSet.has(previousChar);
|
|
2966
|
+
/**
 * Validates a previousWord blocker: `words` must be non-empty with no blank
 * entries, and `scope`, when given, must be one of `PREVIOUS_WORD_SCOPES`.
 */
const validatePreviousWordBlocker = (blocker, blockerPath, zoneName, issues) => {
	if (hasBlankString(blocker.words)) {
		pushBlockerIssue(issues, "invalid_previous_words", `${blockerPath}.words`, `previousWord blocker in zone "${zoneName}" must include non-empty words`, zoneName);
	}
	const scope = blocker.scope;
	if (scope !== void 0 && !PREVIOUS_WORD_SCOPES.includes(scope)) {
		pushBlockerIssue(issues, "invalid_previous_word_scope", `${blockerPath}.scope`, `previousWord blocker in zone "${zoneName}" must use scope "samePage", "pageStart", or "any"`, zoneName);
	}
};
|
|
2388
|
-
const
|
|
2389
|
-
if (blocker.
|
|
2390
|
-
if (!(localBeforeCandidate.trim().length === 0) || pageIndex === 0) return false;
|
|
2391
|
-
const previousPage = pages[pageIndex - 1];
|
|
2392
|
-
if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
|
|
2393
|
-
const previousWord = extractLastArabicWord$1(previousPage.content);
|
|
2394
|
-
return !!previousWord && CONTINUATION_PREV_WORDS.some((word) => normalizedEquals(word, previousWord)) || endsWithIntroContext(previousPage.content) || isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, "high");
|
|
2970
|
+
const validatePreviousCharBlocker = (blocker, blockerPath, zoneName, issues) => {
|
|
2971
|
+
if (blocker.chars.length === 0 || blocker.chars.some((char) => !char)) pushBlockerIssue(issues, "invalid_previous_chars", `${blockerPath}.chars`, `previousChar blocker in zone "${zoneName}" must include chars`, zoneName);
|
|
2395
2972
|
};
|
|
2396
|
-
const
|
|
2397
|
-
if (
|
|
2398
|
-
if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
|
|
2399
|
-
if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
|
|
2400
|
-
if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker)) return "previousWord";
|
|
2401
|
-
if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
|
|
2402
|
-
if (rejectsViaPageContinuationBlocker(candidate, blocker, localBeforeCandidate, pageIndex, pages)) return "pageContinuation";
|
|
2403
|
-
return null;
|
|
2973
|
+
/**
 * Validates a stopLemma blocker: `words` must be non-empty with no blank entries.
 */
const validateStopLemmaBlocker = (blocker, blockerPath, zoneName, issues) => {
	if (!hasBlankString(blocker.words)) return;
	pushBlockerIssue(issues, "invalid_stop_words", `${blockerPath}.words`, `stopLemma blocker in zone "${zoneName}" must include non-empty words`, zoneName);
};
|
|
2405
|
-
const
|
|
2406
|
-
const
|
|
2407
|
-
if (
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
if (!blockerApplies(blocker, candidate.family)) continue;
|
|
2411
|
-
const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
|
|
2412
|
-
if (reason) return { reason };
|
|
2976
|
+
const validateGate = (gate, zone, gateIndex, seenActivateAfterKeys, issues) => {
|
|
2977
|
+
const gatePath = `zones[].when.activateAfter[${gateIndex}]`.replace("[]", `[${zone.name}]`);
|
|
2978
|
+
if (gate.use === "headingText") {
|
|
2979
|
+
if (!gate.match.trim()) issues.push(createIssue("invalid_gate_match", `${gatePath}.match`, `dictionary gate match must be non-empty`, zone.name));
|
|
2980
|
+
if (gate.fuzzy !== void 0 && typeof gate.fuzzy !== "boolean") issues.push(createIssue("invalid_gate_fuzzy", `${gatePath}.fuzzy`, `dictionary gate fuzzy must be a boolean when provided`, zone.name));
|
|
2413
2981
|
}
|
|
2414
|
-
|
|
2982
|
+
const dedupeKey = `${gate.use}:${JSON.stringify(gate)}`;
|
|
2983
|
+
if (seenActivateAfterKeys.has(dedupeKey)) issues.push(createIssue("duplicate_activate_after_gate", gatePath, `dictionary zone "${zone.name}" has duplicate activateAfter gates`, zone.name));
|
|
2984
|
+
seenActivateAfterKeys.add(dedupeKey);
|
|
2415
2985
|
};
|
|
2416
|
-
const
|
|
2417
|
-
|
|
2986
|
+
const validateFamily = (family, zone, familyIndex, issues) => {
|
|
2987
|
+
const familyPath = `zones[].families[${familyIndex}]`.replace("[]", `[${zone.name}]`);
|
|
2988
|
+
switch (family.use) {
|
|
2989
|
+
case "heading":
|
|
2990
|
+
if (family.classes.length === 0) issues.push(createIssue("empty_heading_classes", `${familyPath}.classes`, `dictionary heading family in zone "${zone.name}" must include at least one class`, zone.name));
|
|
2991
|
+
if (family.emit === "chapter" && !family.classes.includes("chapter")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "chapter" but never matches chapter headings`, zone.name));
|
|
2992
|
+
if (family.emit === "marker" && !family.classes.includes("marker")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "marker" but never matches marker headings`, zone.name));
|
|
2993
|
+
if (family.emit === "entry" && !family.classes.includes("entry")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "entry" but never matches entry headings`, zone.name));
|
|
2994
|
+
break;
|
|
2995
|
+
case "lineEntry": break;
|
|
2996
|
+
case "inlineSubentry":
|
|
2997
|
+
if (family.prefixes?.some((prefix) => !prefix.trim())) issues.push(createIssue("empty_inline_prefixes", `${familyPath}.prefixes`, `inlineSubentry prefixes must be non-empty strings`, zone.name));
|
|
2998
|
+
break;
|
|
2999
|
+
case "codeLine": break;
|
|
3000
|
+
case "pairedForms": break;
|
|
3001
|
+
default: assertNever(family);
|
|
3002
|
+
}
|
|
2418
3003
|
};
|
|
2419
|
-
const
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
|
|
2424
|
-
|
|
3004
|
+
const validateBlocker = (blocker, zone, blockerIndex, issues) => {
|
|
3005
|
+
const blockerPath = `zones[].blockers[${blockerIndex}]`.replace("[]", `[${zone.name}]`);
|
|
3006
|
+
switch (blocker.use) {
|
|
3007
|
+
case "authorityIntro":
|
|
3008
|
+
validateAuthorityPrecision(issues, blockerPath, zone.name, "invalid_authority_intro_precision", "precision", blocker.precision, "authorityIntro");
|
|
3009
|
+
break;
|
|
3010
|
+
case "stopLemma":
|
|
3011
|
+
validateStopLemmaBlocker(blocker, blockerPath, zone.name, issues);
|
|
3012
|
+
break;
|
|
3013
|
+
case "previousWord":
|
|
3014
|
+
validatePreviousWordBlocker(blocker, blockerPath, zone.name, issues);
|
|
3015
|
+
break;
|
|
3016
|
+
case "previousChar":
|
|
3017
|
+
validatePreviousCharBlocker(blocker, blockerPath, zone.name, issues);
|
|
3018
|
+
break;
|
|
3019
|
+
case "intro": break;
|
|
3020
|
+
case "pageContinuation":
|
|
3021
|
+
validateAuthorityPrecision(issues, blockerPath, zone.name, "invalid_continuation_precision", "authorityPrecision", blocker.authorityPrecision, "pageContinuation");
|
|
3022
|
+
break;
|
|
3023
|
+
default: assertNever(blocker);
|
|
3024
|
+
}
|
|
2425
3025
|
};
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
case "pairedForms": return collectPairedFormsCandidates(pageStartOffset, line, family);
|
|
2433
|
-
default: return assertNever$1(family);
|
|
3026
|
+
/**
 * Error thrown when a dictionary profile fails validation.
 *
 * The message is the single issue's own message when there is exactly one
 * issue, otherwise a summary with the issue count. The full list is exposed
 * on `issues`.
 */
var DictionaryProfileValidationError = class extends Error {
	issues;
	constructor(issues) {
		super(issues.length !== 1 ? `Dictionary profile validation failed with ${issues.length} issues` : issues[0].message);
		this.issues = issues;
		this.name = "DictionaryProfileValidationError";
	}
};
|
|
2436
|
-
const
|
|
2437
|
-
const
|
|
2438
|
-
const
|
|
2439
|
-
if (!
|
|
2440
|
-
|
|
2441
|
-
|
|
3034
|
+
/**
 * Validates one zone and everything nested in it.
 *
 * Zone-level checks: the trimmed name must be non-empty and unique across the
 * profile, at least one family must be declared, and `minPageId` must not
 * exceed `maxPageId`. Gates, families and blockers are then validated in order.
 *
 * @param zone Zone to validate.
 * @param zoneIndex Position of the zone in the profile; used in zone-level paths.
 * @param seenZoneNames Trimmed names of the zones validated so far; updated here.
 * @param issues Issue list to append to.
 */
const validateZone = (zone, zoneIndex, seenZoneNames, issues) => {
	const zonePath = `zones[${zoneIndex}]`;
	const trimmedName = zone.name.trim();
	if (!trimmedName) {
		issues.push(createIssue("empty_zone_name", `${zonePath}.name`, `dictionary zone name must be non-empty`));
	} else if (seenZoneNames.has(trimmedName)) {
		issues.push(createIssue("duplicate_zone_name", `${zonePath}.name`, `dictionary zone names must be unique; duplicated "${trimmedName}"`, trimmedName));
	} else {
		seenZoneNames.add(trimmedName);
	}
	if (zone.families.length === 0) {
		issues.push(createIssue("empty_zone_families", `${zonePath}.families`, `dictionary zone "${zone.name}" must declare at least one family`, zone.name));
	}
	const lower = zone.when?.minPageId;
	const upper = zone.when?.maxPageId;
	if (lower !== void 0 && upper !== void 0 && lower > upper) {
		issues.push(createIssue("invalid_zone_page_range", `${zonePath}.when`, `dictionary zone "${zone.name}" has minPageId greater than maxPageId`, zone.name));
	}
	const seenActivateAfterKeys = /* @__PURE__ */ new Set();
	const gates = zone.when?.activateAfter ?? [];
	for (let gateIndex = 0; gateIndex < gates.length; gateIndex++) {
		validateGate(gates[gateIndex], zone, gateIndex, seenActivateAfterKeys, issues);
	}
	const families = zone.families;
	for (let familyIndex = 0; familyIndex < families.length; familyIndex++) {
		validateFamily(families[familyIndex], zone, familyIndex, issues);
	}
	const blockers = zone.blockers ?? [];
	for (let blockerIndex = 0; blockerIndex < blockers.length; blockerIndex++) {
		validateBlocker(blockers[blockerIndex], zone, blockerIndex, issues);
	}
};
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
3047
|
+
/**
|
|
3048
|
+
* Validates a dictionary profile without normalizing it.
|
|
3049
|
+
*/
|
|
3050
|
+
const validateDictionaryProfile = (profile) => {
|
|
3051
|
+
const issues = [];
|
|
3052
|
+
if (profile.version !== 2) issues.push(createIssue("invalid_version", "version", `dictionary profile version must be 2, got ${profile.version}`));
|
|
3053
|
+
if (profile.zones.length === 0) {
|
|
3054
|
+
issues.push(createIssue("missing_zones", "zones", `dictionary profile must contain at least one zone`));
|
|
3055
|
+
return issues;
|
|
3056
|
+
}
|
|
3057
|
+
const seenZoneNames = /* @__PURE__ */ new Set();
|
|
3058
|
+
for (let zoneIndex = 0; zoneIndex < profile.zones.length; zoneIndex++) validateZone(profile.zones[zoneIndex], zoneIndex, seenZoneNames, issues);
|
|
3059
|
+
return issues;
|
|
3060
|
+
};
|
|
3061
|
+
/**
 * Validates a dictionary profile and converts it to its normalized runtime form.
 *
 * Results are memoized in `normalizedProfileCache` per profile object, so a
 * profile that was already normalized is returned without being re-validated.
 *
 * @throws {DictionaryProfileValidationError} When validation reports any issue.
 */
const normalizeDictionaryProfile = (profile) => {
	const hit = normalizedProfileCache.get(profile);
	if (hit) return hit;
	const problems = validateDictionaryProfile(profile);
	if (problems.length > 0) throw new DictionaryProfileValidationError(problems);
	const result = {
		version: 2,
		zones: profile.zones.map(normalizeZone)
	};
	normalizedProfileCache.set(profile, result);
	return result;
};
|
|
3076
|
+
//#endregion
|
|
3077
|
+
//#region src/dictionary/dictionary-diagnostics.ts
|
|
3078
|
+
/**
 * Creates a fresh zeroed counter for each accepted kind: chapter, entry, marker.
 */
const createInitialKindCounts = () => {
	const counts = {
		chapter: 0,
		entry: 0,
		marker: 0
	};
	return counts;
};
|
|
3083
|
+
/**
 * Creates a fresh zeroed counter for every rejection reason.
 */
const createInitialReasonCounts = () => {
	const reasons = [
		"authorityIntro",
		"intro",
		"pageContinuation",
		"previousChar",
		"previousWord",
		"qualifierTail",
		"stopLemma",
		"structuralLeak"
	];
	return Object.fromEntries(reasons.map((reason) => [reason, 0]));
};
|
|
3093
|
+
/**
 * Creates fresh zeroed accepted/rejected counters for every candidate family.
 * Each family gets its own counter object.
 */
const createInitialFamilyCounts = () => {
	const families = [
		"codeLine",
		"heading",
		"inlineSubentry",
		"lineEntry",
		"pairedForms"
	];
	return Object.fromEntries(families.map((family) => [family, {
		accepted: 0,
		rejected: 0
	}]));
};
|
|
3115
|
+
const countLemma = (map, lemma) => {
|
|
3116
|
+
if (!lemma) return;
|
|
3117
|
+
map.set(lemma, (map.get(lemma) ?? 0) + 1);
|
|
2457
3118
|
};
|
|
2458
3119
|
const pushDiagnosticSample = (samples, sampleLimit, sample) => {
|
|
2459
3120
|
if (samples.length < sampleLimit) samples.push(sample);
|
|
2460
3121
|
};
|
|
2461
3122
|
/**
 * Builds a minimal page map from a pages array for `diagnoseDictionaryProfile`,
 * which does not receive one from the segmenter.
 *
 * Pages are laid out back to back with a one-character gap between consecutive
 * pages. Each boundary spans `[start, end]` where `end - start` is the length
 * of the page's normalized content; the `end` of every page except the last is
 * also recorded as a page break.
 *
 * @param pages Source pages; only `id` is read.
 * @param normalizedContents Normalized content of each page, index-aligned with `pages`.
 * @returns `boundaries`, `pageBreaks`, `pageIds`, and `getId(offset)`, which
 *   binary-searches the boundaries, falls back to the last page for offsets
 *   outside every boundary, and returns 0 when there are no pages.
 */
const buildDiagnosticsPageMap = (pages, normalizedContents) => {
	const boundaries = [];
	const pageBreaks = [];
	const lastPageIndex = pages.length - 1;
	let cursor = 0;
	for (let pageIndex = 0; pageIndex <= lastPageIndex; pageIndex++) {
		const start = cursor;
		const end = start + normalizedContents[pageIndex].length;
		boundaries.push({
			end,
			id: pages[pageIndex].id,
			start
		});
		cursor = end;
		if (pageIndex < lastPageIndex) {
			pageBreaks.push(end);
			// Leave room for the separator between this page and the next.
			cursor += 1;
		}
	}
	const locate = (off) => {
		let low = 0;
		let high = boundaries.length - 1;
		while (low <= high) {
			const middle = (low + high) >>> 1;
			const candidate = boundaries[middle];
			if (off < candidate.start) high = middle - 1;
			else if (off > candidate.end) low = middle + 1;
			else return candidate;
		}
		return boundaries[boundaries.length - 1];
	};
	return {
		boundaries,
		getId: (off) => locate(off)?.id ?? 0,
		pageBreaks,
		pageIds: pages.map((page) => page.id)
	};
};
|
|
2489
3167
|
/**
|
|
2490
|
-
* Collects
|
|
3168
|
+
* Collects tuning-oriented diagnostics for a dictionary profile without creating
|
|
3169
|
+
* segments. This output is intended for profile authoring workflows rather than
|
|
3170
|
+
* long-term compatibility guarantees.
|
|
2491
3171
|
*
|
|
2492
3172
|
* This is useful when tuning blockers and family choices for a new dictionary.
|
|
2493
3173
|
*/
|
|
2494
3174
|
const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
|
|
2495
3175
|
const normalizedProfile = normalizeDictionaryProfile(profile);
|
|
2496
|
-
const
|
|
2497
|
-
|
|
2498
|
-
getId: (offset) => {
|
|
2499
|
-
for (const boundary of pageMap.boundaries) if (offset >= boundary.start && offset <= boundary.end) return boundary.id;
|
|
2500
|
-
return pageMap.boundaries.at(-1)?.id ?? 0;
|
|
2501
|
-
},
|
|
2502
|
-
pageBreaks: [],
|
|
2503
|
-
pageIds: pages.map((page) => page.id)
|
|
2504
|
-
};
|
|
2505
|
-
let offset = 0;
|
|
2506
|
-
const pageContexts = createPageContexts(pages, pageMap, pages.map((page, pageIndex) => {
|
|
2507
|
-
const normalized = normalizeLineEndings(page.content);
|
|
2508
|
-
pageMap.boundaries.push({
|
|
2509
|
-
end: offset + normalized.length,
|
|
2510
|
-
id: page.id,
|
|
2511
|
-
start: offset
|
|
2512
|
-
});
|
|
2513
|
-
if (pageIndex < pages.length - 1) {
|
|
2514
|
-
pageMap.pageBreaks.push(offset + normalized.length);
|
|
2515
|
-
offset += normalized.length + 1;
|
|
2516
|
-
} else offset += normalized.length;
|
|
2517
|
-
return normalized;
|
|
2518
|
-
}));
|
|
3176
|
+
const normalizedPages = pages.map((page) => normalizeLineEndings(page.content));
|
|
3177
|
+
const pageContexts = createPageContexts(pages, buildDiagnosticsPageMap(pages, normalizedPages), normalizedPages);
|
|
2519
3178
|
const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
|
|
2520
3179
|
const sampleLimit = options.sampleLimit ?? 50;
|
|
2521
3180
|
const acceptedKinds = createInitialKindCounts();
|
|
2522
|
-
const
|
|
3181
|
+
const rejectionReasons = createInitialReasonCounts();
|
|
2523
3182
|
const familyCounts = createInitialFamilyCounts();
|
|
2524
3183
|
const zoneCounts = {};
|
|
2525
3184
|
const rejectedLemmaCounts = /* @__PURE__ */ new Map();
|
|
@@ -2551,7 +3210,7 @@ const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
|
|
|
2551
3210
|
};
|
|
2552
3211
|
if (rejection) {
|
|
2553
3212
|
rejectedCount += 1;
|
|
2554
|
-
|
|
3213
|
+
rejectionReasons[rejection.reason] += 1;
|
|
2555
3214
|
familyCounts[candidate.family].rejected += 1;
|
|
2556
3215
|
zoneCounts[zone.name].rejected += 1;
|
|
2557
3216
|
countLemma(rejectedLemmaCounts, candidate.lemma);
|
|
@@ -2580,186 +3239,59 @@ const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
|
|
|
2580
3239
|
return {
|
|
2581
3240
|
acceptedCount,
|
|
2582
3241
|
acceptedKinds,
|
|
2583
|
-
blockerHits,
|
|
2584
3242
|
familyCounts,
|
|
2585
3243
|
pageCount: pages.length,
|
|
2586
3244
|
rejectedCount,
|
|
2587
3245
|
rejectedLemmas,
|
|
3246
|
+
rejectionReasons,
|
|
2588
3247
|
samples,
|
|
2589
3248
|
zoneCounts
|
|
2590
3249
|
};
|
|
2591
3250
|
};
|
|
2592
3251
|
//#endregion
|
|
2593
|
-
//#region src/
|
|
2594
|
-
const
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
const getPatternArray = (rule, key) => {
|
|
2604
|
-
const value = rule[key];
|
|
2605
|
-
return Array.isArray(value) ? value : [];
|
|
2606
|
-
};
|
|
2607
|
-
const getPatternString = (rule, key) => {
|
|
2608
|
-
const value = rule[key];
|
|
2609
|
-
return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
|
|
2610
|
-
};
|
|
2611
|
-
const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
|
|
2612
|
-
const getDictionaryEntrySpecificityScore = (rule) => {
|
|
2613
|
-
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
|
|
2614
|
-
const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
|
|
2615
|
-
return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
|
|
2616
|
-
};
|
|
2617
|
-
const getSpecificityScore = (rule) => {
|
|
2618
|
-
const key = getPatternKey(rule);
|
|
2619
|
-
if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
|
|
2620
|
-
return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
|
|
2621
|
-
};
|
|
2622
|
-
const createMergeKey = (rule) => {
|
|
2623
|
-
const key = getPatternKey(rule);
|
|
2624
|
-
const { [key]: _, ...rest } = rule;
|
|
2625
|
-
return `${key}|${JSON.stringify(rest)}`;
|
|
2626
|
-
};
|
|
2627
|
-
const optimizeRules = (rules) => {
|
|
2628
|
-
const output = [];
|
|
2629
|
-
const indexByMergeKey = /* @__PURE__ */ new Map();
|
|
2630
|
-
let mergedCount = 0;
|
|
2631
|
-
for (const rule of rules) {
|
|
2632
|
-
const key = getPatternKey(rule);
|
|
2633
|
-
if (!MERGEABLE_KEYS.has(key)) {
|
|
2634
|
-
output.push(rule);
|
|
2635
|
-
continue;
|
|
2636
|
-
}
|
|
2637
|
-
const mergeKey = createMergeKey(rule);
|
|
2638
|
-
const existingIndex = indexByMergeKey.get(mergeKey);
|
|
2639
|
-
if (existingIndex === void 0) {
|
|
2640
|
-
indexByMergeKey.set(mergeKey, output.length);
|
|
2641
|
-
output.push({
|
|
2642
|
-
...rule,
|
|
2643
|
-
[key]: normalizePatterns(getPatternArray(rule, key))
|
|
2644
|
-
});
|
|
2645
|
-
} else {
|
|
2646
|
-
const existing = output[existingIndex];
|
|
2647
|
-
existing[key] = normalizePatterns([...getPatternArray(existing, key), ...getPatternArray(rule, key)]);
|
|
2648
|
-
mergedCount++;
|
|
2649
|
-
}
|
|
2650
|
-
}
|
|
3252
|
+
//#region src/dictionary/runtime.ts
|
|
3253
|
+
const candidateToSplitPoint = (candidate, debugMetaKey) => {
|
|
3254
|
+
const baseMeta = candidate.lemma ? {
|
|
3255
|
+
kind: candidate.kind,
|
|
3256
|
+
lemma: candidate.lemma
|
|
3257
|
+
} : { kind: candidate.kind };
|
|
3258
|
+
const meta = debugMetaKey === void 0 ? baseMeta : mergeDebugIntoMeta(baseMeta, debugMetaKey, { dictionary: {
|
|
3259
|
+
family: candidate.family,
|
|
3260
|
+
...candidate.headingClass ? { headingClass: candidate.headingClass } : {}
|
|
3261
|
+
} });
|
|
2651
3262
|
return {
|
|
2652
|
-
|
|
2653
|
-
|
|
2654
|
-
|
|
2655
|
-
};
|
|
2656
|
-
//#endregion
|
|
2657
|
-
//#region src/preprocessing/transforms.ts
|
|
2658
|
-
/** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
|
|
2659
|
-
const assertNever = (x) => {
|
|
2660
|
-
throw new Error(`Unknown preprocess transform type: ${JSON.stringify(x)}`);
|
|
2661
|
-
};
|
|
2662
|
-
/** Check if a character is whitespace (space, newline, tab, etc.) */
|
|
2663
|
-
const isWhitespace = (char) => /\s/.test(char);
|
|
2664
|
-
/**
|
|
2665
|
-
* Check if a character code is a zero-width control character.
|
|
2666
|
-
*
|
|
2667
|
-
* Covers:
|
|
2668
|
-
* - U+200B–U+200F (Zero Width Space, Joiners, Direction Marks)
|
|
2669
|
-
* - U+202A–U+202E (Bidirectional Formatting)
|
|
2670
|
-
* - U+2060–U+2064 (Word Joiner, Invisible Operators)
|
|
2671
|
-
* - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
|
|
2672
|
-
*/
|
|
2673
|
-
const isZeroWidth = (code) => code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
|
|
2674
|
-
/**
|
|
2675
|
-
* Remove zero-width control characters from text.
|
|
2676
|
-
*
|
|
2677
|
-
* @param text - Input text
|
|
2678
|
-
* @param mode - 'strip' (default) removes entirely, 'space' replaces with space
|
|
2679
|
-
* @returns Text with zero-width characters removed or replaced
|
|
2680
|
-
*/
|
|
2681
|
-
const removeZeroWidth = (text, mode = "strip") => {
|
|
2682
|
-
if (mode === "space") {
|
|
2683
|
-
const parts = [];
|
|
2684
|
-
let lastWasWhitespace = true;
|
|
2685
|
-
for (let i = 0; i < text.length; i++) if (isZeroWidth(text.charCodeAt(i))) {
|
|
2686
|
-
if (!lastWasWhitespace && parts.length > 0) {
|
|
2687
|
-
parts.push(" ");
|
|
2688
|
-
lastWasWhitespace = true;
|
|
2689
|
-
}
|
|
2690
|
-
} else {
|
|
2691
|
-
const char = text[i];
|
|
2692
|
-
parts.push(char);
|
|
2693
|
-
lastWasWhitespace = isWhitespace(char);
|
|
2694
|
-
}
|
|
2695
|
-
return parts.join("");
|
|
2696
|
-
}
|
|
2697
|
-
return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
|
|
2698
|
-
};
|
|
2699
|
-
/**
|
|
2700
|
-
* Condense multiple periods (...) into ellipsis character (…).
|
|
2701
|
-
*
|
|
2702
|
-
* Prevents `{{tarqim}}` from false-matching inside ellipsis since
|
|
2703
|
-
* the `.` in tarqim matches individual periods.
|
|
2704
|
-
*
|
|
2705
|
-
* @param text - Input text
|
|
2706
|
-
* @returns Text with period sequences replaced by ellipsis
|
|
2707
|
-
*/
|
|
2708
|
-
const condenseEllipsis = (text) => text.replace(/\.{2,}/g, "…");
|
|
2709
|
-
/**
|
|
2710
|
-
* Join trailing و (waw) to the next word.
|
|
2711
|
-
*
|
|
2712
|
-
* Fixes OCR/digitization artifacts: ' و ' → ' و' (waw joined to next word)
|
|
2713
|
-
*
|
|
2714
|
-
* @param text - Input text
|
|
2715
|
-
* @returns Text with trailing waw joined to following word
|
|
2716
|
-
*/
|
|
2717
|
-
const fixTrailingWaw = (text) => text.replace(/ و /g, " و");
|
|
2718
|
-
/**
|
|
2719
|
-
* Check if a page ID is within a constraint range.
|
|
2720
|
-
*/
|
|
2721
|
-
const isInRange = (pageId, constraint) => {
|
|
2722
|
-
if (constraint.min !== void 0 && pageId < constraint.min) return false;
|
|
2723
|
-
if (constraint.max !== void 0 && pageId > constraint.max) return false;
|
|
2724
|
-
return true;
|
|
2725
|
-
};
|
|
2726
|
-
/**
|
|
2727
|
-
* Normalize a transform to its object form.
|
|
2728
|
-
*/
|
|
2729
|
-
const normalizeTransform = (transform) => {
|
|
2730
|
-
if (typeof transform === "string") return { type: transform };
|
|
2731
|
-
return transform;
|
|
3263
|
+
contentStartOffset: candidate.contentStartOffset,
|
|
3264
|
+
index: candidate.absoluteIndex,
|
|
3265
|
+
meta
|
|
3266
|
+
};
|
|
2732
3267
|
};
|
|
2733
3268
|
/**
|
|
2734
|
-
*
|
|
2735
|
-
*
|
|
2736
|
-
* Transforms run in array order. Each can be limited to specific pages
|
|
2737
|
-
* via `min`/`max` constraints.
|
|
2738
|
-
*
|
|
2739
|
-
* @param content - Page content to transform
|
|
2740
|
-
* @param pageId - Page ID for constraint checking
|
|
2741
|
-
* @param transforms - Array of transforms to apply
|
|
2742
|
-
* @returns Transformed content
|
|
3269
|
+
* Collects dictionary-profile split points using the pages-only markdown surface.
|
|
2743
3270
|
*/
|
|
2744
|
-
const
|
|
2745
|
-
|
|
2746
|
-
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
2756
|
-
|
|
2757
|
-
|
|
2758
|
-
|
|
2759
|
-
|
|
3271
|
+
const collectDictionarySplitPoints = (pages, profile, pageMap, normalizedPages, logger, debugMetaKey) => {
|
|
3272
|
+
const normalizedProfile = normalizeDictionaryProfile(profile);
|
|
3273
|
+
const pageContexts = createPageContexts(pages, pageMap, normalizedPages);
|
|
3274
|
+
const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
|
|
3275
|
+
const splitPoints = [];
|
|
3276
|
+
logger?.debug?.("[dictionary] collecting split points", {
|
|
3277
|
+
pageCount: pages.length,
|
|
3278
|
+
zoneCount: normalizedProfile.zones.length
|
|
3279
|
+
});
|
|
3280
|
+
for (const pageContext of pageContexts) {
|
|
3281
|
+
const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
|
|
3282
|
+
if (!zone) continue;
|
|
3283
|
+
for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
|
|
3284
|
+
const line = pageContext.lines[lineIndex];
|
|
3285
|
+
const nextLine = pageContext.lines[lineIndex + 1];
|
|
3286
|
+
const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
|
|
3287
|
+
for (const candidate of candidates) {
|
|
3288
|
+
if (shouldRejectCandidate(candidate, zone, pageContext, pageContexts)) continue;
|
|
3289
|
+
splitPoints.push(candidateToSplitPoint(candidate, debugMetaKey));
|
|
3290
|
+
}
|
|
2760
3291
|
}
|
|
2761
3292
|
}
|
|
2762
|
-
|
|
3293
|
+
logger?.debug?.("[dictionary] collected split points", { splitPointCount: splitPoints.length });
|
|
3294
|
+
return splitPoints;
|
|
2763
3295
|
};
|
|
2764
3296
|
const WINDOW_PREFIX_LENGTHS = [
|
|
2765
3297
|
80,
|
|
@@ -3656,219 +4188,16 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
|
|
|
3656
4188
|
*
|
|
3657
4189
|
* @param content The text content
|
|
3658
4190
|
* @param targetPosition The desired split position (hard limit)
|
|
3659
|
-
* @param lookbackChars How far back to search for a safe break
|
|
3660
|
-
* @returns The new split position (index), or -1 if no safe break found
|
|
3661
|
-
*/
|
|
3662
|
-
const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
|
|
3663
|
-
const startSearch = Math.max(0, targetPosition - lookbackChars);
|
|
3664
|
-
for (let i = targetPosition - 1; i >= startSearch; i--) {
|
|
3665
|
-
const char = content[i];
|
|
3666
|
-
if (STOP_CHARACTERS.test(char)) return i + 1;
|
|
3667
|
-
}
|
|
3668
|
-
return -1;
|
|
3669
|
-
};
|
|
3670
|
-
//#endregion
|
|
3671
|
-
//#region src/segmentation/pattern-validator.ts
|
|
3672
|
-
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
3673
|
-
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
3674
|
-
const BARE_TOKEN_REGEX = (() => {
|
|
3675
|
-
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
|
|
3676
|
-
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
|
|
3677
|
-
})();
|
|
3678
|
-
const createMalformedTokenIssue = (tokenLiteral, side) => {
|
|
3679
|
-
const token = tokenLiteral.split(":", 1)[0] || void 0;
|
|
3680
|
-
return {
|
|
3681
|
-
message: `Token "${tokenLiteral || "unknown"}" appears to be missing ${side} braces.`,
|
|
3682
|
-
suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
|
|
3683
|
-
token,
|
|
3684
|
-
type: "missing_braces"
|
|
3685
|
-
};
|
|
3686
|
-
};
|
|
3687
|
-
const detectMalformedLeftToken = (pattern) => {
|
|
3688
|
-
for (let index = 0; index < pattern.length - 1; index++) {
|
|
3689
|
-
if (pattern.slice(index, index + 2) !== "{{") continue;
|
|
3690
|
-
const closeIndex = pattern.indexOf("}}", index + 2);
|
|
3691
|
-
if (closeIndex === -1) return createMalformedTokenIssue(pattern.slice(index + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "", "closing");
|
|
3692
|
-
index = closeIndex + 1;
|
|
3693
|
-
}
|
|
3694
|
-
};
|
|
3695
|
-
const detectMalformedRightToken = (pattern) => {
|
|
3696
|
-
for (let index = 0; index < pattern.length - 1; index++) {
|
|
3697
|
-
if (pattern.slice(index, index + 2) !== "}}") continue;
|
|
3698
|
-
if (pattern.lastIndexOf("{{", index) === -1) return createMalformedTokenIssue(pattern.slice(0, index).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "", "opening");
|
|
3699
|
-
}
|
|
3700
|
-
};
|
|
3701
|
-
const detectMalformedToken = (pattern) => detectMalformedLeftToken(pattern) ?? detectMalformedRightToken(pattern);
|
|
3702
|
-
/**
|
|
3703
|
-
* Validates a single pattern for common issues.
|
|
3704
|
-
*/
|
|
3705
|
-
const validatePattern = (pattern, seenPatterns) => {
|
|
3706
|
-
if (!pattern.trim()) return {
|
|
3707
|
-
message: "Empty pattern is not allowed",
|
|
3708
|
-
type: "empty_pattern"
|
|
3709
|
-
};
|
|
3710
|
-
if (seenPatterns.has(pattern)) return {
|
|
3711
|
-
message: `Duplicate pattern: "${pattern}"`,
|
|
3712
|
-
pattern,
|
|
3713
|
-
type: "duplicate"
|
|
3714
|
-
};
|
|
3715
|
-
seenPatterns.add(pattern);
|
|
3716
|
-
TOKEN_INSIDE_BRACES.lastIndex = 0;
|
|
3717
|
-
for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
|
|
3718
|
-
const name = match[1];
|
|
3719
|
-
if (name && !KNOWN_TOKENS.has(name)) return {
|
|
3720
|
-
message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
|
|
3721
|
-
suggestion: "Check spelling or use a known token",
|
|
3722
|
-
token: name,
|
|
3723
|
-
type: "unknown_token"
|
|
3724
|
-
};
|
|
3725
|
-
}
|
|
3726
|
-
const malformed = detectMalformedToken(pattern);
|
|
3727
|
-
if (malformed) return malformed;
|
|
3728
|
-
for (const match of pattern.matchAll(BARE_TOKEN_REGEX)) {
|
|
3729
|
-
const [full, name] = match;
|
|
3730
|
-
const idx = match.index;
|
|
3731
|
-
if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
|
|
3732
|
-
message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
|
|
3733
|
-
suggestion: `{{${full}}}`,
|
|
3734
|
-
token: name,
|
|
3735
|
-
type: "missing_braces"
|
|
3736
|
-
};
|
|
3737
|
-
}
|
|
3738
|
-
};
|
|
3739
|
-
/**
|
|
3740
|
-
* Validates an array of patterns, returning parallel array of issues.
|
|
3741
|
-
*/
|
|
3742
|
-
const validatePatternArray = (patterns) => {
|
|
3743
|
-
const seen = /* @__PURE__ */ new Set();
|
|
3744
|
-
const issues = patterns.map((p) => validatePattern(p, seen));
|
|
3745
|
-
return issues.some(Boolean) ? issues : void 0;
|
|
3746
|
-
};
|
|
3747
|
-
const applyRulePatternValidation = (result, key, patterns) => {
|
|
3748
|
-
if (!patterns) return false;
|
|
3749
|
-
const issues = validatePatternArray(patterns);
|
|
3750
|
-
if (!issues) return false;
|
|
3751
|
-
result[key] = issues;
|
|
3752
|
-
return true;
|
|
3753
|
-
};
|
|
3754
|
-
const validateTemplateRule = (rule, result) => {
|
|
3755
|
-
if (!("template" in rule)) return false;
|
|
3756
|
-
const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
|
|
3757
|
-
if (!issue) return false;
|
|
3758
|
-
result.template = issue;
|
|
3759
|
-
return true;
|
|
3760
|
-
};
|
|
3761
|
-
const validateRegexRule = (rule, result) => {
|
|
3762
|
-
if (!("regex" in rule)) return false;
|
|
3763
|
-
if (!rule.regex.trim()) {
|
|
3764
|
-
result.regex = {
|
|
3765
|
-
message: "Empty pattern is not allowed",
|
|
3766
|
-
type: "empty_pattern"
|
|
3767
|
-
};
|
|
3768
|
-
return true;
|
|
3769
|
-
}
|
|
3770
|
-
try {
|
|
3771
|
-
new RegExp(rule.regex, "u");
|
|
3772
|
-
return false;
|
|
3773
|
-
} catch (error) {
|
|
3774
|
-
result.regex = {
|
|
3775
|
-
message: error instanceof Error ? error.message : String(error),
|
|
3776
|
-
pattern: rule.regex,
|
|
3777
|
-
type: "invalid_regex"
|
|
3778
|
-
};
|
|
3779
|
-
return true;
|
|
3780
|
-
}
|
|
3781
|
-
};
|
|
3782
|
-
const invalidDictionaryEntryIssue = (message) => ({
|
|
3783
|
-
message,
|
|
3784
|
-
type: "invalid_option"
|
|
3785
|
-
});
|
|
3786
|
-
const addBooleanDictionaryEntryIssue = (issues, key, value) => {
|
|
3787
|
-
if (value !== void 0 && typeof value !== "boolean") issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
|
|
3788
|
-
};
|
|
3789
|
-
const addCaptureNameIssue = (issues, captureName) => {
|
|
3790
|
-
if (captureName !== void 0 && !/^[A-Za-z_]\w*$/.test(captureName)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
|
|
3791
|
-
};
|
|
3792
|
-
const addMinLettersIssue = (issues, minLetters) => {
|
|
3793
|
-
if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
|
|
3794
|
-
};
|
|
3795
|
-
const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
|
|
3796
|
-
const min = minLetters ?? 2;
|
|
3797
|
-
if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < min)) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${min}`);
|
|
3798
|
-
};
|
|
3799
|
-
const validateDictionaryEntryRule = (rule, result) => {
|
|
3800
|
-
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
|
|
3801
|
-
const issues = {};
|
|
3802
|
-
const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
|
|
3803
|
-
if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
|
|
3804
|
-
addBooleanDictionaryEntryIssue(issues, "allowCommaSeparated", allowCommaSeparated);
|
|
3805
|
-
addBooleanDictionaryEntryIssue(issues, "allowParenthesized", allowParenthesized);
|
|
3806
|
-
addBooleanDictionaryEntryIssue(issues, "allowWhitespaceBeforeColon", allowWhitespaceBeforeColon);
|
|
3807
|
-
addBooleanDictionaryEntryIssue(issues, "midLineSubentries", midLineSubentries);
|
|
3808
|
-
addCaptureNameIssue(issues, captureName);
|
|
3809
|
-
addMinLettersIssue(issues, minLetters);
|
|
3810
|
-
addMaxLettersIssue(issues, maxLetters, minLetters);
|
|
3811
|
-
if (Object.keys(issues).length === 0) return false;
|
|
3812
|
-
result.dictionaryEntry = issues;
|
|
3813
|
-
return true;
|
|
3814
|
-
};
|
|
3815
|
-
const formatValidationIssue = (_type, issue, loc) => {
|
|
3816
|
-
if (!issue) return null;
|
|
3817
|
-
if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
|
|
3818
|
-
if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
|
|
3819
|
-
if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
|
|
3820
|
-
if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
|
|
3821
|
-
return `${loc}: ${issue.message || issue.type}`;
|
|
3822
|
-
};
|
|
3823
|
-
/**
|
|
3824
|
-
* Validates split rules for common pattern issues.
|
|
3825
|
-
*
|
|
3826
|
-
* Checks for:
|
|
3827
|
-
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
|
|
3828
|
-
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
|
|
3829
|
-
* - Duplicate patterns within the same rule
|
|
3830
|
-
*
|
|
3831
|
-
* @param rules - Array of split rules to validate
|
|
3832
|
-
* @returns Array parallel to input with validation results (undefined if no issues)
|
|
3833
|
-
*
|
|
3834
|
-
* @example
|
|
3835
|
-
* const issues = validateRules([
|
|
3836
|
-
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
|
|
3837
|
-
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
|
|
3838
|
-
* ]);
|
|
3839
|
-
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
|
|
3840
|
-
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
|
|
3841
|
-
*/
|
|
3842
|
-
const validateRules = (rules) => rules.map((rule) => {
|
|
3843
|
-
const result = {};
|
|
3844
|
-
const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", "lineStartsWith" in rule ? rule.lineStartsWith : void 0);
|
|
3845
|
-
const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0);
|
|
3846
|
-
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", "lineEndsWith" in rule ? rule.lineEndsWith : void 0);
|
|
3847
|
-
const templateIssues = validateTemplateRule(rule, result);
|
|
3848
|
-
const regexIssues = validateRegexRule(rule, result);
|
|
3849
|
-
const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
|
|
3850
|
-
return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
|
|
3851
|
-
});
|
|
3852
|
-
/**
|
|
3853
|
-
* Formats a validation result array into a list of human-readable error messages.
|
|
3854
|
-
*
|
|
3855
|
-
* Useful for displaying validation errors in UIs.
|
|
3856
|
-
*
|
|
3857
|
-
* @param results - The result array from `validateRules()`
|
|
3858
|
-
* @returns Array of formatted error strings
|
|
3859
|
-
*
|
|
3860
|
-
* @example
|
|
3861
|
-
* const issues = validateRules(rules);
|
|
3862
|
-
* const errors = formatValidationReport(issues);
|
|
3863
|
-
* // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
|
|
4191
|
+
* @param lookbackChars How far back to search for a safe break
|
|
4192
|
+
* @returns The new split position (index), or -1 if no safe break found
|
|
3864
4193
|
*/
|
|
3865
|
-
const
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
return
|
|
4194
|
+
const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
|
|
4195
|
+
const startSearch = Math.max(0, targetPosition - lookbackChars);
|
|
4196
|
+
for (let i = targetPosition - 1; i >= startSearch; i--) {
|
|
4197
|
+
const char = content[i];
|
|
4198
|
+
if (STOP_CHARACTERS.test(char)) return i + 1;
|
|
4199
|
+
}
|
|
4200
|
+
return -1;
|
|
3872
4201
|
};
|
|
3873
4202
|
//#endregion
|
|
3874
4203
|
//#region src/segmentation/breakpoint-processor.ts
|
|
@@ -4130,7 +4459,6 @@ const computeIterationWindow = (fullContent, cursorPos, currentFromIdx, fromIdx,
|
|
|
4130
4459
|
const sliceEnd = Math.max(cursorPos + 1, Math.min(sliceEndByPages, sliceEndByLength));
|
|
4131
4460
|
return {
|
|
4132
4461
|
remainingContent: fullContent.slice(cursorPos, sliceEnd),
|
|
4133
|
-
sliceEnd,
|
|
4134
4462
|
windowEndIdx
|
|
4135
4463
|
};
|
|
4136
4464
|
};
|
|
@@ -4161,31 +4489,87 @@ const updateLastBreakpointFromFound = (found, lastBreakpoint) => {
|
|
|
4161
4489
|
};
|
|
4162
4490
|
return lastBreakpoint;
|
|
4163
4491
|
};
|
|
4164
|
-
const
|
|
4165
|
-
|
|
4166
|
-
|
|
4167
|
-
|
|
4492
|
+
const buildIterativeContext = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
|
|
4493
|
+
const fullContent = segment.content;
|
|
4494
|
+
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
|
|
4495
|
+
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
4496
|
+
boundaryPositions,
|
|
4497
|
+
fromIdx,
|
|
4498
|
+
fullContentLength: fullContent.length,
|
|
4499
|
+
toIdx
|
|
4500
|
+
});
|
|
4501
|
+
return {
|
|
4502
|
+
boundaryPositions,
|
|
4503
|
+
cumulativeOffsets,
|
|
4504
|
+
debugMetaKey,
|
|
4505
|
+
expandedBreakpoints,
|
|
4506
|
+
fromIdx,
|
|
4507
|
+
fullContent,
|
|
4508
|
+
logger,
|
|
4509
|
+
maxContentLength,
|
|
4510
|
+
maxPages,
|
|
4511
|
+
normalizedPages,
|
|
4512
|
+
pageIds,
|
|
4513
|
+
prefer,
|
|
4514
|
+
segment,
|
|
4515
|
+
toIdx
|
|
4516
|
+
};
|
|
4517
|
+
};
|
|
4518
|
+
const createInitialIterativeState = (fromIdx) => ({
|
|
4519
|
+
currentFromIdx: fromIdx,
|
|
4520
|
+
cursorPos: 0,
|
|
4521
|
+
isFirstPiece: true,
|
|
4522
|
+
lastBreakpoint: null
|
|
4523
|
+
});
|
|
4524
|
+
const hasIterationWorkRemaining = (state, context) => state.cursorPos < context.fullContent.length && state.currentFromIdx <= context.toIdx;
|
|
4525
|
+
const prepareIteration = (context, state) => {
|
|
4526
|
+
if (!hasIterationWorkRemaining(state, context)) return null;
|
|
4527
|
+
const { remainingContent, windowEndIdx } = computeIterationWindow(context.fullContent, state.cursorPos, state.currentFromIdx, context.fromIdx, context.toIdx, context.pageIds, context.boundaryPositions, context.maxPages, context.maxContentLength);
|
|
4528
|
+
if (!remainingContent.trim()) return null;
|
|
4529
|
+
const actualRemainingContent = context.fullContent.slice(state.cursorPos);
|
|
4530
|
+
const actualEndPos = Math.max(state.cursorPos, context.fullContent.length - 1);
|
|
4531
|
+
return {
|
|
4532
|
+
actualRemainingContent,
|
|
4533
|
+
actualRemainingEndIdx: Math.min(findPageIndexForPosition(actualEndPos, context.boundaryPositions, context.fromIdx), context.toIdx),
|
|
4534
|
+
remainingContent,
|
|
4535
|
+
windowEndIdx,
|
|
4536
|
+
windowEndPosition: computeWindowEndPositionForIteration(remainingContent, state.cursorPos, state.currentFromIdx, context.fromIdx, windowEndIdx, context.toIdx, context.pageIds, context.boundaryPositions, context.normalizedPages, context.cumulativeOffsets, context.maxPages, context.maxContentLength, context.logger)
|
|
4537
|
+
};
|
|
4538
|
+
};
|
|
4539
|
+
const buildPageBoundaryBreakpoint = (context, state) => {
|
|
4540
|
+
const pageBoundaryIdx = context.expandedBreakpoints.findIndex((bp) => bp.regex === null);
|
|
4541
|
+
return pageBoundaryIdx >= 0 ? {
|
|
4542
|
+
breakpointIndex: pageBoundaryIdx,
|
|
4543
|
+
rule: { pattern: "" }
|
|
4544
|
+
} : state.lastBreakpoint;
|
|
4545
|
+
};
|
|
4546
|
+
const appendPieceAndAdvance = (context, state, breakPos, pieceContent, result, contentLengthSplit) => {
|
|
4547
|
+
let { actualEndIdx, actualStartIdx } = computePiecePages(state.cursorPos, breakPos, context.boundaryPositions, context.fromIdx, context.toIdx);
|
|
4548
|
+
if (actualStartIdx < state.currentFromIdx) {
|
|
4549
|
+
context.logger?.warn?.("[breakpoints] Page attribution drift detected; clamping actualStartIdx", {
|
|
4168
4550
|
actualStartIdx,
|
|
4169
|
-
currentFromIdx
|
|
4551
|
+
currentFromIdx: state.currentFromIdx
|
|
4170
4552
|
});
|
|
4171
|
-
actualStartIdx = currentFromIdx;
|
|
4553
|
+
actualStartIdx = state.currentFromIdx;
|
|
4172
4554
|
}
|
|
4173
|
-
if (maxPages === 0) {
|
|
4174
|
-
actualEndIdx = Math.min(actualEndIdx, currentFromIdx);
|
|
4175
|
-
actualStartIdx = Math.min(actualStartIdx, currentFromIdx);
|
|
4176
|
-
} else if (maxPages > 0) {
|
|
4177
|
-
const maxAllowedEndIdx = computeWindowEndIdx(actualStartIdx, toIdx, pageIds, maxPages);
|
|
4555
|
+
if (context.maxPages === 0) {
|
|
4556
|
+
actualEndIdx = Math.min(actualEndIdx, state.currentFromIdx);
|
|
4557
|
+
actualStartIdx = Math.min(actualStartIdx, state.currentFromIdx);
|
|
4558
|
+
} else if (context.maxPages > 0) {
|
|
4559
|
+
const maxAllowedEndIdx = computeWindowEndIdx(actualStartIdx, context.toIdx, context.pageIds, context.maxPages);
|
|
4178
4560
|
actualEndIdx = Math.min(actualEndIdx, maxAllowedEndIdx);
|
|
4179
4561
|
}
|
|
4180
|
-
const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey,
|
|
4181
|
-
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, true);
|
|
4562
|
+
const meta = getSegmentMetaWithDebug(state.isFirstPiece, context.debugMetaKey, context.segment.meta, state.lastBreakpoint, contentLengthSplit);
|
|
4563
|
+
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, context.pageIds, meta, true);
|
|
4182
4564
|
if (pieceSeg) result.push(pieceSeg);
|
|
4183
|
-
const next = advanceCursorAndIndex(fullContent, breakPos, actualEndIdx, toIdx, pageIds, normalizedPages);
|
|
4565
|
+
const next = advanceCursorAndIndex(context.fullContent, breakPos, actualEndIdx, context.toIdx, context.pageIds, context.normalizedPages);
|
|
4184
4566
|
let nextFromIdx = next.currentFromIdx;
|
|
4185
|
-
if (maxPages === 0) nextFromIdx = findPageIndexForPosition(next.cursorPos, boundaryPositions, fromIdx);
|
|
4567
|
+
if (context.maxPages === 0) nextFromIdx = findPageIndexForPosition(next.cursorPos, context.boundaryPositions, context.fromIdx);
|
|
4186
4568
|
return {
|
|
4569
|
+
...state,
|
|
4187
4570
|
currentFromIdx: nextFromIdx,
|
|
4188
|
-
cursorPos: next.cursorPos
|
|
4571
|
+
cursorPos: next.cursorPos,
|
|
4572
|
+
isFirstPiece: false
|
|
4189
4573
|
};
|
|
4190
4574
|
};
|
|
4191
4575
|
const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength) => {
|
|
@@ -4201,109 +4585,84 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
|
|
|
4201
4585
|
* For maxPages=0 with maxContentLength: if current page's remaining content fits,
|
|
4202
4586
|
* create a segment and advance to next page without applying breakpoints.
|
|
4203
4587
|
*/
|
|
4204
|
-
const tryHandleCurrentPageFit = (
|
|
4205
|
-
if (maxPages !== 0 || !maxContentLength || currentFromIdx >= actualRemainingEndIdx) return
|
|
4206
|
-
const
|
|
4207
|
-
const
|
|
4208
|
-
|
|
4209
|
-
|
|
4210
|
-
const
|
|
4211
|
-
|
|
4212
|
-
|
|
4213
|
-
const pageBoundaryBreakpoint =
|
|
4214
|
-
|
|
4215
|
-
|
|
4216
|
-
|
|
4217
|
-
const includeMeta = isFirstPiece || Boolean(debugMetaKey);
|
|
4218
|
-
const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segmentMeta, pageBoundaryBreakpoint);
|
|
4219
|
-
const seg = createSegment(currentPageRemainingContent, pageIds[currentFromIdx], void 0, includeMeta ? meta : void 0);
|
|
4588
|
+
const tryHandleCurrentPageFit = (context, state, actualRemainingEndIdx, result) => {
|
|
4589
|
+
if (context.maxPages !== 0 || !context.maxContentLength || state.currentFromIdx >= actualRemainingEndIdx) return null;
|
|
4590
|
+
const boundaryIdx = state.currentFromIdx - context.fromIdx + 1;
|
|
4591
|
+
const currentPageEndPos = context.boundaryPositions[boundaryIdx] ?? context.fullContent.length;
|
|
4592
|
+
const currentPageRemainingContent = context.fullContent.slice(state.cursorPos, currentPageEndPos).trim();
|
|
4593
|
+
if (!currentPageRemainingContent) return null;
|
|
4594
|
+
const currentPageFitsInLength = currentPageRemainingContent.length <= context.maxContentLength;
|
|
4595
|
+
const currentPageHasExclusions = hasAnyExclusionsInRange(context.expandedBreakpoints, context.pageIds, state.currentFromIdx, state.currentFromIdx);
|
|
4596
|
+
if (!currentPageFitsInLength || currentPageHasExclusions) return null;
|
|
4597
|
+
const pageBoundaryBreakpoint = buildPageBoundaryBreakpoint(context, state);
|
|
4598
|
+
const includeMeta = state.isFirstPiece || Boolean(context.debugMetaKey);
|
|
4599
|
+
const meta = getSegmentMetaWithDebug(state.isFirstPiece, context.debugMetaKey, context.segment.meta, pageBoundaryBreakpoint);
|
|
4600
|
+
const seg = createSegment(currentPageRemainingContent, context.pageIds[state.currentFromIdx], void 0, includeMeta ? meta : void 0);
|
|
4220
4601
|
if (seg) result.push(seg);
|
|
4221
|
-
let newCursorPos = currentPageEndPos;
|
|
4222
|
-
while (newCursorPos < fullContent.length && /\s/.test(fullContent[newCursorPos])) newCursorPos++;
|
|
4223
4602
|
return {
|
|
4224
|
-
|
|
4225
|
-
|
|
4226
|
-
|
|
4227
|
-
|
|
4603
|
+
...state,
|
|
4604
|
+
currentFromIdx: state.currentFromIdx + 1,
|
|
4605
|
+
cursorPos: skipWhitespace(context.fullContent, currentPageEndPos),
|
|
4606
|
+
isFirstPiece: false,
|
|
4607
|
+
lastBreakpoint: pageBoundaryBreakpoint
|
|
4608
|
+
};
|
|
4609
|
+
};
|
|
4610
|
+
const tryFinalizeIteration = (context, state, prepared, result) => handleOversizedSegmentFit(prepared.actualRemainingContent, state.currentFromIdx, prepared.actualRemainingEndIdx, context.pageIds, context.expandedBreakpoints, context.maxPages, context.maxContentLength, state.isFirstPiece, context.debugMetaKey, context.segment.meta, state.lastBreakpoint, result);
|
|
4611
|
+
const applyBreakpointToIteration = (context, state, prepared, iteration, result) => {
|
|
4612
|
+
context.logger?.trace?.(`[breakpoints] iteration=${iteration}`, {
|
|
4613
|
+
currentFromIdx: state.currentFromIdx,
|
|
4614
|
+
cursorPos: state.cursorPos,
|
|
4615
|
+
windowEndIdx: prepared.windowEndIdx,
|
|
4616
|
+
windowEndPosition: prepared.windowEndPosition
|
|
4617
|
+
});
|
|
4618
|
+
const found = findBreakOffsetForWindow(prepared.remainingContent, state.currentFromIdx, prepared.windowEndIdx, context.toIdx, prepared.windowEndPosition, context.pageIds, context.expandedBreakpoints, context.cumulativeOffsets, context.normalizedPages, context.prefer, context.maxContentLength);
|
|
4619
|
+
const breakOffset = ensureProgressingBreakOffset(found.breakOffset, prepared.remainingContent, state.cursorPos, context.maxContentLength, context.logger);
|
|
4620
|
+
const nextState = {
|
|
4621
|
+
...state,
|
|
4622
|
+
lastBreakpoint: updateLastBreakpointFromFound(found, state.lastBreakpoint)
|
|
4623
|
+
};
|
|
4624
|
+
const breakPos = state.cursorPos + breakOffset;
|
|
4625
|
+
const pieceContent = context.fullContent.slice(state.cursorPos, breakPos).trim();
|
|
4626
|
+
if (!pieceContent) return {
|
|
4627
|
+
...nextState,
|
|
4628
|
+
cursorPos: breakPos,
|
|
4629
|
+
isFirstPiece: false
|
|
4228
4630
|
};
|
|
4631
|
+
return appendPieceAndAdvance(context, nextState, breakPos, pieceContent, result, found.contentLengthSplit);
|
|
4229
4632
|
};
|
|
4230
4633
|
const processOversizedSegmentIterative = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
|
|
4231
4634
|
const result = [];
|
|
4232
|
-
const fullContent = segment.content;
|
|
4233
4635
|
const pageCount = toIdx - fromIdx + 1;
|
|
4234
4636
|
logger?.debug?.("[breakpoints] processOversizedSegment: Using iterative path", {
|
|
4235
|
-
contentLength:
|
|
4637
|
+
contentLength: segment.content.length,
|
|
4236
4638
|
fromIdx,
|
|
4237
4639
|
maxContentLength,
|
|
4238
4640
|
maxPages,
|
|
4239
4641
|
pageCount,
|
|
4240
4642
|
toIdx
|
|
4241
4643
|
});
|
|
4242
|
-
|
|
4243
|
-
let
|
|
4244
|
-
let
|
|
4245
|
-
|
|
4246
|
-
|
|
4247
|
-
|
|
4248
|
-
|
|
4249
|
-
|
|
4250
|
-
fullContentLength: fullContent.length,
|
|
4251
|
-
toIdx
|
|
4252
|
-
});
|
|
4253
|
-
const MAX_SAFE_ITERATIONS = 1e5;
|
|
4254
|
-
let didHitMaxIterations = true;
|
|
4255
|
-
for (let i = 1; i <= MAX_SAFE_ITERATIONS; i++) {
|
|
4256
|
-
if (cursorPos >= fullContent.length || currentFromIdx > toIdx) {
|
|
4257
|
-
didHitMaxIterations = false;
|
|
4258
|
-
break;
|
|
4259
|
-
}
|
|
4260
|
-
const { remainingContent, windowEndIdx } = computeIterationWindow(fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength);
|
|
4261
|
-
if (!remainingContent.trim()) {
|
|
4262
|
-
didHitMaxIterations = false;
|
|
4263
|
-
break;
|
|
4264
|
-
}
|
|
4265
|
-
const actualRemainingContent = fullContent.slice(cursorPos);
|
|
4266
|
-
const actualEndPos = Math.max(cursorPos, fullContent.length - 1);
|
|
4267
|
-
const actualRemainingEndIdx = Math.min(findPageIndexForPosition(actualEndPos, boundaryPositions, fromIdx), toIdx);
|
|
4268
|
-
const currentPageFit = tryHandleCurrentPageFit(fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result);
|
|
4269
|
-
if (currentPageFit.handled) {
|
|
4270
|
-
cursorPos = currentPageFit.newCursorPos;
|
|
4271
|
-
currentFromIdx = currentPageFit.newFromIdx;
|
|
4272
|
-
lastBreakpoint = currentPageFit.newLastBreakpoint;
|
|
4273
|
-
isFirstPiece = false;
|
|
4644
|
+
const context = buildIterativeContext(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
|
|
4645
|
+
let state = createInitialIterativeState(fromIdx);
|
|
4646
|
+
for (let iteration = 1;; iteration++) {
|
|
4647
|
+
const prepared = prepareIteration(context, state);
|
|
4648
|
+
if (!prepared) break;
|
|
4649
|
+
const currentPageFitState = tryHandleCurrentPageFit(context, state, prepared.actualRemainingEndIdx, result);
|
|
4650
|
+
if (currentPageFitState) {
|
|
4651
|
+
state = currentPageFitState;
|
|
4274
4652
|
continue;
|
|
4275
4653
|
}
|
|
4276
|
-
if (
|
|
4277
|
-
|
|
4654
|
+
if (tryFinalizeIteration(context, state, prepared, result)) break;
|
|
4655
|
+
const nextState = applyBreakpointToIteration(context, state, prepared, iteration, result);
|
|
4656
|
+
if (nextState.cursorPos <= state.cursorPos) {
|
|
4657
|
+
context.logger?.error?.("[breakpoints] Iterative splitting stalled; aborting to avoid an infinite loop", {
|
|
4658
|
+
cursorPos: state.cursorPos,
|
|
4659
|
+
iteration,
|
|
4660
|
+
nextCursorPos: nextState.cursorPos
|
|
4661
|
+
});
|
|
4278
4662
|
break;
|
|
4279
4663
|
}
|
|
4280
|
-
|
|
4281
|
-
logger?.trace?.(`[breakpoints] iteration=${i}`, {
|
|
4282
|
-
currentFromIdx,
|
|
4283
|
-
cursorPos,
|
|
4284
|
-
windowEndIdx,
|
|
4285
|
-
windowEndPosition
|
|
4286
|
-
});
|
|
4287
|
-
const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
|
|
4288
|
-
const breakOffset = ensureProgressingBreakOffset(found.breakOffset, remainingContent, cursorPos, maxContentLength, logger);
|
|
4289
|
-
lastBreakpoint = updateLastBreakpointFromFound(found, lastBreakpoint);
|
|
4290
|
-
const breakPos = cursorPos + breakOffset;
|
|
4291
|
-
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
|
|
4292
|
-
if (!pieceContent) {
|
|
4293
|
-
cursorPos = breakPos;
|
|
4294
|
-
isFirstPiece = false;
|
|
4295
|
-
continue;
|
|
4296
|
-
}
|
|
4297
|
-
const next = appendPieceAndAdvance(fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result, logger, found.contentLengthSplit);
|
|
4298
|
-
cursorPos = next.cursorPos;
|
|
4299
|
-
currentFromIdx = next.currentFromIdx;
|
|
4300
|
-
isFirstPiece = false;
|
|
4664
|
+
state = nextState;
|
|
4301
4665
|
}
|
|
4302
|
-
if (didHitMaxIterations) logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
|
|
4303
|
-
cursorPos,
|
|
4304
|
-
fullContentLength: fullContent.length,
|
|
4305
|
-
iterations: MAX_SAFE_ITERATIONS
|
|
4306
|
-
});
|
|
4307
4666
|
logger?.debug?.("[breakpoints] processOversizedSegment: Complete", { resultCount: result.length });
|
|
4308
4667
|
return result;
|
|
4309
4668
|
};
|
|
@@ -4377,6 +4736,120 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
|
|
|
4377
4736
|
return result;
|
|
4378
4737
|
};
|
|
4379
4738
|
//#endregion
|
|
4739
|
+
//#region src/dictionary/arabic-dictionary-rule.ts
|
|
4740
|
+
const uniqueCanonicalWords = (words) => {
|
|
4741
|
+
const seen = /* @__PURE__ */ new Set();
|
|
4742
|
+
const result = [];
|
|
4743
|
+
for (const word of words) {
|
|
4744
|
+
const normalized = normalizeArabicForComparison(word);
|
|
4745
|
+
if (!normalized || seen.has(normalized)) continue;
|
|
4746
|
+
seen.add(normalized);
|
|
4747
|
+
result.push(word);
|
|
4748
|
+
}
|
|
4749
|
+
return result;
|
|
4750
|
+
};
|
|
4751
|
+
const buildStopAlternation = (stopWords) => {
|
|
4752
|
+
const unique = uniqueCanonicalWords(stopWords);
|
|
4753
|
+
if (unique.length === 0) return "";
|
|
4754
|
+
return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
|
|
4755
|
+
};
|
|
4756
|
+
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
|
|
4757
|
+
if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
|
|
4758
|
+
const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
|
|
4759
|
+
return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
|
|
4760
|
+
};
|
|
4761
|
+
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
|
|
4762
|
+
const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
4763
|
+
const withCapture = `(?<${captureName}>${headwordBody})`;
|
|
4764
|
+
if (!allowParenthesized) return `${withCapture}${colon}`;
|
|
4765
|
+
return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
|
|
4766
|
+
};
|
|
4767
|
+
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
|
|
4768
|
+
if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
|
|
4769
|
+
if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
|
|
4770
|
+
if (!/^[A-Za-z_]\w*$/.test(captureName)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
|
|
4771
|
+
};
|
|
4772
|
+
const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
|
|
4773
|
+
validateDictionaryEntryOptions({
|
|
4774
|
+
captureName,
|
|
4775
|
+
maxLetters,
|
|
4776
|
+
minLetters
|
|
4777
|
+
});
|
|
4778
|
+
const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
|
|
4779
|
+
const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
|
|
4780
|
+
const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
|
|
4781
|
+
const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
|
|
4782
|
+
const stopAlternation = buildStopAlternation(stopWords);
|
|
4783
|
+
const lemmaBody = buildHeadwordBody({
|
|
4784
|
+
allowCommaSeparated,
|
|
4785
|
+
colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
|
|
4786
|
+
stopAlternation,
|
|
4787
|
+
stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
|
|
4788
|
+
unit: lemmaUnit
|
|
4789
|
+
});
|
|
4790
|
+
const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
|
|
4791
|
+
const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
|
|
4792
|
+
const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
|
|
4793
|
+
const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
|
|
4794
|
+
allowParenthesized,
|
|
4795
|
+
allowWhitespaceBeforeColon,
|
|
4796
|
+
captureName: prefixedCaptureName,
|
|
4797
|
+
headwordBody: lemmaBody
|
|
4798
|
+
});
|
|
4799
|
+
return {
|
|
4800
|
+
captureNames: [prefixedCaptureName],
|
|
4801
|
+
regex
|
|
4802
|
+
};
|
|
4803
|
+
};
|
|
4804
|
+
/**
|
|
4805
|
+
* Creates a reusable split rule for Arabic dictionary entries.
|
|
4806
|
+
*
|
|
4807
|
+
* The returned rule preserves authoring intent as a serializable
|
|
4808
|
+
* `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
|
|
4809
|
+
* regex string.
|
|
4810
|
+
*
|
|
4811
|
+
* @example
|
|
4812
|
+
* createArabicDictionaryEntryRule({
|
|
4813
|
+
* stopWords: ['وقيل', 'ويقال', 'قال'],
|
|
4814
|
+
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
4815
|
+
* })
|
|
4816
|
+
*
|
|
4817
|
+
* @example
|
|
4818
|
+
* createArabicDictionaryEntryRule({
|
|
4819
|
+
* allowParenthesized: true,
|
|
4820
|
+
* allowWhitespaceBeforeColon: true,
|
|
4821
|
+
* allowCommaSeparated: true,
|
|
4822
|
+
* stopWords: ['الليث', 'العجاج'],
|
|
4823
|
+
* })
|
|
4824
|
+
*/
|
|
4825
|
+
/**
|
|
4826
|
+
* @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
|
|
4827
|
+
* whole-book dictionary segmentation. Keep this helper for advanced single-rule
|
|
4828
|
+
* composition inside a broader `SplitRule[]` pipeline.
|
|
4829
|
+
*/
|
|
4830
|
+
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
|
|
4831
|
+
validateDictionaryEntryOptions({
|
|
4832
|
+
captureName,
|
|
4833
|
+
maxLetters,
|
|
4834
|
+
minLetters
|
|
4835
|
+
});
|
|
4836
|
+
return {
|
|
4837
|
+
dictionaryEntry: {
|
|
4838
|
+
allowCommaSeparated,
|
|
4839
|
+
allowParenthesized,
|
|
4840
|
+
allowWhitespaceBeforeColon,
|
|
4841
|
+
captureName,
|
|
4842
|
+
maxLetters,
|
|
4843
|
+
midLineSubentries,
|
|
4844
|
+
minLetters,
|
|
4845
|
+
stopWords: uniqueCanonicalWords(stopWords)
|
|
4846
|
+
},
|
|
4847
|
+
meta,
|
|
4848
|
+
pageStartPrevWordStoplist,
|
|
4849
|
+
samePagePrevWordStoplist
|
|
4850
|
+
};
|
|
4851
|
+
};
|
|
4852
|
+
//#endregion
|
|
4380
4853
|
//#region src/segmentation/rule-regex.ts
|
|
4381
4854
|
/**
|
|
4382
4855
|
* Checks if a regex pattern contains standard (anonymous) capturing groups.
|
|
@@ -5319,425 +5792,566 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner, hasDict
|
|
|
5319
5792
|
const createSegmentsFromSplitPoints = () => {
|
|
5320
5793
|
const result = [];
|
|
5321
5794
|
for (let i = 0; i < splitPoints.length; i++) {
|
|
5322
|
-
const sp = splitPoints[i];
|
|
5323
|
-
const end = splitPoints[i + 1]?.index ?? content.length;
|
|
5324
|
-
const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
|
|
5325
|
-
if (s) result.push(s);
|
|
5326
|
-
}
|
|
5327
|
-
return result;
|
|
5328
|
-
};
|
|
5329
|
-
const segments = [];
|
|
5330
|
-
if (!splitPoints.length) {
|
|
5331
|
-
const firstId = pageMap.getId(0);
|
|
5332
|
-
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
5333
|
-
const s = createSegment(0, content.length);
|
|
5334
|
-
if (s) segments.push(s);
|
|
5335
|
-
}
|
|
5336
|
-
return segments;
|
|
5337
|
-
}
|
|
5338
|
-
if (splitPoints[0].index > 0) {
|
|
5339
|
-
const firstId = pageMap.getId(0);
|
|
5340
|
-
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
5341
|
-
const s = createSegment(0, splitPoints[0].index);
|
|
5342
|
-
if (s) segments.push(s);
|
|
5343
|
-
}
|
|
5344
|
-
}
|
|
5345
|
-
return [...segments, ...createSegmentsFromSplitPoints()];
|
|
5346
|
-
};
|
|
5347
|
-
//#endregion
|
|
5348
|
-
//#region src/validation/validate-segments.ts
|
|
5349
|
-
/**
|
|
5350
|
-
* Creates a short preview string of text content for error reporting.
|
|
5351
|
-
* Truncates content exceeding PREVIEW_LIMIT.
|
|
5352
|
-
*/
|
|
5353
|
-
const buildPreview = (text) => {
|
|
5354
|
-
const normalized = text.replace(/\s+/g, " ").trim();
|
|
5355
|
-
if (normalized.length <= 140) return normalized;
|
|
5356
|
-
return `${normalized.slice(0, 140)}...`;
|
|
5357
|
-
};
|
|
5358
|
-
/**
|
|
5359
|
-
* Creates a lightweight snapshot of a segment for inclusion in validation checks.
|
|
5360
|
-
*/
|
|
5361
|
-
const buildSegmentSnapshot = (segment) => ({
|
|
5362
|
-
contentPreview: buildPreview(segment.content),
|
|
5363
|
-
from: segment.from,
|
|
5364
|
-
to: segment.to
|
|
5365
|
-
});
|
|
5366
|
-
/**
|
|
5367
|
-
* Normalizes page content by applying preprocessing transforms and standardizing line endings.
|
|
5368
|
-
*/
|
|
5369
|
-
const normalizePages = (pages, options) => {
|
|
5370
|
-
const transforms = options.preprocess ?? [];
|
|
5371
|
-
return pages.map((page) => {
|
|
5372
|
-
return {
|
|
5373
|
-
content: normalizeLineEndings(transforms.length ? applyPreprocessToPage(page.content, page.id, transforms) : page.content),
|
|
5374
|
-
id: page.id
|
|
5375
|
-
};
|
|
5376
|
-
});
|
|
5377
|
-
};
|
|
5378
|
-
/**
|
|
5379
|
-
* Joins all page content into a single string with boundary tracking.
|
|
5380
|
-
* Returns the joined string and a list of boundary mappings (start/end indices for each page).
|
|
5381
|
-
*/
|
|
5382
|
-
const buildJoinedContent = (pages, joiner) => {
|
|
5383
|
-
const boundaries = [];
|
|
5384
|
-
const joined = pages.map((p) => p.content).join(joiner);
|
|
5385
|
-
let offset = 0;
|
|
5386
|
-
for (let i = 0; i < pages.length; i++) {
|
|
5387
|
-
const content = pages[i].content;
|
|
5388
|
-
const start = offset;
|
|
5389
|
-
const end = start + content.length;
|
|
5390
|
-
boundaries.push({
|
|
5391
|
-
end,
|
|
5392
|
-
id: pages[i].id,
|
|
5393
|
-
start
|
|
5394
|
-
});
|
|
5395
|
-
offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
|
|
5795
|
+
const sp = splitPoints[i];
|
|
5796
|
+
const end = splitPoints[i + 1]?.index ?? content.length;
|
|
5797
|
+
const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
|
|
5798
|
+
if (s) result.push(s);
|
|
5799
|
+
}
|
|
5800
|
+
return result;
|
|
5801
|
+
};
|
|
5802
|
+
const segments = [];
|
|
5803
|
+
if (!splitPoints.length) {
|
|
5804
|
+
const firstId = pageMap.getId(0);
|
|
5805
|
+
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
5806
|
+
const s = createSegment(0, content.length);
|
|
5807
|
+
if (s) segments.push(s);
|
|
5808
|
+
}
|
|
5809
|
+
return segments;
|
|
5810
|
+
}
|
|
5811
|
+
if (splitPoints[0].index > 0) {
|
|
5812
|
+
const firstId = pageMap.getId(0);
|
|
5813
|
+
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
5814
|
+
const s = createSegment(0, splitPoints[0].index);
|
|
5815
|
+
if (s) segments.push(s);
|
|
5816
|
+
}
|
|
5396
5817
|
}
|
|
5818
|
+
return [...segments, ...createSegmentsFromSplitPoints()];
|
|
5819
|
+
};
|
|
5820
|
+
//#endregion
|
|
5821
|
+
//#region src/analysis/segmentation-advisor.ts
|
|
5822
|
+
const ZERO_WIDTH_REGEX = /[\u061C\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/gu;
|
|
5823
|
+
const ELLIPSIS_REGEX = /\.{3,}/g;
|
|
5824
|
+
const TRAILING_WAW_REGEX = /\sو\s+(?=[\p{Script=Arabic}])/gu;
|
|
5825
|
+
const STRUCTURAL_META_BY_TOKEN = {
|
|
5826
|
+
bab: "chapter",
|
|
5827
|
+
basmalah: "basmalah",
|
|
5828
|
+
fasl: "section",
|
|
5829
|
+
kitab: "book"
|
|
5830
|
+
};
|
|
5831
|
+
const NUMBER_TOKENS = [
|
|
5832
|
+
"numbered",
|
|
5833
|
+
"raqms",
|
|
5834
|
+
"raqm",
|
|
5835
|
+
"nums",
|
|
5836
|
+
"num"
|
|
5837
|
+
];
|
|
5838
|
+
const DEFAULT_BREAKPOINTS = [{
|
|
5839
|
+
pattern: "{{tarqim}}\\s*",
|
|
5840
|
+
split: "after"
|
|
5841
|
+
}, ""];
|
|
5842
|
+
const resolveOptions = (pages, options = {}) => {
|
|
5843
|
+
const minCount = pages.length >= 25 ? 3 : 2;
|
|
5397
5844
|
return {
|
|
5398
|
-
|
|
5399
|
-
|
|
5845
|
+
maxRules: options.maxRules ?? 4,
|
|
5846
|
+
minLineStartCount: options.minLineStartCount ?? minCount,
|
|
5847
|
+
minRepeatingCount: options.minRepeatingCount ?? minCount,
|
|
5848
|
+
sampleSegments: options.sampleSegments ?? 5,
|
|
5849
|
+
topLineStarts: options.topLineStarts ?? 12,
|
|
5850
|
+
topRepeatingSequences: options.topRepeatingSequences ?? 8
|
|
5400
5851
|
};
|
|
5401
5852
|
};
|
|
5402
|
-
|
|
5403
|
-
|
|
5404
|
-
|
|
5405
|
-
|
|
5406
|
-
|
|
5407
|
-
|
|
5408
|
-
|
|
5409
|
-
|
|
5410
|
-
|
|
5411
|
-
|
|
5412
|
-
|
|
5413
|
-
|
|
5414
|
-
|
|
5853
|
+
const countMatches = (text, regex) => text.match(regex)?.length ?? 0;
|
|
5854
|
+
const getDetections = (pages) => pages.reduce((acc, page) => ({
|
|
5855
|
+
ellipsisCount: acc.ellipsisCount + countMatches(page.content, ELLIPSIS_REGEX),
|
|
5856
|
+
trailingWawCount: acc.trailingWawCount + countMatches(page.content, TRAILING_WAW_REGEX),
|
|
5857
|
+
zeroWidthCount: acc.zeroWidthCount + countMatches(page.content, ZERO_WIDTH_REGEX)
|
|
5858
|
+
}), {
|
|
5859
|
+
ellipsisCount: 0,
|
|
5860
|
+
trailingWawCount: 0,
|
|
5861
|
+
zeroWidthCount: 0
|
|
5862
|
+
});
|
|
5863
|
+
const getPreprocessSuggestions = (detections) => {
|
|
5864
|
+
const suggestions = [];
|
|
5865
|
+
if (detections.zeroWidthCount > 0) suggestions.push({
|
|
5866
|
+
count: detections.zeroWidthCount,
|
|
5867
|
+
reason: "Invisible directional/zero-width marks can break anchors and token matching.",
|
|
5868
|
+
transform: "removeZeroWidth"
|
|
5869
|
+
});
|
|
5870
|
+
if (detections.ellipsisCount > 0) suggestions.push({
|
|
5871
|
+
count: detections.ellipsisCount,
|
|
5872
|
+
reason: "Repeated periods often cause noisy punctuation breakpoints.",
|
|
5873
|
+
transform: "condenseEllipsis"
|
|
5874
|
+
});
|
|
5875
|
+
if (detections.trailingWawCount > 0) suggestions.push({
|
|
5876
|
+
count: detections.trailingWawCount,
|
|
5877
|
+
reason: "Separated waw prefixes are a common digitization artifact in Arabic corpora.",
|
|
5878
|
+
transform: "fixTrailingWaw"
|
|
5879
|
+
});
|
|
5880
|
+
return suggestions;
|
|
5881
|
+
};
|
|
5882
|
+
const extractTokenNames = (pattern) => [...pattern.matchAll(/\{\{(\w+)(?::[^}]+)?\}\}/g)].map((match) => match[1]);
|
|
5883
|
+
const getStructuralMeta = (tokens) => {
|
|
5884
|
+
for (const token of tokens) if (token in STRUCTURAL_META_BY_TOKEN) return STRUCTURAL_META_BY_TOKEN[token];
|
|
5885
|
+
};
|
|
5886
|
+
const applyFirstTokenReplacement = (pattern, token, replacement) => {
|
|
5887
|
+
const target = `{{${token}}}`;
|
|
5888
|
+
return pattern.includes(target) ? pattern.replace(target, replacement) : pattern;
|
|
5889
|
+
};
|
|
5890
|
+
const addNamedCaptures = (pattern) => {
|
|
5891
|
+
let next = pattern;
|
|
5892
|
+
if (next.includes("{{numbered}}")) next = next.replace("{{numbered}}", "{{raqms:num}} {{dash}} ");
|
|
5893
|
+
else for (const token of NUMBER_TOKENS) {
|
|
5894
|
+
const replacement = token === "num" ? "{{num:num}}" : `{{${token}:num}}`;
|
|
5895
|
+
const replaced = applyFirstTokenReplacement(next, token, replacement);
|
|
5896
|
+
if (replaced !== next) {
|
|
5897
|
+
next = replaced;
|
|
5898
|
+
break;
|
|
5899
|
+
}
|
|
5415
5900
|
}
|
|
5416
|
-
if (
|
|
5417
|
-
|
|
5418
|
-
|
|
5419
|
-
|
|
5420
|
-
|
|
5421
|
-
|
|
5422
|
-
|
|
5423
|
-
|
|
5424
|
-
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
const
|
|
5428
|
-
|
|
5429
|
-
|
|
5430
|
-
|
|
5431
|
-
|
|
5432
|
-
|
|
5433
|
-
|
|
5434
|
-
|
|
5435
|
-
|
|
5436
|
-
|
|
5437
|
-
|
|
5438
|
-
|
|
5439
|
-
|
|
5440
|
-
|
|
5441
|
-
|
|
5442
|
-
|
|
5443
|
-
|
|
5444
|
-
|
|
5445
|
-
|
|
5446
|
-
|
|
5447
|
-
|
|
5448
|
-
|
|
5449
|
-
|
|
5450
|
-
|
|
5451
|
-
|
|
5452
|
-
|
|
5453
|
-
|
|
5454
|
-
|
|
5901
|
+
if (next.includes("{{rumuz}}")) next = next.replace("{{rumuz}}", "{{rumuz:source}}");
|
|
5902
|
+
return next;
|
|
5903
|
+
};
|
|
5904
|
+
const findTokenIndex = (pattern, token) => {
|
|
5905
|
+
const plainIndex = pattern.indexOf(`{{${token}}}`);
|
|
5906
|
+
const namedIndex = pattern.indexOf(`{{${token}:`);
|
|
5907
|
+
if (plainIndex === -1) return namedIndex;
|
|
5908
|
+
if (namedIndex === -1) return plainIndex;
|
|
5909
|
+
return Math.min(plainIndex, namedIndex);
|
|
5910
|
+
};
|
|
5911
|
+
const trimNumberBoundaryPattern = (pattern) => {
|
|
5912
|
+
const stopTokens = [
|
|
5913
|
+
"naql",
|
|
5914
|
+
"bab",
|
|
5915
|
+
"basmalah",
|
|
5916
|
+
"fasl",
|
|
5917
|
+
"kitab"
|
|
5918
|
+
];
|
|
5919
|
+
let end = pattern.length;
|
|
5920
|
+
for (const token of stopTokens) {
|
|
5921
|
+
const index = findTokenIndex(pattern, token);
|
|
5922
|
+
if (index >= 0) end = Math.min(end, index);
|
|
5923
|
+
}
|
|
5924
|
+
return pattern.slice(0, end).trimEnd();
|
|
5925
|
+
};
|
|
5926
|
+
const getRuleMeta = (tokens) => {
|
|
5927
|
+
const structural = getStructuralMeta(tokens);
|
|
5928
|
+
if (structural) return { type: structural };
|
|
5929
|
+
if (tokens.includes("naql") || tokens.some((token) => NUMBER_TOKENS.includes(token))) return { type: "entry" };
|
|
5930
|
+
};
|
|
5931
|
+
const getSuggestionConfidence = (tokens, shape) => {
|
|
5932
|
+
if (getStructuralMeta(tokens)) return "high";
|
|
5933
|
+
if (tokens.some((token) => NUMBER_TOKENS.includes(token)) || tokens.includes("naql")) return "high";
|
|
5934
|
+
if (shape === "sequence" && tokens.includes("rumuz")) return "medium";
|
|
5935
|
+
return tokens.length > 0 ? "medium" : "low";
|
|
5936
|
+
};
|
|
5937
|
+
const getSuggestionReason = (tokens, source) => {
|
|
5938
|
+
const structural = getStructuralMeta(tokens);
|
|
5939
|
+
if (structural) return `Repeated structural marker suggests ${structural}-style boundaries.`;
|
|
5940
|
+
if (tokens.some((token) => NUMBER_TOKENS.includes(token))) return "Repeated numbering marker is a strong candidate for entry boundaries.";
|
|
5941
|
+
if (tokens.includes("naql")) return source === "line-start" ? "Repeated transmission phrase appears at line starts and can anchor segments." : "Repeated transmission phrase inside prose is a good candidate for template-based splitting.";
|
|
5942
|
+
return source === "line-start" ? "Frequent line-start signature is worth trying as a structural boundary." : "Frequent tokenized sequence may help split continuous prose.";
|
|
5943
|
+
};
|
|
5944
|
+
const createRule = (pattern, tokens, shape) => {
|
|
5945
|
+
const fuzzy = shouldDefaultToFuzzy(pattern);
|
|
5946
|
+
const meta = getRuleMeta(tokens);
|
|
5947
|
+
if (shape === "line-start") {
|
|
5948
|
+
if (getStructuralMeta(tokens)) return meta ? {
|
|
5949
|
+
fuzzy,
|
|
5950
|
+
lineStartsWith: [pattern],
|
|
5951
|
+
meta,
|
|
5952
|
+
split: "at"
|
|
5953
|
+
} : {
|
|
5954
|
+
fuzzy,
|
|
5955
|
+
lineStartsWith: [pattern],
|
|
5956
|
+
split: "at"
|
|
5455
5957
|
};
|
|
5456
|
-
|
|
5457
|
-
const
|
|
5458
|
-
|
|
5459
|
-
|
|
5460
|
-
|
|
5461
|
-
|
|
5462
|
-
|
|
5463
|
-
|
|
5464
|
-
|
|
5465
|
-
|
|
5466
|
-
|
|
5467
|
-
} : void 0,
|
|
5468
|
-
severity: "error",
|
|
5469
|
-
type
|
|
5958
|
+
if (tokens.some((token) => NUMBER_TOKENS.includes(token))) {
|
|
5959
|
+
const captured = addNamedCaptures(trimNumberBoundaryPattern(pattern));
|
|
5960
|
+
return meta ? {
|
|
5961
|
+
fuzzy,
|
|
5962
|
+
lineStartsAfter: [captured],
|
|
5963
|
+
meta,
|
|
5964
|
+
split: "at"
|
|
5965
|
+
} : {
|
|
5966
|
+
fuzzy,
|
|
5967
|
+
lineStartsAfter: [captured],
|
|
5968
|
+
split: "at"
|
|
5470
5969
|
};
|
|
5471
5970
|
}
|
|
5472
|
-
|
|
5473
|
-
|
|
5474
|
-
|
|
5475
|
-
|
|
5476
|
-
|
|
5477
|
-
|
|
5478
|
-
|
|
5479
|
-
|
|
5480
|
-
|
|
5481
|
-
severity: "error",
|
|
5482
|
-
type
|
|
5971
|
+
return meta ? {
|
|
5972
|
+
fuzzy,
|
|
5973
|
+
lineStartsWith: [pattern],
|
|
5974
|
+
meta,
|
|
5975
|
+
split: "at"
|
|
5976
|
+
} : {
|
|
5977
|
+
fuzzy,
|
|
5978
|
+
lineStartsWith: [pattern],
|
|
5979
|
+
split: "at"
|
|
5483
5980
|
};
|
|
5484
5981
|
}
|
|
5982
|
+
const captured = addNamedCaptures(pattern);
|
|
5983
|
+
return meta ? {
|
|
5984
|
+
fuzzy,
|
|
5985
|
+
meta,
|
|
5986
|
+
split: "at",
|
|
5987
|
+
template: captured
|
|
5988
|
+
} : {
|
|
5989
|
+
fuzzy,
|
|
5990
|
+
split: "at",
|
|
5991
|
+
template: captured
|
|
5992
|
+
};
|
|
5485
5993
|
};
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
|
|
5499
|
-
|
|
5500
|
-
idx = joined.indexOf(content, idx + 1);
|
|
5501
|
-
if (idx >= searchEnd) break;
|
|
5502
|
-
count++;
|
|
5503
|
-
}
|
|
5504
|
-
return matches;
|
|
5994
|
+
const createLineStartSuggestion = (pattern) => {
|
|
5995
|
+
const tokens = extractTokenNames(pattern.pattern);
|
|
5996
|
+
return {
|
|
5997
|
+
confidence: getSuggestionConfidence(tokens, "line-start"),
|
|
5998
|
+
count: pattern.count,
|
|
5999
|
+
example: {
|
|
6000
|
+
pageId: pattern.examples[0]?.pageId ?? -1,
|
|
6001
|
+
text: pattern.examples[0]?.line ?? ""
|
|
6002
|
+
},
|
|
6003
|
+
pattern: pattern.pattern,
|
|
6004
|
+
reason: getSuggestionReason(tokens, "line-start"),
|
|
6005
|
+
rule: createRule(pattern.pattern, tokens, "line-start"),
|
|
6006
|
+
source: "line-start"
|
|
6007
|
+
};
|
|
5505
6008
|
};
|
|
5506
|
-
|
|
5507
|
-
|
|
5508
|
-
|
|
5509
|
-
|
|
5510
|
-
|
|
5511
|
-
|
|
5512
|
-
|
|
5513
|
-
|
|
5514
|
-
|
|
5515
|
-
|
|
5516
|
-
|
|
5517
|
-
|
|
5518
|
-
|
|
5519
|
-
|
|
5520
|
-
|
|
5521
|
-
|
|
5522
|
-
|
|
5523
|
-
|
|
5524
|
-
|
|
6009
|
+
const createRepeatingSuggestion = (pattern) => {
|
|
6010
|
+
const tokens = extractTokenNames(pattern.pattern);
|
|
6011
|
+
return {
|
|
6012
|
+
confidence: getSuggestionConfidence(tokens, "sequence"),
|
|
6013
|
+
count: pattern.count,
|
|
6014
|
+
example: {
|
|
6015
|
+
pageId: pattern.examples[0]?.pageId ?? -1,
|
|
6016
|
+
text: pattern.examples[0]?.text ?? ""
|
|
6017
|
+
},
|
|
6018
|
+
pattern: pattern.pattern,
|
|
6019
|
+
reason: getSuggestionReason(tokens, "repeating-sequence"),
|
|
6020
|
+
rule: createRule(pattern.pattern, tokens, "sequence"),
|
|
6021
|
+
source: "repeating-sequence"
|
|
6022
|
+
};
|
|
6023
|
+
};
|
|
6024
|
+
/**
 * Rank a confidence label numerically so suggestions can be sorted.
 * "high" → 3, "medium" → 2, any other value → 1.
 */
const confidenceScore = (confidence) => {
  switch (confidence) {
    case "high":
      return 3;
    case "medium":
      return 2;
    default:
      return 1;
  }
};
|
|
6025
|
+
/**
 * Rank a suggestion source for a given assessment mode.
 *
 * - "structured": line-start suggestions score 3, everything else 1.
 * - "continuous": repeating-sequence suggestions score 3, everything else 1.
 * - any other mode: line-start scores 3, everything else 2.
 */
const sourceScore = (mode, source) => {
  const isLineStart = source === "line-start";
  switch (mode) {
    case "structured":
      return isLineStart ? 3 : 1;
    case "continuous":
      return source === "repeating-sequence" ? 3 : 1;
    default:
      return isLineStart ? 3 : 2;
  }
};
|
|
6030
|
+
/**
 * Sort comparator for suggestions. Orders by, in turn: source score for the
 * given mode (descending), confidence score (descending), occurrence count
 * (descending), and finally the pattern string via `localeCompare`.
 */
const compareSuggestions = (mode, left, right) => {
  // Truthiness checks (not `!== 0`) so a NaN difference falls through to the
  // next criterion.
  const bySource = sourceScore(mode, right.source) - sourceScore(mode, left.source);
  if (bySource) return bySource;
  const byConfidence = confidenceScore(right.confidence) - confidenceScore(left.confidence);
  if (byConfidence) return byConfidence;
  const byCount = right.count - left.count;
  if (byCount) return byCount;
  return left.pattern.localeCompare(right.pattern);
};
|
|
6031
|
+
/**
 * Drop suggestions whose rule serializes (via `JSON.stringify`) to the same
 * string as an earlier suggestion's rule. The first occurrence wins and the
 * input order is preserved.
 *
 * @param suggestions - Suggestions to filter
 * @returns New array containing the first suggestion for each distinct rule
 */
const dedupeSuggestions = (suggestions) => {
  // A Map iterates in insertion order, so first occurrences keep their order.
  const firstByRule = /* @__PURE__ */ new Map();
  for (const candidate of suggestions) {
    const ruleKey = JSON.stringify(candidate.rule);
    if (!firstByRule.has(ruleKey)) firstByRule.set(ruleKey, candidate);
  }
  return [...firstByRule.values()];
};
|
|
6042
|
+
/**
 * Classify the book as "structured", "continuous", or "mixed".
 *
 * - "structured": the top line-start count is at least 2 and at least the top
 *   sequence count, and the pages contain more lines than there are pages.
 * - "continuous": the top sequence count beats the top line-start count and
 *   no page contains a line break.
 * - "mixed": everything else.
 *
 * @param pages - Pages with a string `content`
 * @param lineStarts - Line-start analysis results (first entry's count is used)
 * @param repeatingSequences - Repeating-sequence results (first entry's count is used)
 * @returns `{ mode, reason }`
 */
const chooseAssessment = (pages, lineStarts, repeatingSequences) => {
  let totalLines = 0;
  pages.forEach((page) => {
    totalLines += page.content.split("\n").length;
  });
  // Every page contributes at least one line, so "more lines than pages"
  // means at least one page has a line break.
  const hasDenseLineBreaks = totalLines > pages.length;
  const topLine = lineStarts[0]?.count ?? 0;
  const topSequence = repeatingSequences[0]?.count ?? 0;

  if (hasDenseLineBreaks) {
    if (topLine >= Math.max(2, topSequence)) {
      return {
        mode: "structured",
        reason: "Frequent repeated line-start markers dominate and the text has strong line structure."
      };
    }
  } else if (topSequence > topLine) {
    return {
      mode: "continuous",
      reason: "Tokenized prose sequences are stronger than line-start signals and the pages are mostly continuous text."
    };
  }
  return {
    mode: "mixed",
    reason: "The book shows both structural line markers and inline recurring sequences."
  };
};
|
|
6060
|
+
/**
 * Assemble recommended segmentation options from ranked suggestions.
 *
 * Suggestions from the mode's primary source ("repeating-sequence" for
 * "continuous", otherwise "line-start") are preferred; if none exist, all
 * suggestions are considered. At most `maxRules` rules are kept and then
 * passed through `optimizeRules`.
 *
 * @param mode - Assessment mode
 * @param suggestions - Ranked rule suggestions
 * @param maxRules - Maximum number of rules taken before optimization
 * @param preprocess - Preprocess transforms; included only when non-empty
 * @returns `{ optimization, options }`
 */
const getRecommendedOptions = (mode, suggestions, maxRules, preprocess) => {
  const primarySource = mode === "continuous" ? "repeating-sequence" : "line-start";
  const fromPrimary = suggestions.filter(({ source }) => source === primarySource);
  const pool = fromPrimary.length > 0 ? fromPrimary : suggestions;
  const selectedRules = pool.slice(0, maxRules).map(({ rule }) => rule);
  const { mergedCount, rules } = optimizeRules(selectedRules);

  // Line-start driven recommendations additionally set the page joiner.
  const options = primarySource === "line-start" ? {
    pageJoiner: "newline",
    rules
  } : { rules };

  return {
    optimization: {
      mergedCount,
      optimizedRuleCount: rules.length,
      originalRuleCount: selectedRules.length
    },
    options: preprocess.length > 0 ? {
      ...options,
      preprocess
    } : options
  };
};
|
|
6081
|
+
/**
 * Run the recommended options against the pages and summarize the result.
 *
 * Returns only `{ segmentSamples: [] }` when there are no rules to evaluate
 * or when segmentation/validation throws (evaluation is best-effort).
 *
 * @param pages - Pages to segment
 * @param options - Segmentation options whose `rules` are evaluated
 * @param sampleSegments - Number of leading segments to return as samples
 * @returns `{ evaluation, segmentSamples }`, or `{ segmentSamples: [] }`
 */
const evaluateRecommendation = (pages, options, sampleSegments) => {
  if ((options.rules?.length ?? 0) === 0) return { segmentSamples: [] };
  try {
    const segments = segmentPages(pages, options);
    const validation = validateSegments(pages, options, segments);
    // Accumulate in one pass. Spreading every segment length into
    // Math.max(...) can exceed the engine's argument limit on large books;
    // the resulting RangeError would be swallowed by the catch below and
    // silently discard the whole evaluation.
    let totalLength = 0;
    let maxSegmentLength = 0;
    let multiPageSegments = 0;
    for (const segment of segments) {
      const length = segment.content.length;
      totalLength += length;
      if (length > maxSegmentLength) maxSegmentLength = length;
      if (segment.to !== void 0 && segment.to !== segment.from) multiPageSegments++;
    }
    return {
      evaluation: {
        averageSegmentLength: segments.length === 0 ? 0 : totalLength / segments.length,
        maxSegmentLength,
        multiPageSegments,
        segmentCount: segments.length,
        validation
      },
      segmentSamples: segments.slice(0, sampleSegments)
    };
  } catch {
    return { segmentSamples: [] };
  }
};
|
|
6102
|
+
/**
 * Rewrite a single-pattern `lineStartsAfter` rule as a `template` rule
 * anchored with `^`.
 *
 * Only `split` and (when truthy) `meta` are carried over from the source rule.
 *
 * @param rule - Candidate rule
 * @returns The template rule, or null when the rule does not have exactly one
 *   `lineStartsAfter` pattern
 */
const toTemplateFallbackRule = (rule) => {
  const patterns = "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0;
  if (!Array.isArray(patterns) || patterns.length !== 1) return null;
  const converted = {
    split: rule.split,
    template: `^${patterns[0]}`
  };
  return rule.meta ? {
    meta: rule.meta,
    ...converted
  } : converted;
};
|
|
6113
|
+
/**
 * Build alternative options in which every rule is replaced by its template
 * fallback (see `toTemplateFallbackRule`).
 *
 * The fallback is all-or-nothing: if there are no rules, or any rule cannot
 * be converted, null is returned. The result always uses the "newline" page
 * joiner and carries `preprocess` over when it is truthy.
 *
 * @param options - Recommended options
 * @returns Fallback options or null
 */
const getTemplateFallbackOptions = (options) => {
  const rules = options.rules;
  if ((rules?.length ?? 0) === 0) return null;
  const fallbackRules = rules.map(toTemplateFallbackRule).filter((rule) => rule !== null);
  // A length mismatch means at least one rule had no template equivalent.
  if (fallbackRules.length !== rules.length) return null;
  return {
    pageJoiner: "newline",
    ...(options.preprocess ? { preprocess: options.preprocess } : {}),
    rules: fallbackRules
  };
};
|
|
6126
|
+
const shouldUseTemplateFallback = (primary, fallback) => {
|
|
6127
|
+
if (!fallback) return false;
|
|
6128
|
+
if (!primary) return true;
|
|
6129
|
+
return fallback.segmentCount > primary.segmentCount && fallback.validation.summary.issues <= primary.validation.summary.issues;
|
|
6130
|
+
};
|
|
6131
|
+
/**
 * Suggest a breakpoint configuration when segments are likely to grow large.
 *
 * A single suggestion is returned when any of these holds: the evaluation
 * reports multi-page segments, its longest segment exceeds 4000 characters,
 * or the average page length exceeds 2500 characters. Otherwise the result
 * is an empty array.
 *
 * @param pages - Pages with a string `content`
 * @param evaluation - Evaluation of the final options (may be undefined)
 */
const getBreakpointSuggestions = (pages, evaluation) => {
  const totalChars = pages.reduce((sum, page) => sum + page.content.length, 0);
  const averagePageLength = pages.length === 0 ? 0 : totalChars / pages.length;
  const spansPages = (evaluation?.multiPageSegments ?? 0) > 0;
  const hasOversizedSegment = (evaluation?.maxSegmentLength ?? 0) > 4e3;
  const hasLongPages = averagePageLength > 2500;
  if (!spansPages && !hasOversizedSegment && !hasLongPages) return [];
  return [{
    breakpoints: DEFAULT_BREAKPOINTS,
    maxPages: 1,
    prefer: "longer",
    reason: "Some segments are likely to grow large enough that sentence punctuation plus page-boundary fallback is worth testing."
  }];
};
|
|
5554
6141
|
/**
|
|
5555
|
-
*
|
|
5556
|
-
*
|
|
6142
|
+
* Generate a machine-readable draft segmentation report for AI agents.
|
|
6143
|
+
*
|
|
6144
|
+
* This helper is intentionally deterministic: it inspects pages, drafts
|
|
6145
|
+
* candidate rules, validates them, and evaluates its own recommendation.
|
|
5557
6146
|
*/
|
|
5558
|
-
const
|
|
5559
|
-
const
|
|
5560
|
-
|
|
5561
|
-
const
|
|
5562
|
-
const
|
|
5563
|
-
const
|
|
5564
|
-
|
|
5565
|
-
|
|
5566
|
-
|
|
5567
|
-
|
|
5568
|
-
|
|
5569
|
-
|
|
5570
|
-
|
|
5571
|
-
|
|
5572
|
-
|
|
6147
|
+
/**
 * Produce a draft segmentation report for the given pages.
 *
 * Steps, in order: resolve options, collect preprocess detections and
 * suggestions, analyze line starts and repeating sequences, assess the book's
 * mode, build/dedupe/rank rule suggestions, derive recommended options,
 * evaluate them (and a template-based fallback when one can be built), then
 * validate the rules of whichever option set was chosen.
 *
 * @param pages - Pages to analyze
 * @param options - Optional tuning knobs, resolved via `resolveOptions`
 * @returns Report with assessment, suggestions, recommended options, and evaluation
 */
const suggestSegmentationOptions = (pages, options = {}) => {
  const resolved = resolveOptions(pages, options);

  const detections = getDetections(pages);
  const preprocessSuggestions = getPreprocessSuggestions(detections);
  const preprocess = preprocessSuggestions.map(({ transform }) => transform);

  const lineStarts = analyzeCommonLineStarts(pages, {
    minCount: resolved.minLineStartCount,
    sortBy: "count",
    topK: resolved.topLineStarts
  });
  const repeatingSequences = analyzeRepeatingSequences(pages, {
    maxElements: 3,
    minCount: resolved.minRepeatingCount,
    minElements: 1,
    topK: resolved.topRepeatingSequences
  });
  const assessment = chooseAssessment(pages, lineStarts, repeatingSequences);

  // Dedupe happens before ranking, so on a duplicate rule the line-start
  // suggestion (listed first) is the one that survives.
  const candidates = [
    ...lineStarts.map(createLineStartSuggestion),
    ...repeatingSequences.map(createRepeatingSuggestion)
  ];
  const ruleSuggestions = dedupeSuggestions(candidates);
  ruleSuggestions.sort((left, right) => compareSuggestions(assessment.mode, left, right));

  const recommendation = getRecommendedOptions(assessment.mode, ruleSuggestions, resolved.maxRules, preprocess);
  const recommendedOptions = recommendation.options;

  const primary = evaluateRecommendation(pages, recommendedOptions, resolved.sampleSegments);
  const fallbackOptions = getTemplateFallbackOptions(recommendedOptions);
  const fallback = fallbackOptions ? evaluateRecommendation(pages, fallbackOptions, resolved.sampleSegments) : void 0;

  const useFallback = shouldUseTemplateFallback(primary.evaluation, fallback?.evaluation) && Boolean(fallbackOptions);
  const finalOptions = useFallback ? fallbackOptions : recommendedOptions;
  const finalEvaluation = useFallback && fallback ? fallback : primary;

  const ruleValidation = validateRules(finalOptions.rules ?? []).filter((result) => result !== void 0);
  const ruleValidationErrors = formatValidationReport(ruleValidation);

  return {
    assessment,
    breakpointSuggestions: getBreakpointSuggestions(pages, finalEvaluation.evaluation),
    evaluation: finalEvaluation.evaluation,
    lineStarts,
    optimization: recommendation.optimization,
    preprocess: {
      detections,
      suggestions: preprocessSuggestions
    },
    recommendedOptions: finalOptions,
    repeatingSequences,
    ruleSuggestions,
    ruleValidation,
    ruleValidationErrors,
    segmentSamples: finalEvaluation.segmentSamples
  };
};
|
|
6193
|
+
//#endregion
|
|
6194
|
+
//#region src/detection.ts
|
|
6195
|
+
/**
 * Order in which tokens are tried during detection. More specific patterns
 * come first so they win over partial matches, e.g. 'raqms' precedes 'raqm'
 * so that "٣٤" is reported as 'raqms' rather than as a single digit.
 *
 * Tokens from TOKEN_PATTERNS that are missing from this list are appended
 * afterwards in alphabetical order.
 */
const TOKEN_PRIORITY_ORDER = [
  "basmalah", "kitab", "bab", "fasl",
  "naql", "rumuz",
  "numbered", "raqms", "raqm",
  "tarqim", "bullet", "dash", "harf"
];
|
|
5577
6216
|
/**
|
|
5578
|
-
*
|
|
5579
|
-
*
|
|
6217
|
+
* Gets the token detection priority order.
|
|
6218
|
+
* Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
|
|
5580
6219
|
*/
|
|
5581
|
-
const
|
|
5582
|
-
const
|
|
5583
|
-
const
|
|
5584
|
-
const
|
|
5585
|
-
|
|
5586
|
-
|
|
5587
|
-
|
|
5588
|
-
|
|
5589
|
-
|
|
5590
|
-
|
|
5591
|
-
|
|
5592
|
-
|
|
5593
|
-
|
|
5594
|
-
|
|
5595
|
-
|
|
5596
|
-
|
|
5597
|
-
return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
|
|
5598
|
-
actual: {
|
|
5599
|
-
from: segment.from,
|
|
5600
|
-
to: segment.to
|
|
5601
|
-
},
|
|
5602
|
-
evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
|
|
5603
|
-
expected: {
|
|
5604
|
-
from: actualFromId,
|
|
5605
|
-
to: actualToId
|
|
5606
|
-
},
|
|
5607
|
-
matchIndex: match.start
|
|
5608
|
-
}, pageMap)];
|
|
5609
|
-
}
|
|
5610
|
-
}
|
|
5611
|
-
return [createIssue("content_not_found", segment, segmentIndex, {
|
|
5612
|
-
evidence: `Segment content (${content.length} chars) not found in expected window.`,
|
|
5613
|
-
hint: "Check page boundary attribution in segmenter.ts."
|
|
5614
|
-
}, pageMap)];
|
|
5615
|
-
}
|
|
5616
|
-
const alignedMatches = rawMatches.filter((m) => m.start >= expectedBoundary.start && m.start <= expectedBoundary.end);
|
|
5617
|
-
if (alignedMatches.length > 0) {
|
|
5618
|
-
const primary = alignedMatches[0];
|
|
5619
|
-
return checkMaxPagesViolation(segment, segmentIndex, maxPages, primary.end, expectedBoundary.end, boundaries);
|
|
5620
|
-
}
|
|
5621
|
-
const primary = rawMatches[0];
|
|
5622
|
-
const actualFromId = findBoundaryIdForOffset(primary.start, boundaries);
|
|
5623
|
-
const actualToId = findBoundaryIdForOffset(primary.end, boundaries);
|
|
5624
|
-
return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
|
|
5625
|
-
actual: {
|
|
5626
|
-
from: segment.from,
|
|
5627
|
-
to: segment.to
|
|
5628
|
-
},
|
|
5629
|
-
evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
|
|
5630
|
-
expected: {
|
|
5631
|
-
from: actualFromId,
|
|
5632
|
-
to: actualToId
|
|
5633
|
-
},
|
|
5634
|
-
matchIndex: primary.start
|
|
5635
|
-
}, pageMap)];
|
|
6220
|
+
/**
 * Resolve the full token detection order.
 *
 * Available tokens listed in TOKEN_PRIORITY_ORDER come first, in that order;
 * every other available token follows, sorted with the default string sort.
 */
const getTokenPriority = () => {
  const available = getAvailableTokens();
  const availableSet = new Set(available);
  const prioritySet = new Set(TOKEN_PRIORITY_ORDER);
  const prioritized = TOKEN_PRIORITY_ORDER.filter((name) => availableSet.has(name));
  const leftovers = available.filter((name) => !prioritySet.has(name));
  leftovers.sort();
  return prioritized.concat(leftovers);
};
|
|
6226
|
+
/**
 * Check that the range [startIndex, endIndex) is not glued to neighbouring
 * Arabic-block characters.
 *
 * A side is acceptable when there is no neighbour, or the neighbour is
 * whitespace, or it is an opening bracket (left side) / a closing delimiter
 * (right side), or it lies outside U+0600–U+06FF.
 *
 * @param text - Full text being scanned
 * @param startIndex - Start of the candidate match (inclusive)
 * @param endIndex - End of the candidate match (exclusive)
 * @returns true when both sides are acceptable
 */
const isRumuzStandalone = (text, startIndex, endIndex) => {
  const prev = startIndex > 0 ? text[startIndex - 1] : "";
  const next = endIndex < text.length ? text[endIndex] : "";
  const inArabicBlock = (ch) => /[\u0600-\u06FF]/u.test(ch);
  const isSpace = (ch) => /\s/u.test(ch);

  // A neighbour blocks the match only if it exists, is in the Arabic block,
  // and is not one of the explicitly permitted separators for that side.
  const blockedOnLeft = Boolean(prev) && !isSpace(prev) && !/[([{]/u.test(prev) && inArabicBlock(prev);
  if (blockedOnLeft) return false;
  const blockedOnRight = Boolean(next) && !isSpace(next) && !/[::\-–—ـ،؛.?!؟)\]}]/u.test(next) && inArabicBlock(next);
  return !blockedOnRight;
};
|
|
5637
6237
|
/**
|
|
5638
|
-
*
|
|
6238
|
+
* Analyzes text and returns all detected token patterns with their positions.
|
|
6239
|
+
* Patterns are detected in priority order to avoid partial matches.
|
|
6240
|
+
*
|
|
6241
|
+
* @param text - The text to analyze for token patterns
|
|
6242
|
+
* @returns Array of detected patterns sorted by position
|
|
6243
|
+
*
|
|
6244
|
+
* @example
|
|
6245
|
+
* detectTokenPatterns("٣٤ - حدثنا")
|
|
6246
|
+
* // Returns: [
|
|
6247
|
+
* // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
|
|
6248
|
+
* // { token: 'dash', match: '-', index: 3, endIndex: 4 },
|
|
6249
|
+
* // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
|
|
6250
|
+
* // ]
|
|
5639
6251
|
*/
|
|
5640
|
-
const
|
|
5641
|
-
|
|
5642
|
-
|
|
5643
|
-
|
|
5644
|
-
|
|
5645
|
-
|
|
6252
|
+
/**
 * Scan text for every known token pattern and report the matches.
 *
 * Tokens are tried in priority order; a later match is dropped when it
 * overlaps a range already claimed by an earlier (higher-priority) match.
 * 'rumuz' matches must additionally pass the standalone check.
 *
 * @param text - The text to analyze for token patterns
 * @returns Detected matches (`token`, `match`, `index`, `endIndex`) sorted by position
 */
const detectTokenPatterns = (text) => {
  if (!text) return [];
  const results = [];
  // Accepted matches are never empty, so a half-open interval test suffices.
  const overlapsAccepted = (start, end) => results.some((hit) => start < hit.endIndex && end > hit.index);
  for (const tokenName of getTokenPriority()) {
    const pattern = TOKEN_PATTERNS[tokenName];
    if (!pattern) continue;
    let regex;
    try {
      regex = new RegExp(`(${pattern})`, "gu");
    } catch {
      // Best-effort: a token whose pattern does not compile is skipped.
      continue;
    }
    // matchAll advances past zero-length matches on its own; a manual
    // exec() loop would spin forever on a pattern that can match "".
    for (const match of text.matchAll(regex)) {
      const matched = match[0];
      if (matched.length === 0) continue;
      const startIndex = match.index;
      const endIndex = startIndex + matched.length;
      if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
      if (overlapsAccepted(startIndex, endIndex)) continue;
      results.push({
        endIndex,
        index: startIndex,
        match: matched,
        token: tokenName
      });
    }
  }
  return results.sort((a, b) => a.index - b.index);
};
|
|
5649
6282
|
/**
|
|
5650
|
-
*
|
|
5651
|
-
*
|
|
6283
|
+
* Generates a template pattern from text using detected tokens.
|
|
6284
|
+
* Replaces matched portions with {{token}} syntax.
|
|
6285
|
+
*
|
|
6286
|
+
* @param text - Original text
|
|
6287
|
+
* @param detected - Array of detected patterns from detectTokenPatterns
|
|
6288
|
+
* @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
|
|
6289
|
+
*
|
|
6290
|
+
* @example
|
|
6291
|
+
* const detected = detectTokenPatterns("٣٤ - ");
|
|
6292
|
+
* generateTemplateFromText("٣٤ - ", detected);
|
|
6293
|
+
* // Returns: "{{raqms}} {{dash}} "
|
|
5652
6294
|
*/
|
|
5653
|
-
const
|
|
5654
|
-
if (!
|
|
5655
|
-
|
|
5656
|
-
|
|
5657
|
-
const
|
|
5658
|
-
|
|
5659
|
-
const idx = joined.indexOf(segment.content, searchStart);
|
|
5660
|
-
if (idx !== -1 && idx < searchEnd) return checkMaxPagesViolation(segment, segmentIndex, maxPages, idx + segment.content.length - 1, expectedBoundary.end, boundaries);
|
|
5661
|
-
return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
|
|
6295
|
+
/**
 * Replace each detected range in the text with its `{{token}}` placeholder.
 *
 * Replacements are applied from the highest index down so that earlier
 * offsets stay valid while the string changes length.
 *
 * @param text - Original text
 * @param detected - Detections (`index`, `endIndex`, `token`)
 * @returns Template string; the text itself when it is empty or nothing was detected
 */
const generateTemplateFromText = (text, detected) => {
  if (!text || detected.length === 0) return text;
  const rightToLeft = [...detected].sort((a, b) => b.index - a.index);
  return rightToLeft.reduce(
    (template, { index, endIndex, token }) => `${template.slice(0, index)}{{${token}}}${template.slice(endIndex)}`,
    text
  );
};
|
|
5663
6302
|
/**
|
|
5664
|
-
*
|
|
6303
|
+
* Determines the best pattern type for auto-generated rules based on detected patterns.
|
|
6304
|
+
*
|
|
6305
|
+
* @param detected - Array of detected patterns
|
|
6306
|
+
* @returns Suggested pattern type and whether to use fuzzy matching
|
|
5665
6307
|
*/
|
|
5666
|
-
const
|
|
5667
|
-
|
|
5668
|
-
|
|
5669
|
-
|
|
5670
|
-
|
|
5671
|
-
|
|
5672
|
-
|
|
5673
|
-
|
|
5674
|
-
|
|
5675
|
-
|
|
5676
|
-
|
|
5677
|
-
|
|
5678
|
-
|
|
5679
|
-
|
|
5680
|
-
|
|
5681
|
-
|
|
5682
|
-
|
|
5683
|
-
|
|
5684
|
-
|
|
5685
|
-
|
|
5686
|
-
|
|
5687
|
-
|
|
5688
|
-
|
|
6308
|
+
/**
 * Pick a pattern type (and fuzzy flag) for an auto-generated rule.
 *
 * - Any of basmalah/kitab/bab/fasl present: fuzzy `lineStartsWith`, with
 *   `metaType` set to the first detected kitab/bab/fasl token, or "chapter".
 * - Otherwise any of raqms/raqm/numbered present: non-fuzzy
 *   `lineStartsAfter` with `metaType` "hadith".
 * - Otherwise: non-fuzzy `lineStartsAfter` with no `metaType`.
 *
 * @param detected - Array of detected patterns
 * @returns Suggested pattern type and whether to use fuzzy matching
 */
const suggestPatternConfig = (detected) => {
  const containsAnyOf = (names) => detected.some(({ token }) => names.includes(token));
  const isStructural = containsAnyOf(["basmalah", "kitab", "bab", "fasl"]);
  const isNumbered = containsAnyOf(["raqms", "raqm", "numbered"]);

  if (isStructural) {
    const heading = detected.find(({ token }) => ["kitab", "bab", "fasl"].includes(token));
    return {
      fuzzy: true,
      metaType: heading?.token || "chapter",
      patternType: "lineStartsWith"
    };
  }
  if (isNumbered) {
    return {
      fuzzy: false,
      metaType: "hadith",
      patternType: "lineStartsAfter"
    };
  }
  return {
    fuzzy: false,
    patternType: "lineStartsAfter"
  };
};
|
|
5690
6339
|
/**
|
|
5691
|
-
*
|
|
5692
|
-
* checks for:
|
|
5693
|
-
* - Page existence (invalid IDs)
|
|
5694
|
-
* - Content fidelity (content must exist in pages)
|
|
5695
|
-
* - Page attribution (from/to must match content location)
|
|
5696
|
-
* - Page constraints (maxPages violations)
|
|
6340
|
+
* Analyzes text and generates a complete suggested rule configuration.
|
|
5697
6341
|
*
|
|
5698
|
-
* @param
|
|
5699
|
-
* @
|
|
5700
|
-
* @param segments The output segments to validate
|
|
5701
|
-
* @param validationOptions Optional settings for validation behavior
|
|
5702
|
-
* @returns A detailed validation report
|
|
6342
|
+
* @param text - Highlighted text from the page
|
|
6343
|
+
* @returns Suggested rule configuration or null if no patterns detected
|
|
5703
6344
|
*/
|
|
5704
|
-
const
|
|
5705
|
-
const
|
|
5706
|
-
|
|
5707
|
-
const boundaryMap = /* @__PURE__ */ new Map();
|
|
5708
|
-
const pageMap = /* @__PURE__ */ new Map();
|
|
5709
|
-
for (const b of boundaries) boundaryMap.set(b.id, b);
|
|
5710
|
-
for (const p of normalizedPages) pageMap.set(p.id, p);
|
|
5711
|
-
const pageIds = new Set(normalizedPages.map((p) => p.id));
|
|
5712
|
-
const maxPages = options.maxPages;
|
|
5713
|
-
const issues = [];
|
|
5714
|
-
for (let i = 0; i < segments.length; i++) {
|
|
5715
|
-
const segment = segments[i];
|
|
5716
|
-
if (!pageIds.has(segment.from)) {
|
|
5717
|
-
issues.push(createIssue("page_not_found", segment, i));
|
|
5718
|
-
continue;
|
|
5719
|
-
}
|
|
5720
|
-
if (segment.to !== void 0 && !pageIds.has(segment.to)) issues.push(createIssue("page_not_found", segment, i, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
|
|
5721
|
-
const staticMaxPageIssue = checkStaticMaxPages(segment, i, maxPages);
|
|
5722
|
-
if (staticMaxPageIssue) issues.push(staticMaxPageIssue);
|
|
5723
|
-
const attributionIssues = getAttributionIssues(segment, i, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions);
|
|
5724
|
-
issues.push(...attributionIssues);
|
|
5725
|
-
}
|
|
5726
|
-
const errors = issues.filter((issue) => issue.severity === "error").length;
|
|
5727
|
-
const warnings = issues.filter((issue) => issue.severity === "warn").length;
|
|
6345
|
+
/**
 * Detect tokens in the text and bundle them with a generated template and a
 * suggested pattern configuration.
 *
 * @param text - Text to analyze
 * @returns `{ detected, template, ...config }`, or null when no token was detected
 */
const analyzeTextForRule = (text) => {
  const detected = detectTokenPatterns(text);
  if (detected.length === 0) return null;
  const template = generateTemplateFromText(text, detected);
  const config = suggestPatternConfig(detected);
  return {
    detected,
    template,
    ...config
  };
};
|
|
5740
6354
|
//#endregion
|
|
5741
|
-
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
|
|
6355
|
+
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
|
|
5742
6356
|
|
|
5743
6357
|
//# sourceMappingURL=index.mjs.map
|