flappa-doormal 2.20.0 → 2.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -710,7 +710,7 @@ const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch
710
710
  const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
711
711
  //#endregion
712
712
  //#region src/analysis/line-starts.ts
713
- const resolveOptions$1 = (options = {}) => ({
713
+ const resolveOptions$2 = (options = {}) => ({
714
714
  includeFirstWordFallback: options.includeFirstWordFallback ?? true,
715
715
  lineFilter: options.lineFilter,
716
716
  maxExamples: options.maxExamples ?? 1,
@@ -939,7 +939,7 @@ const processPage = (page, tokenPriority, opts, acc) => {
939
939
  * Analyze pages and return the most common line-start patterns (top K).
940
940
  */
941
941
  const analyzeCommonLineStarts = (pages, options = {}) => {
942
- const opts = resolveOptions$1(options);
942
+ const opts = resolveOptions$2(options);
943
943
  const tokenPriority = buildTokenPriority();
944
944
  const acc = /* @__PURE__ */ new Map();
945
945
  for (const page of pages) processPage(page, tokenPriority, opts, acc);
@@ -952,7 +952,7 @@ const analyzeCommonLineStarts = (pages, options = {}) => {
952
952
  };
953
953
  //#endregion
954
954
  //#region src/analysis/repeating-sequences.ts
955
- const resolveOptions = (options) => {
955
+ const resolveOptions$1 = (options) => {
956
956
  const minElements = Math.max(1, options?.minElements ?? 1);
957
957
  return {
958
958
  contextChars: options?.contextChars ?? 50,
@@ -1106,7 +1106,7 @@ const extractPageNgrams = (page, items, opts, stats) => {
1106
1106
  * use `analyzeCommonLineStarts()` instead.
1107
1107
  */
1108
1108
  const analyzeRepeatingSequences = (pages, options) => {
1109
- const opts = resolveOptions(options);
1109
+ const opts = resolveOptions$1(options);
1110
1110
  const stats = /* @__PURE__ */ new Map();
1111
1111
  for (const page of pages) {
1112
1112
  if (!page.content) continue;
@@ -1119,636 +1119,821 @@ const analyzeRepeatingSequences = (pages, options) => {
1119
1119
  }));
1120
1120
  };
1121
1121
  //#endregion
1122
- //#region src/detection.ts
1122
+ //#region src/types/rules.ts
1123
1123
  /**
1124
- * Token detection order - more specific patterns first to avoid partial matches.
1125
- * Example: 'raqms' before 'raqm' so "٣٤" matches 'raqms' not just the first digit.
1124
+ * Pattern type key names for split rules.
1126
1125
  *
1127
- * Tokens not in this list are appended in alphabetical order from TOKEN_PATTERNS.
1126
+ * Use this array to dynamically iterate over pattern types in UIs,
1127
+ * or use the `PatternTypeKey` type for type-safe string unions.
1128
+ *
1129
+ * @example
1130
+ * // Build a dropdown/select in UI
1131
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
1132
+ *
1133
+ * @example
1134
+ * // Type-safe pattern key validation
1135
+ * const validateKey = (k: string): k is PatternTypeKey =>
1136
+ * (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
1128
1137
  */
1129
- const TOKEN_PRIORITY_ORDER = [
1130
- "basmalah",
1131
- "kitab",
1132
- "bab",
1133
- "fasl",
1134
- "naql",
1135
- "rumuz",
1136
- "numbered",
1137
- "raqms",
1138
- "raqm",
1139
- "tarqim",
1140
- "bullet",
1141
- "dash",
1142
- "harf"
1138
+ const PATTERN_TYPE_KEYS = [
1139
+ "lineStartsWith",
1140
+ "lineStartsAfter",
1141
+ "lineEndsWith",
1142
+ "template",
1143
+ "regex",
1144
+ "dictionaryEntry"
1143
1145
  ];
1146
+ //#endregion
1147
+ //#region src/optimization/optimize-rules.ts
1148
+ const MERGEABLE_KEYS = new Set([
1149
+ "lineStartsWith",
1150
+ "lineStartsAfter",
1151
+ "lineEndsWith"
1152
+ ]);
1144
1153
  /**
1145
- * Gets the token detection priority order.
1146
- * Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
1154
+ * Get the pattern type key for a rule.
1147
1155
  */
1148
- const getTokenPriority = () => {
1149
- const allTokens = getAvailableTokens();
1150
- const prioritized = TOKEN_PRIORITY_ORDER.filter((t) => allTokens.includes(t));
1151
- const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
1152
- return [...prioritized, ...remaining];
1156
+ const getPatternKey = (rule) => PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
1157
+ const getPatternArray = (rule, key) => {
1158
+ const value = rule[key];
1159
+ return Array.isArray(value) ? value : [];
1153
1160
  };
1154
- const isRumuzStandalone = (text, startIndex, endIndex) => {
1155
- const before = startIndex > 0 ? text[startIndex - 1] : "";
1156
- const after = endIndex < text.length ? text[endIndex] : "";
1157
- const isWhitespace = (ch) => !!ch && /\s/u.test(ch);
1158
- const isOpenBracket = (ch) => !!ch && /[([{]/u.test(ch);
1159
- const isRightDelimiter = (ch) => !!ch && /[::\-–—ـ،؛.?!؟)\]}]/u.test(ch);
1160
- const isArabicWordy = (ch) => !!ch && /[\u0600-\u06FF]/u.test(ch);
1161
- const leftOk = !before || isWhitespace(before) || isOpenBracket(before) || !isArabicWordy(before);
1162
- const rightOk = !after || isWhitespace(after) || isRightDelimiter(after) || !isArabicWordy(after);
1163
- return leftOk && rightOk;
1161
+ const getPatternString = (rule, key) => {
1162
+ const value = rule[key];
1163
+ return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
1164
1164
  };
1165
- /**
1166
- * Analyzes text and returns all detected token patterns with their positions.
1167
- * Patterns are detected in priority order to avoid partial matches.
1168
- *
1169
- * @param text - The text to analyze for token patterns
1170
- * @returns Array of detected patterns sorted by position
1171
- *
1172
- * @example
1173
- * detectTokenPatterns("٣٤ - حدثنا")
1174
- * // Returns: [
1175
- * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
1176
- * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
1177
- * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
1178
- * // ]
1179
- */
1180
- const detectTokenPatterns = (text) => {
1181
- if (!text) return [];
1182
- const results = [];
1183
- const coveredRanges = [];
1184
- const isPositionCovered = (start, end) => {
1185
- return coveredRanges.some(([s, e]) => start >= s && start < e || end > s && end <= e || start <= s && end >= e);
1186
- };
1187
- for (const tokenName of getTokenPriority()) {
1188
- const pattern = TOKEN_PATTERNS[tokenName];
1189
- if (!pattern) continue;
1190
- try {
1191
- const regex = new RegExp(`(${pattern})`, "gu");
1192
- let match;
1193
- while ((match = regex.exec(text)) !== null) {
1194
- const startIndex = match.index;
1195
- const endIndex = startIndex + match[0].length;
1196
- if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
1197
- if (isPositionCovered(startIndex, endIndex)) continue;
1198
- results.push({
1199
- endIndex,
1200
- index: startIndex,
1201
- match: match[0],
1202
- token: tokenName
1203
- });
1204
- coveredRanges.push([startIndex, endIndex]);
1205
- }
1206
- } catch {}
1207
- }
1208
- return results.sort((a, b) => a.index - b.index);
1165
+ const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
1166
+ const serializePrimitive = (value) => {
1167
+ if (value === void 0) return "undefined";
1168
+ if (typeof value === "number") return Number.isFinite(value) ? JSON.stringify(value) : JSON.stringify(String(value));
1169
+ if (typeof value === "bigint") return JSON.stringify(`${value}n`);
1170
+ if (typeof value === "symbol") return JSON.stringify(value.toString());
1171
+ return JSON.stringify(value);
1172
+ };
1173
+ const stableSerializeArray = (values, seen) => `[${values.map((value) => stableSerializeValue(value, seen)).join(",")}]`;
1174
+ const stableSerializeObject = (value, seen) => {
1175
+ if (seen.has(value)) throw new TypeError("Cannot optimize rules with circular option values");
1176
+ seen.add(value);
1177
+ const serialized = Object.entries(value).filter(([, entryValue]) => entryValue !== void 0).sort(([left], [right]) => left.localeCompare(right)).map(([entryKey, entryValue]) => `${JSON.stringify(entryKey)}:${stableSerializeValue(entryValue, seen)}`).join(",");
1178
+ seen.delete(value);
1179
+ return `{${serialized}}`;
1180
+ };
1181
+ const stableSerializeValue = (value, seen) => {
1182
+ if (typeof value === "function") return JSON.stringify(`[Function:${value.name || "anonymous"}]`);
1183
+ if (!value || typeof value !== "object") return serializePrimitive(value);
1184
+ if (Array.isArray(value)) return stableSerializeArray(value, seen);
1185
+ if (value instanceof Date) return JSON.stringify(value.toISOString());
1186
+ if (value instanceof RegExp) return JSON.stringify(value.toString());
1187
+ return stableSerializeObject(value, seen);
1188
+ };
1189
+ const stableSerialize = (value) => stableSerializeValue(value, /* @__PURE__ */ new WeakSet());
1190
+ const getDictionaryEntrySpecificityScore = (rule) => {
1191
+ if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
1192
+ const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
1193
+ return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
1209
1194
  };
1210
- /**
1211
- * Generates a template pattern from text using detected tokens.
1212
- * Replaces matched portions with {{token}} syntax.
1213
- *
1214
- * @param text - Original text
1215
- * @param detected - Array of detected patterns from detectTokenPatterns
1216
- * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
1217
- *
1218
- * @example
1219
- * const detected = detectTokenPatterns("٣٤ - ");
1220
- * generateTemplateFromText("٣٤ - ", detected);
1221
- * // Returns: "{{raqms}} {{dash}} "
1222
- */
1223
- const generateTemplateFromText = (text, detected) => {
1224
- if (!text || detected.length === 0) return text;
1225
- let template = text;
1226
- const sortedByIndexDesc = [...detected].sort((a, b) => b.index - a.index);
1227
- for (const d of sortedByIndexDesc) template = `${template.slice(0, d.index)}{{${d.token}}}${template.slice(d.endIndex)}`;
1228
- return template;
1195
+ const getSpecificityScore = (rule) => {
1196
+ const key = getPatternKey(rule);
1197
+ if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
1198
+ return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
1229
1199
  };
1230
- /**
1231
- * Determines the best pattern type for auto-generated rules based on detected patterns.
1232
- *
1233
- * @param detected - Array of detected patterns
1234
- * @returns Suggested pattern type and whether to use fuzzy matching
1235
- */
1236
- const suggestPatternConfig = (detected) => {
1237
- const hasStructuralToken = detected.some((d) => [
1238
- "basmalah",
1239
- "kitab",
1240
- "bab",
1241
- "fasl"
1242
- ].includes(d.token));
1243
- const hasNumberedPattern = detected.some((d) => [
1244
- "raqms",
1245
- "raqm",
1246
- "numbered"
1247
- ].includes(d.token));
1248
- if (hasStructuralToken) return {
1249
- fuzzy: true,
1250
- metaType: detected.find((d) => [
1251
- "kitab",
1252
- "bab",
1253
- "fasl"
1254
- ].includes(d.token))?.token || "chapter",
1255
- patternType: "lineStartsWith"
1256
- };
1257
- if (hasNumberedPattern) return {
1258
- fuzzy: false,
1259
- metaType: "hadith",
1260
- patternType: "lineStartsAfter"
1261
- };
1200
+ const createMergeKey = (rule) => {
1201
+ const key = getPatternKey(rule);
1202
+ return `${key}|${stableSerialize(Object.fromEntries(Object.entries(rule).filter(([field]) => field !== key)))}`;
1203
+ };
1204
+ const optimizeRules = (rules) => {
1205
+ const output = [];
1206
+ const indexByMergeKey = /* @__PURE__ */ new Map();
1207
+ let mergedCount = 0;
1208
+ for (const rule of rules) {
1209
+ const key = getPatternKey(rule);
1210
+ if (!MERGEABLE_KEYS.has(key)) {
1211
+ output.push(rule);
1212
+ continue;
1213
+ }
1214
+ const mergeKey = createMergeKey(rule);
1215
+ const existingIndex = indexByMergeKey.get(mergeKey);
1216
+ if (existingIndex === void 0) {
1217
+ indexByMergeKey.set(mergeKey, output.length);
1218
+ output.push({
1219
+ ...rule,
1220
+ [key]: normalizePatterns(getPatternArray(rule, key))
1221
+ });
1222
+ } else {
1223
+ const existing = output[existingIndex];
1224
+ existing[key] = normalizePatterns([...getPatternArray(existing, key), ...getPatternArray(rule, key)]);
1225
+ mergedCount++;
1226
+ }
1227
+ }
1262
1228
  return {
1263
- fuzzy: false,
1264
- patternType: "lineStartsAfter"
1229
+ mergedCount,
1230
+ rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
1265
1231
  };
1266
1232
  };
1267
- /**
1268
- * Analyzes text and generates a complete suggested rule configuration.
1269
- *
1270
- * @param text - Highlighted text from the page
1271
- * @returns Suggested rule configuration or null if no patterns detected
1272
- */
1273
- const analyzeTextForRule = (text) => {
1274
- const detected = detectTokenPatterns(text);
1275
- if (detected.length === 0) return null;
1233
+ //#endregion
1234
+ //#region src/segmentation/pattern-validator.ts
1235
+ const KNOWN_TOKENS = new Set(getAvailableTokens());
1236
+ const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
1237
+ const BARE_TOKEN_REGEX = (() => {
1238
+ const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
1239
+ return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
1240
+ })();
1241
+ const createMalformedTokenIssue = (tokenLiteral, side) => {
1242
+ const token = tokenLiteral.split(":", 1)[0] || void 0;
1276
1243
  return {
1277
- detected,
1278
- template: generateTemplateFromText(text, detected),
1279
- ...suggestPatternConfig(detected)
1244
+ message: `Token "${tokenLiteral || "unknown"}" appears to be missing ${side} braces.`,
1245
+ suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
1246
+ token,
1247
+ type: "missing_braces"
1280
1248
  };
1281
1249
  };
1282
- //#endregion
1283
- //#region src/dictionary/arabic-dictionary-rule.ts
1284
- const uniqueCanonicalWords = (words) => {
1285
- const seen = /* @__PURE__ */ new Set();
1286
- const result = [];
1287
- for (const word of words) {
1288
- const normalized = normalizeArabicForComparison(word);
1289
- if (!normalized || seen.has(normalized)) continue;
1290
- seen.add(normalized);
1291
- result.push(word);
1250
+ const detectMalformedLeftToken = (pattern) => {
1251
+ for (let index = 0; index < pattern.length - 1; index++) {
1252
+ if (pattern.slice(index, index + 2) !== "{{") continue;
1253
+ const closeIndex = pattern.indexOf("}}", index + 2);
1254
+ if (closeIndex === -1) return createMalformedTokenIssue(pattern.slice(index + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "", "closing");
1255
+ index = closeIndex + 1;
1292
1256
  }
1293
- return result;
1294
- };
1295
- const buildStopAlternation = (stopWords) => {
1296
- const unique = uniqueCanonicalWords(stopWords);
1297
- if (unique.length === 0) return "";
1298
- return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
1299
1257
  };
1300
- const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
1301
- if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
1302
- const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
1303
- return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
1258
+ const detectMalformedRightToken = (pattern) => {
1259
+ for (let index = 0; index < pattern.length - 1; index++) {
1260
+ if (pattern.slice(index, index + 2) !== "}}") continue;
1261
+ if (pattern.lastIndexOf("{{", index) === -1) return createMalformedTokenIssue(pattern.slice(0, index).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "", "opening");
1262
+ }
1304
1263
  };
1305
- const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
1306
- const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
1307
- const withCapture = `(?<${captureName}>${headwordBody})`;
1308
- if (!allowParenthesized) return `${withCapture}${colon}`;
1309
- return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
1264
+ const detectMalformedToken = (pattern) => detectMalformedLeftToken(pattern) ?? detectMalformedRightToken(pattern);
1265
+ /**
1266
+ * Validates a single pattern for common issues.
1267
+ */
1268
+ const validatePattern = (pattern, seenPatterns) => {
1269
+ if (!pattern.trim()) return {
1270
+ message: "Empty pattern is not allowed",
1271
+ type: "empty_pattern"
1272
+ };
1273
+ if (seenPatterns.has(pattern)) return {
1274
+ message: `Duplicate pattern: "${pattern}"`,
1275
+ pattern,
1276
+ type: "duplicate"
1277
+ };
1278
+ seenPatterns.add(pattern);
1279
+ TOKEN_INSIDE_BRACES.lastIndex = 0;
1280
+ for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
1281
+ const name = match[1];
1282
+ if (name && !KNOWN_TOKENS.has(name)) return {
1283
+ message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
1284
+ suggestion: "Check spelling or use a known token",
1285
+ token: name,
1286
+ type: "unknown_token"
1287
+ };
1288
+ }
1289
+ const malformed = detectMalformedToken(pattern);
1290
+ if (malformed) return malformed;
1291
+ for (const match of pattern.matchAll(BARE_TOKEN_REGEX)) {
1292
+ const [full, name] = match;
1293
+ const idx = match.index;
1294
+ if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
1295
+ message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
1296
+ suggestion: `{{${full}}}`,
1297
+ token: name,
1298
+ type: "missing_braces"
1299
+ };
1300
+ }
1310
1301
  };
1311
- const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
1312
- if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
1313
- if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
1314
- if (!/^[A-Za-z_]\w*$/.test(captureName)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
1302
+ /**
1303
+ * Validates an array of patterns, returning parallel array of issues.
1304
+ */
1305
+ const validatePatternArray = (patterns) => {
1306
+ const seen = /* @__PURE__ */ new Set();
1307
+ const issues = patterns.map((p) => validatePattern(p, seen));
1308
+ return issues.some(Boolean) ? issues : void 0;
1315
1309
  };
1316
- const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
1317
- validateDictionaryEntryOptions({
1318
- captureName,
1319
- maxLetters,
1320
- minLetters
1321
- });
1322
- const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
1323
- const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
1324
- const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
1325
- const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
1326
- const stopAlternation = buildStopAlternation(stopWords);
1327
- const lemmaBody = buildHeadwordBody({
1328
- allowCommaSeparated,
1329
- colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
1330
- stopAlternation,
1331
- stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
1332
- unit: lemmaUnit
1333
- });
1334
- const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
1335
- const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
1336
- const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
1337
- const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
1338
- allowParenthesized,
1339
- allowWhitespaceBeforeColon,
1340
- captureName: prefixedCaptureName,
1341
- headwordBody: lemmaBody
1342
- });
1343
- return {
1344
- captureNames: [prefixedCaptureName],
1345
- regex
1346
- };
1310
+ const applyRulePatternValidation = (result, key, patterns) => {
1311
+ if (!patterns) return false;
1312
+ const issues = validatePatternArray(patterns);
1313
+ if (!issues) return false;
1314
+ result[key] = issues;
1315
+ return true;
1316
+ };
1317
+ const validateTemplateRule = (rule, result) => {
1318
+ if (!("template" in rule)) return false;
1319
+ const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
1320
+ if (!issue) return false;
1321
+ result.template = issue;
1322
+ return true;
1323
+ };
1324
+ const validateRegexRule = (rule, result) => {
1325
+ if (!("regex" in rule)) return false;
1326
+ if (!rule.regex.trim()) {
1327
+ result.regex = {
1328
+ message: "Empty pattern is not allowed",
1329
+ type: "empty_pattern"
1330
+ };
1331
+ return true;
1332
+ }
1333
+ try {
1334
+ new RegExp(rule.regex, "u");
1335
+ return false;
1336
+ } catch (error) {
1337
+ result.regex = {
1338
+ message: error instanceof Error ? error.message : String(error),
1339
+ pattern: rule.regex,
1340
+ type: "invalid_regex"
1341
+ };
1342
+ return true;
1343
+ }
1344
+ };
1345
+ const invalidDictionaryEntryIssue = (message) => ({
1346
+ message,
1347
+ type: "invalid_option"
1348
+ });
1349
+ const addBooleanDictionaryEntryIssue = (issues, key, value) => {
1350
+ if (value !== void 0 && typeof value !== "boolean") issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
1351
+ };
1352
+ const addCaptureNameIssue = (issues, captureName) => {
1353
+ if (captureName !== void 0 && !/^[A-Za-z_]\w*$/.test(captureName)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
1354
+ };
1355
+ const addMinLettersIssue = (issues, minLetters) => {
1356
+ if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
1357
+ };
1358
+ const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
1359
+ const min = minLetters ?? 2;
1360
+ if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < min)) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${min}`);
1361
+ };
1362
+ const validateDictionaryEntryRule = (rule, result) => {
1363
+ if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
1364
+ const issues = {};
1365
+ const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
1366
+ if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
1367
+ addBooleanDictionaryEntryIssue(issues, "allowCommaSeparated", allowCommaSeparated);
1368
+ addBooleanDictionaryEntryIssue(issues, "allowParenthesized", allowParenthesized);
1369
+ addBooleanDictionaryEntryIssue(issues, "allowWhitespaceBeforeColon", allowWhitespaceBeforeColon);
1370
+ addBooleanDictionaryEntryIssue(issues, "midLineSubentries", midLineSubentries);
1371
+ addCaptureNameIssue(issues, captureName);
1372
+ addMinLettersIssue(issues, minLetters);
1373
+ addMaxLettersIssue(issues, maxLetters, minLetters);
1374
+ if (Object.keys(issues).length === 0) return false;
1375
+ result.dictionaryEntry = issues;
1376
+ return true;
1377
+ };
1378
+ const formatValidationIssue = (_type, issue, loc) => {
1379
+ if (!issue) return null;
1380
+ if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
1381
+ if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
1382
+ if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
1383
+ if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
1384
+ return `${loc}: ${issue.message || issue.type}`;
1347
1385
  };
1348
1386
  /**
1349
- * Creates a reusable split rule for Arabic dictionary entries.
1387
+ * Validates split rules for common pattern issues.
1350
1388
  *
1351
- * The returned rule preserves authoring intent as a serializable
1352
- * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
1353
- * regex string.
1389
+ * Checks for:
1390
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
1391
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
1392
+ * - Duplicate patterns within the same rule
1354
1393
  *
1355
- * @example
1356
- * createArabicDictionaryEntryRule({
1357
- * stopWords: ['وقيل', 'ويقال', 'قال'],
1358
- * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1359
- * })
1394
+ * @param rules - Array of split rules to validate
1395
+ * @returns Array parallel to input with validation results (undefined if no issues)
1360
1396
  *
1361
1397
  * @example
1362
- * createArabicDictionaryEntryRule({
1363
- * allowParenthesized: true,
1364
- * allowWhitespaceBeforeColon: true,
1365
- * allowCommaSeparated: true,
1366
- * stopWords: ['الليث', 'العجاج'],
1367
- * })
1398
+ * const issues = validateRules([
1399
+ * { lineStartsAfter: ['raqms:num'] }, // Missing braces
1400
+ * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
1401
+ * ]);
1402
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
1403
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
1368
1404
  */
1405
+ const validateRules = (rules) => rules.map((rule) => {
1406
+ const result = {};
1407
+ const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", "lineStartsWith" in rule ? rule.lineStartsWith : void 0);
1408
+ const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0);
1409
+ const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", "lineEndsWith" in rule ? rule.lineEndsWith : void 0);
1410
+ const templateIssues = validateTemplateRule(rule, result);
1411
+ const regexIssues = validateRegexRule(rule, result);
1412
+ const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
1413
+ return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
1414
+ });
1369
1415
  /**
1370
- * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
1371
- * whole-book dictionary segmentation. Keep this helper for advanced single-rule
1372
- * composition inside a broader `SplitRule[]` pipeline.
1416
+ * Formats a validation result array into a list of human-readable error messages.
1417
+ *
1418
+ * Useful for displaying validation errors in UIs.
1419
+ *
1420
+ * @param results - The result array from `validateRules()`
1421
+ * @returns Array of formatted error strings
1422
+ *
1423
+ * @example
1424
+ * const issues = validateRules(rules);
1425
+ * const errors = formatValidationReport(issues);
1426
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
1373
1427
  */
1374
- const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
1375
- validateDictionaryEntryOptions({
1376
- captureName,
1377
- maxLetters,
1378
- minLetters
1379
- });
1380
- return {
1381
- dictionaryEntry: {
1382
- allowCommaSeparated,
1383
- allowParenthesized,
1384
- allowWhitespaceBeforeColon,
1385
- captureName,
1386
- maxLetters,
1387
- midLineSubentries,
1388
- minLetters,
1389
- stopWords: uniqueCanonicalWords(stopWords)
1390
- },
1391
- meta,
1392
- pageStartPrevWordStoplist,
1393
- samePagePrevWordStoplist
1394
- };
1428
+ const formatValidationReport = (results) => results.flatMap((result, i) => {
1429
+ if (!result) return [];
1430
+ return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
1431
+ });
1432
+ const formatValidationIssues = (type, issues, ruleNumber) => {
1433
+ if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
1434
+ return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
1395
1435
  };
1396
1436
  //#endregion
1397
- //#region src/dictionary/heading-classifier.ts
1398
- const HEADING_PREFIX$1 = "## ";
1399
- const CODE_LINE_PATTERN$1 = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
1400
- const ARABIC_WORD_PATTERN = ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN;
1401
- const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
1402
- const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
1403
- const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN$1})(?:[)\\]])?$`, "u");
1404
- const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
1405
- const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
1406
- const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
1407
- const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
1408
- const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
1409
- const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
1410
- const COLON_NOISE_RE = /^.+:\s*.+$/u;
1411
- const CHAPTER_TERMS = [
1412
- "باب",
1413
- "فصل",
1414
- "كتاب",
1415
- "حرف",
1416
- "أبواب"
1417
- ];
1418
- const MARKER_PREFIXES = [
1419
- "بسم الله",
1420
- "توكلت على الله",
1421
- "آخر كتاب",
1422
- "ويتلوه"
1423
- ];
1424
- const NOISE_TOKENS = [
1425
- "قال",
1426
- "وقيل",
1427
- "ويقال",
1428
- "وفي",
1429
- "يعني",
1430
- "فإذا"
1431
- ];
1432
- const emptyCounts = () => ({
1433
- chapter: 0,
1434
- cluster: 0,
1435
- codeLine: 0,
1436
- entry: 0,
1437
- inlineSubentry: 0,
1438
- lineEntry: 0,
1439
- marker: 0,
1440
- noise: 0,
1441
- pairedForms: 0
1442
- });
1443
- const extractWrappedLemma = (lemma) => lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim();
1444
- const stripLeadingWrappers = (text) => text.replace(/^[[{(]+\s*/u, "").trim();
1445
- const isDelimitedPrefixMatch$1 = (text, prefix) => {
1446
- if (text === prefix) return true;
1447
- if (!text.startsWith(prefix)) return false;
1448
- const nextChar = text[prefix.length];
1449
- return nextChar === void 0 || /[\s:،؛()[\]{}\-–—]/u.test(nextChar);
1437
+ //#region src/preprocessing/transforms.ts
1438
+ /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
1439
+ const assertNever$2 = (x) => {
1440
+ throw new Error(`Unknown preprocess transform type: ${JSON.stringify(x)}`);
1450
1441
  };
1451
- const isCodeHeading = (text) => {
1452
- if (CODE_LINE_RE.test(text)) return true;
1453
- const words = text.trim().split(/\s+/u).filter(Boolean);
1454
- return words.length === 1 && (words[0]?.length ?? 0) === 1;
1442
+ /** Check if a character is whitespace (space, newline, tab, etc.) */
1443
+ const isWhitespace = (char) => /\s/.test(char);
1444
+ /**
1445
+ * Check if a character code is a zero-width control character.
1446
+ *
1447
+ * Covers:
1448
+ * - U+200B–U+200F (Zero Width Space, Joiners, Direction Marks)
1449
+ * - U+202A–U+202E (Bidirectional Formatting)
1450
+ * - U+2060–U+2064 (Word Joiner, Invisible Operators)
1451
+ * - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
1452
+ */
1453
+ const isZeroWidth = (code) => code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
1454
+ /**
1455
+ * Remove zero-width control characters from text.
1456
+ *
1457
+ * @param text - Input text
1458
+ * @param mode - 'strip' (default) removes entirely, 'space' replaces with space
1459
+ * @returns Text with zero-width characters removed or replaced
1460
+ */
1461
+ const removeZeroWidth = (text, mode = "strip") => {
1462
+ if (mode === "space") {
1463
+ const parts = [];
1464
+ let lastWasWhitespace = true;
1465
+ for (let i = 0; i < text.length; i++) if (isZeroWidth(text.charCodeAt(i))) {
1466
+ if (!lastWasWhitespace && parts.length > 0) {
1467
+ parts.push(" ");
1468
+ lastWasWhitespace = true;
1469
+ }
1470
+ } else {
1471
+ const char = text[i];
1472
+ parts.push(char);
1473
+ lastWasWhitespace = isWhitespace(char);
1474
+ }
1475
+ return parts.join("");
1476
+ }
1477
+ return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
1455
1478
  };
1456
- const looksLikeNoiseHeading = (text) => {
1457
- const normalized = normalizeArabicForComparison(text);
1458
- const wordCount = text.trim().split(/\s+/u).filter(Boolean).length;
1459
- if (/(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\s])/u.test(text)) return false;
1460
- if (wordCount >= 8 && COLON_NOISE_RE.test(text)) return true;
1461
- return NOISE_TOKENS.some((token) => normalized.includes(normalizeArabicForComparison(token))) && wordCount >= 4;
1479
+ /**
1480
+ * Condense multiple periods (...) into ellipsis character (…).
1481
+ *
1482
+ * Prevents `{{tarqim}}` from false-matching inside ellipsis since
1483
+ * the `.` in tarqim matches individual periods.
1484
+ *
1485
+ * @param text - Input text
1486
+ * @returns Text with period sequences replaced by ellipsis
1487
+ */
1488
+ const condenseEllipsis = (text) => text.replace(/\.{2,}/g, "…");
1489
+ /**
1490
+ * Join trailing و (waw) to the next word.
1491
+ *
1492
+ * Fixes OCR/digitization artifacts: ' و ' → ' و' (waw joined to next word)
1493
+ *
1494
+ * @param text - Input text
1495
+ * @returns Text with trailing waw joined to following word
1496
+ */
1497
+ const fixTrailingWaw = (text) => text.replace(/ و /g, " و");
1498
+ /**
1499
+ * Check if a page ID is within a constraint range.
1500
+ */
1501
+ const isInRange = (pageId, constraint) => {
1502
+ if (constraint.min !== void 0 && pageId < constraint.min) return false;
1503
+ if (constraint.max !== void 0 && pageId > constraint.max) return false;
1504
+ return true;
1505
+ };
1506
+ /**
1507
+ * Normalize a transform to its object form.
1508
+ */
1509
+ const normalizeTransform = (transform) => {
1510
+ if (typeof transform === "string") return { type: transform };
1511
+ return transform;
1512
+ };
1513
+ /**
1514
+ * Apply preprocessing transforms to a page's content.
1515
+ *
1516
+ * Transforms run in array order. Each can be limited to specific pages
1517
+ * via `min`/`max` constraints.
1518
+ *
1519
+ * @param content - Page content to transform
1520
+ * @param pageId - Page ID for constraint checking
1521
+ * @param transforms - Array of transforms to apply
1522
+ * @returns Transformed content
1523
+ */
1524
+ const applyPreprocessToPage = (content, pageId, transforms) => {
1525
+ let result = content;
1526
+ for (const transform of transforms) {
1527
+ const rule = normalizeTransform(transform);
1528
+ if (!isInRange(pageId, rule)) continue;
1529
+ switch (rule.type) {
1530
+ case "removeZeroWidth":
1531
+ result = removeZeroWidth(result, rule.mode ?? "strip");
1532
+ break;
1533
+ case "condenseEllipsis":
1534
+ result = condenseEllipsis(result);
1535
+ break;
1536
+ case "fixTrailingWaw":
1537
+ result = fixTrailingWaw(result);
1538
+ break;
1539
+ default: assertNever$2(rule.type);
1540
+ }
1541
+ }
1542
+ return result;
1462
1543
  };
1544
+ //#endregion
1545
+ //#region src/validation/validate-segments.ts
1463
1546
  /**
1464
- * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
1547
+ * Creates a short preview string of text content for error reporting.
1548
+ * Truncates content exceeding PREVIEW_LIMIT.
1465
1549
  */
1466
- const classifyDictionaryHeading = (line) => {
1467
- const text = line.startsWith(HEADING_PREFIX$1) ? line.slice(3).trim() : line.trim();
1468
- const unwrapped = stripLeadingWrappers(text);
1469
- if (!text) return "noise";
1470
- if (CHAPTER_HEADING_RE.test(text) || CHAPTER_TERMS.some((term) => isDelimitedPrefixMatch$1(normalizeArabicForComparison(unwrapped), normalizeArabicForComparison(term)))) return "chapter";
1471
- if (looksLikeNoiseHeading(text)) return "noise";
1472
- if (isCodeHeading(text)) return "marker";
1473
- if (MARKER_PREFIXES.some((token) => normalizeArabicForComparison(unwrapped).startsWith(normalizeArabicForComparison(token)))) return "marker";
1474
- if (STATUS_HEADING_RE.test(text) || CODE_NOTE_HEADING_RE.test(text)) return "marker";
1475
- if (CLUSTER_HEADING_RE.test(text)) return "cluster";
1476
- return "entry";
1550
+ const buildPreview = (text) => {
1551
+ const normalized = text.replace(/\s+/g, " ").trim();
1552
+ if (normalized.length <= 140) return normalized;
1553
+ return `${normalized.slice(0, 140)}...`;
1477
1554
  };
1478
- const createHeadingMatch = (kind, page, rawLine, lineNumber) => ({
1479
- kind,
1480
- lemma: kind === "entry" ? rawLine.slice(3).trim() : void 0,
1481
- line: lineNumber,
1482
- pageId: page.id,
1483
- text: rawLine
1484
- });
1485
- const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => ({
1486
- kind,
1487
- lemma,
1488
- line: lineNumber,
1489
- pageId: page.id,
1490
- text
1555
+ /**
1556
+ * Creates a lightweight snapshot of a segment for inclusion in validation checks.
1557
+ */
1558
+ const buildSegmentSnapshot = (segment) => ({
1559
+ contentPreview: buildPreview(segment.content),
1560
+ from: segment.from,
1561
+ to: segment.to
1491
1562
  });
1492
- const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
1493
- if (!rawLine.startsWith(HEADING_PREFIX$1)) return false;
1494
- const kind = classifyDictionaryHeading(rawLine);
1495
- matches.push(createHeadingMatch(kind, page, rawLine, lineNumber));
1496
- return true;
1497
- };
1498
- const scanLineEntry = (page, rawLine, lineNumber, matches) => {
1499
- const lineEntry = rawLine.match(PLAIN_ENTRY_RE);
1500
- if (!lineEntry?.groups?.lemma) return;
1501
- matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lineEntry.groups.lemma)));
1502
- };
1503
- const scanPairedForms = (page, rawLine, lineNumber, matches) => {
1504
- const pairedForms = rawLine.match(PAIRED_FORMS_RE);
1505
- if (!pairedForms?.groups?.forms) return;
1506
- matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, pairedForms.groups.forms));
1507
- };
1508
- const scanCodeLine = (page, rawLine, lineNumber, matches) => {
1509
- const codeLine = rawLine.match(CODE_LINE_RE);
1510
- if (!codeLine?.groups?.codes) return;
1511
- matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codeLine.groups.codes));
1512
- };
1513
- const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
1514
- for (const match of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
1515
- if (!match.groups?.lemma) continue;
1516
- matches.push(createSurfaceMatch("inlineSubentry", page, match.groups.lemma, lineNumber, match.groups.lemma));
1517
- }
1518
- };
1519
1563
  /**
1520
- * Extracts dictionary surface matches from a markdown page.
1564
+ * Normalizes page content by applying preprocessing transforms and standardizing line endings.
1521
1565
  */
1522
- const scanDictionaryMarkdownPage = (page) => {
1523
- const lines = page.content.split(/\n/u);
1524
- const matches = [];
1525
- for (let index = 0; index < lines.length; index++) {
1526
- const rawLine = lines[index]?.trim() ?? "";
1527
- if (!rawLine) continue;
1528
- if (scanHeadingLine(page, rawLine, index + 1, matches)) continue;
1529
- scanLineEntry(page, rawLine, index + 1, matches);
1530
- scanPairedForms(page, rawLine, index + 1, matches);
1531
- scanCodeLine(page, rawLine, index + 1, matches);
1532
- scanInlineSubentries(page, rawLine, index + 1, matches);
1533
- }
1534
- return matches;
1566
+ const normalizePages = (pages, options) => {
1567
+ const transforms = options.preprocess ?? [];
1568
+ return pages.map((page) => {
1569
+ return {
1570
+ content: normalizeLineEndings(transforms.length ? applyPreprocessToPage(page.content, page.id, transforms) : page.content),
1571
+ id: page.id
1572
+ };
1573
+ });
1535
1574
  };
1536
1575
  /**
1537
- * Aggregates dictionary surface counts across markdown pages.
1576
+ * Joins all page content into a single string with boundary tracking.
1577
+ * Returns the joined string and a list of boundary mappings (start/end indices for each page).
1538
1578
  */
1539
- const analyzeDictionaryMarkdownPages = (pages) => {
1540
- const counts = emptyCounts();
1541
- const matches = [];
1542
- for (const page of pages) {
1543
- const pageMatches = scanDictionaryMarkdownPage(page);
1544
- for (const match of pageMatches) {
1545
- counts[match.kind] += 1;
1546
- matches.push(match);
1547
- }
1579
+ const buildJoinedContent = (pages, joiner) => {
1580
+ const boundaries = [];
1581
+ const joined = pages.map((p) => p.content).join(joiner);
1582
+ let offset = 0;
1583
+ for (let i = 0; i < pages.length; i++) {
1584
+ const content = pages[i].content;
1585
+ const start = offset;
1586
+ const end = start + content.length;
1587
+ boundaries.push({
1588
+ end,
1589
+ id: pages[i].id,
1590
+ start
1591
+ });
1592
+ offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
1548
1593
  }
1549
1594
  return {
1550
- counts,
1551
- matches
1595
+ boundaries,
1596
+ joined
1552
1597
  };
1553
1598
  };
1554
- //#endregion
1555
- //#region src/dictionary/profile.ts
1556
- const normalizedProfileCache = /* @__PURE__ */ new WeakMap();
1557
- const normalizeStopLemmaWord = (word) => normalizeArabicForComparison(word).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
1558
- const uniqueNormalizedSet = (values, normalize) => new Set(values.map(normalize).filter(Boolean));
1559
- const assertNever$2 = (value) => {
1560
- throw new Error(`Unhandled dictionary profile variant: ${JSON.stringify(value)}`);
1561
- };
1562
- const normalizeFamily = (family) => {
1563
- switch (family.use) {
1564
- case "heading": return {
1565
- ...family,
1566
- allowNextLineColon: family.allowNextLineColon ?? false,
1567
- allowSingleLetter: family.allowSingleLetter ?? false
1568
- };
1569
- case "lineEntry": return {
1570
- ...family,
1571
- allowMultiWord: family.allowMultiWord ?? false,
1572
- allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false,
1573
- wrappers: family.wrappers ?? "none"
1574
- };
1575
- case "inlineSubentry": return {
1576
- ...family,
1577
- prefixes: family.prefixes ?? ["و"],
1578
- stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true
1579
- };
1580
- case "codeLine": return {
1581
- ...family,
1582
- wrappers: family.wrappers ?? "either"
1583
- };
1584
- case "pairedForms": return {
1585
- ...family,
1586
- requireStatusTail: family.requireStatusTail ?? false,
1587
- separator: family.separator ?? "comma"
1588
- };
1589
- default: return assertNever$2(family);
1599
+ /**
1600
+ * Binary search to find which page ID corresponds to a character offset in the joined content.
1601
+ * Returns undefined if the offset falls within a joiner gap or outside bounds.
1602
+ */
1603
+ const findBoundaryIdForOffset = (offset, boundaries) => {
1604
+ let lo = 0;
1605
+ let hi = boundaries.length - 1;
1606
+ while (lo <= hi) {
1607
+ const mid = lo + hi >>> 1;
1608
+ const boundary = boundaries[mid];
1609
+ if (offset < boundary.start) hi = mid - 1;
1610
+ else if (offset > boundary.end) lo = mid + 1;
1611
+ else return boundary.id;
1590
1612
  }
1613
+ if (boundaries.length === 0) return;
1614
+ const last = boundaries.at(-1);
1615
+ return offset > last.end ? last.id : void 0;
1591
1616
  };
1592
- const normalizeBlocker = (blocker) => {
1593
- switch (blocker.use) {
1594
- case "authorityIntro": return {
1595
- ...blocker,
1596
- precision: blocker.precision ?? "high"
1597
- };
1598
- case "stopLemma": return {
1599
- ...blocker,
1600
- normalizedWords: uniqueNormalizedSet(blocker.words, normalizeStopLemmaWord)
1617
+ /**
1618
+ * Helper to construct a standardized validation issue object.
1619
+ */
1620
+ const createIssue$1 = (type, segment, segmentIndex, overrides = {}, pageMap) => {
1621
+ const segmentSnapshot = buildSegmentSnapshot(segment);
1622
+ const page = pageMap?.get(segment.from);
1623
+ const matchIndex = overrides.matchIndex;
1624
+ const { matchIndex: _ignored, ...restOverrides } = overrides;
1625
+ const base = {
1626
+ actual: {
1627
+ from: segment.from,
1628
+ to: segment.to
1629
+ },
1630
+ segment: segmentSnapshot,
1631
+ segmentIndex,
1632
+ ...restOverrides
1633
+ };
1634
+ switch (type) {
1635
+ case "page_not_found": return {
1636
+ ...base,
1637
+ evidence: overrides.evidence ?? `Segment.from=${segment.from} does not exist in input pages.`,
1638
+ hint: "Check page IDs passed into segmentPages() and validateSegments().",
1639
+ severity: "error",
1640
+ type
1601
1641
  };
1602
- case "previousWord": return {
1603
- ...blocker,
1604
- normalizedWords: uniqueNormalizedSet(blocker.words, normalizeArabicForComparison)
1642
+ case "content_not_found": return {
1643
+ ...base,
1644
+ evidence: overrides.evidence ?? "Segment content not found in any page content.",
1645
+ hint: overrides.hint ?? "Check preprocessing options, joiner settings, or whitespace normalization.",
1646
+ pageContext: page ? {
1647
+ pageId: page.id,
1648
+ pagePreview: buildPreview(page.content)
1649
+ } : void 0,
1650
+ severity: "error",
1651
+ type
1605
1652
  };
1606
- case "previousChar": return {
1607
- ...blocker,
1608
- charSet: new Set(blocker.chars)
1653
+ case "page_attribution_mismatch": {
1654
+ const matchedFromId = overrides.expected?.from ?? overrides.actual?.from ?? segment.from;
1655
+ const actualPage = pageMap?.get(matchedFromId);
1656
+ return {
1657
+ ...base,
1658
+ evidence: overrides.evidence ?? `Content found in joined content at page ${matchedFromId}, but segment.from=${segment.from}.`,
1659
+ hint: overrides.hint ?? "Check duplicate content handling and boundary detection rules.",
1660
+ pageContext: actualPage ? {
1661
+ matchIndex: matchIndex ?? -1,
1662
+ pageId: actualPage.id,
1663
+ pagePreview: buildPreview(actualPage.content)
1664
+ } : void 0,
1665
+ severity: "error",
1666
+ type
1667
+ };
1668
+ }
1669
+ case "max_pages_violation": return {
1670
+ ...base,
1671
+ evidence: overrides.evidence ?? `Segment spans pages ${segment.from}-${overrides.actual?.to}.`,
1672
+ hint: overrides.hint ?? "Check maxPages windowing in breakpoint-processor.ts and page constraints.",
1673
+ severity: "error",
1674
+ type
1675
+ };
1676
+ default: return {
1677
+ ...base,
1678
+ severity: "error",
1679
+ type
1609
1680
  };
1610
- case "intro":
1611
- case "pageContinuation": return blocker;
1612
- default: return assertNever$2(blocker);
1613
- }
1614
- };
1615
- const normalizeZone = (zone) => ({
1616
- blockers: (zone.blockers ?? []).map(normalizeBlocker),
1617
- families: zone.families.map(normalizeFamily),
1618
- name: zone.name,
1619
- when: zone.when ? {
1620
- activateAfter: zone.when.activateAfter,
1621
- maxPageId: zone.when.maxPageId,
1622
- minPageId: zone.when.minPageId
1623
- } : void 0
1624
- });
1625
- const createIssue$1 = (code, path, message, zoneName) => ({
1626
- code,
1627
- message,
1628
- path,
1629
- ...zoneName ? { zoneName } : {}
1630
- });
1631
- const validateGate = (gate, zone, gateIndex, seenActivateAfterKeys, issues) => {
1632
- const gatePath = `zones[].when.activateAfter[${gateIndex}]`.replace("[]", `[${zone.name}]`);
1633
- if (gate.use === "headingText") {
1634
- if (!gate.match.trim()) issues.push(createIssue$1("invalid_gate_match", `${gatePath}.match`, `dictionary gate match must be non-empty`, zone.name));
1635
- if (gate.fuzzy !== void 0 && typeof gate.fuzzy !== "boolean") issues.push(createIssue$1("invalid_gate_fuzzy", `${gatePath}.fuzzy`, `dictionary gate fuzzy must be a boolean when provided`, zone.name));
1636
1681
  }
1637
- const dedupeKey = `${gate.use}:${JSON.stringify(gate)}`;
1638
- if (seenActivateAfterKeys.has(dedupeKey)) issues.push(createIssue$1("duplicate_activate_after_gate", gatePath, `dictionary zone "${zone.name}" has duplicate activateAfter gates`, zone.name));
1639
- seenActivateAfterKeys.add(dedupeKey);
1640
1682
  };
1641
- const validateFamily = (family, zone, familyIndex, issues) => {
1642
- const familyPath = `zones[].families[${familyIndex}]`.replace("[]", `[${zone.name}]`);
1643
- switch (family.use) {
1644
- case "heading":
1645
- if (family.classes.length === 0) issues.push(createIssue$1("empty_heading_classes", `${familyPath}.classes`, `dictionary heading family in zone "${zone.name}" must include at least one class`, zone.name));
1646
- if (family.emit === "chapter" && !family.classes.includes("chapter")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "chapter" but never matches chapter headings`, zone.name));
1647
- if (family.emit === "marker" && !family.classes.includes("marker")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "marker" but never matches marker headings`, zone.name));
1648
- if (family.emit === "entry" && !family.classes.includes("entry")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "entry" but never matches entry headings`, zone.name));
1649
- break;
1650
- case "lineEntry": break;
1651
- case "inlineSubentry":
1652
- if (family.prefixes?.some((prefix) => !prefix.trim())) issues.push(createIssue$1("empty_inline_prefixes", `${familyPath}.prefixes`, `inlineSubentry prefixes must be non-empty strings`, zone.name));
1653
- break;
1654
- case "codeLine": break;
1655
- case "pairedForms": break;
1656
- default: assertNever$2(family);
1683
+ /**
1684
+ * Finds all occurrences of a content string within the joined text.
1685
+ * Respects search limits to avoid performance cliffs on highly repetitive content.
1686
+ */
1687
+ const findJoinedMatches = (content, joined, searchStart, searchEnd, limit = Infinity) => {
1688
+ const matches = [];
1689
+ if (!content || searchStart >= searchEnd) return matches;
1690
+ let idx = joined.indexOf(content, searchStart);
1691
+ let count = 0;
1692
+ while (idx >= 0 && idx < searchEnd && count < limit) {
1693
+ matches.push({
1694
+ end: idx + content.length - 1,
1695
+ start: idx
1696
+ });
1697
+ idx = joined.indexOf(content, idx + 1);
1698
+ if (idx >= searchEnd) break;
1699
+ count++;
1657
1700
  }
1701
+ return matches;
1658
1702
  };
1659
- const validateBlocker = (blocker, zone, blockerIndex, issues) => {
1660
- const blockerPath = `zones[].blockers[${blockerIndex}]`.replace("[]", `[${zone.name}]`);
1661
- switch (blocker.use) {
1662
- case "stopLemma":
1663
- if (blocker.words.length === 0 || blocker.words.some((word) => !word.trim())) issues.push(createIssue$1("invalid_stop_words", `${blockerPath}.words`, `stopLemma blocker in zone "${zone.name}" must include non-empty words`, zone.name));
1664
- break;
1665
- case "previousWord":
1666
- if (blocker.words.length === 0 || blocker.words.some((word) => !word.trim())) issues.push(createIssue$1("invalid_previous_words", `${blockerPath}.words`, `previousWord blocker in zone "${zone.name}" must include non-empty words`, zone.name));
1667
- break;
1668
- case "previousChar":
1669
- if (blocker.chars.length === 0 || blocker.chars.some((char) => !char)) issues.push(createIssue$1("invalid_previous_chars", `${blockerPath}.chars`, `previousChar blocker in zone "${zone.name}" must include chars`, zone.name));
1670
- break;
1671
- case "authorityIntro":
1672
- case "intro":
1673
- case "pageContinuation": break;
1674
- default: assertNever$2(blocker);
1703
+ /**
1704
+ * Verifies that a matched segment falls within the allowed maxTerms/maxPages constraints.
1705
+ * Checks both implicit spans (calculated from match end) and explicit segment.to claims.
1706
+ */
1707
+ const checkMaxPagesViolation = (segment, segmentIndex, maxPages, matchEnd, _expectedBoundaryEnd, boundaries) => {
1708
+ const actualToId = findBoundaryIdForOffset(matchEnd, boundaries);
1709
+ if (actualToId === void 0) return [];
1710
+ if (maxPages === 0) {
1711
+ if (actualToId !== segment.from) return [createIssue$1("max_pages_violation", segment, segmentIndex, {
1712
+ actual: {
1713
+ from: segment.from,
1714
+ to: actualToId
1715
+ },
1716
+ evidence: `Segment spans pages ${segment.from}-${actualToId} in joined content (maxPages=0).`,
1717
+ expected: {
1718
+ from: segment.from,
1719
+ to: segment.from
1720
+ }
1721
+ })];
1675
1722
  }
1676
- };
1677
- var DictionaryProfileValidationError = class extends Error {
1678
- issues;
1679
- constructor(issues) {
1680
- super(issues.length === 1 ? issues[0].message : `Dictionary profile validation failed with ${issues.length} issues`);
1681
- this.name = "DictionaryProfileValidationError";
1682
- this.issues = issues;
1723
+ if (segment.to !== void 0) {
1724
+ if (actualToId > segment.to) return [createIssue$1("max_pages_violation", segment, segmentIndex, {
1725
+ actual: {
1726
+ from: segment.from,
1727
+ to: actualToId
1728
+ },
1729
+ evidence: `Segment content ends on page ${actualToId} but segment.to is ${segment.to}.`,
1730
+ expected: {
1731
+ from: segment.from,
1732
+ to: segment.to
1733
+ }
1734
+ })];
1735
+ } else if (maxPages !== void 0) {
1736
+ const span = actualToId - segment.from;
1737
+ if (span > maxPages) return [createIssue$1("max_pages_violation", segment, segmentIndex, {
1738
+ actual: {
1739
+ from: segment.from,
1740
+ to: actualToId
1741
+ },
1742
+ evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
1743
+ expected: {
1744
+ from: segment.from,
1745
+ to: segment.from + maxPages
1746
+ }
1747
+ })];
1683
1748
  }
1749
+ return [];
1684
1750
  };
1685
- const validateZone = (zone, zoneIndex, seenZoneNames, issues) => {
1686
- const zonePath = `zones[${zoneIndex}]`;
1687
- const trimmedName = zone.name.trim();
1688
- if (!trimmedName) issues.push(createIssue$1("empty_zone_name", `${zonePath}.name`, `dictionary zone name must be non-empty`));
1689
- else if (seenZoneNames.has(trimmedName)) issues.push(createIssue$1("duplicate_zone_name", `${zonePath}.name`, `dictionary zone names must be unique; duplicated "${trimmedName}"`, trimmedName));
1690
- else seenZoneNames.add(trimmedName);
1691
- if (zone.families.length === 0) issues.push(createIssue$1("empty_zone_families", `${zonePath}.families`, `dictionary zone "${zone.name}" must declare at least one family`, zone.name));
1692
- if (zone.when?.minPageId !== void 0 && zone.when?.maxPageId !== void 0 && zone.when.minPageId > zone.when.maxPageId) issues.push(createIssue$1("invalid_zone_page_range", `${zonePath}.when`, `dictionary zone "${zone.name}" has minPageId greater than maxPageId`, zone.name));
1693
- const seenActivateAfterKeys = /* @__PURE__ */ new Set();
1694
- for (let gateIndex = 0; gateIndex < (zone.when?.activateAfter?.length ?? 0); gateIndex++) validateGate(zone.when.activateAfter[gateIndex], zone, gateIndex, seenActivateAfterKeys, issues);
1695
- for (let familyIndex = 0; familyIndex < zone.families.length; familyIndex++) validateFamily(zone.families[familyIndex], zone, familyIndex, issues);
1696
- for (let blockerIndex = 0; blockerIndex < (zone.blockers?.length ?? 0); blockerIndex++) validateBlocker(zone.blockers[blockerIndex], zone, blockerIndex, issues);
1751
+ /**
1752
+ * Handles validation when content is not found in the expected boundary window.
1753
+ * Fallback strategy: search entire document if segment matches existing content elsewhere.
1754
+ */
1755
+ const handleMissingBoundary = (segment, segmentIndex, joined, boundaries, pageMap) => {
1756
+ const matches = findJoinedMatches(segment.content, joined, 0, joined.length, 1);
1757
+ if (matches.length === 0) return [createIssue$1("content_not_found", segment, segmentIndex, { evidence: "Segment content not found in any page content." }, pageMap)];
1758
+ const match = matches[0];
1759
+ const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
1760
+ const actualToId = findBoundaryIdForOffset(match.end, boundaries);
1761
+ return [createIssue$1("page_attribution_mismatch", segment, segmentIndex, {
1762
+ actual: {
1763
+ from: segment.from,
1764
+ to: segment.to
1765
+ },
1766
+ evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
1767
+ expected: {
1768
+ from: actualFromId,
1769
+ to: actualToId
1770
+ },
1771
+ matchIndex: match.start
1772
+ }, pageMap)];
1697
1773
  };
1698
1774
  /**
1699
- * Validates a dictionary profile without normalizing it.
1775
+ * Performs a widened search when the direct check fails.
1776
+ * Includes a small buffer around the expected position, and optionally a full-document search for short segments.
1700
1777
  */
1701
- const validateDictionaryProfile = (profile) => {
1702
- const issues = [];
1703
- if (profile.version !== 2) issues.push(createIssue$1("invalid_version", "version", `dictionary profile version must be 2, got ${profile.version}`));
1704
- if (profile.zones.length === 0) {
1705
- issues.push(createIssue$1("missing_zones", "zones", `dictionary profile must contain at least one zone`));
1706
- return issues;
1778
+ const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions) => {
1779
+ const content = segment.content;
1780
+ const bufferSize = 1e3;
1781
+ const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
1782
+ if (rawMatches.length === 0) {
1783
+ const threshold = validationOptions?.fullSearchThreshold ?? 500;
1784
+ if (content.length < threshold) {
1785
+ const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
1786
+ const validMatch = fullMatches.find((m) => {
1787
+ return findBoundaryIdForOffset(m.start, boundaries) === segment.from;
1788
+ });
1789
+ if (validMatch) return checkMaxPagesViolation(segment, segmentIndex, maxPages, validMatch.end, expectedBoundary.end, boundaries);
1790
+ if (fullMatches.length > 0) {
1791
+ const match = fullMatches[0];
1792
+ const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
1793
+ const actualToId = findBoundaryIdForOffset(match.end, boundaries);
1794
+ return [createIssue$1("page_attribution_mismatch", segment, segmentIndex, {
1795
+ actual: {
1796
+ from: segment.from,
1797
+ to: segment.to
1798
+ },
1799
+ evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
1800
+ expected: {
1801
+ from: actualFromId,
1802
+ to: actualToId
1803
+ },
1804
+ matchIndex: match.start
1805
+ }, pageMap)];
1806
+ }
1807
+ }
1808
+ return [createIssue$1("content_not_found", segment, segmentIndex, {
1809
+ evidence: `Segment content (${content.length} chars) not found in expected window.`,
1810
+ hint: "Check page boundary attribution in segmenter.ts."
1811
+ }, pageMap)];
1707
1812
  }
1708
- const seenZoneNames = /* @__PURE__ */ new Set();
1709
- for (let zoneIndex = 0; zoneIndex < profile.zones.length; zoneIndex++) validateZone(profile.zones[zoneIndex], zoneIndex, seenZoneNames, issues);
1710
- return issues;
1813
+ const alignedMatches = rawMatches.filter((m) => m.start >= expectedBoundary.start && m.start <= expectedBoundary.end);
1814
+ if (alignedMatches.length > 0) {
1815
+ const primary = alignedMatches[0];
1816
+ return checkMaxPagesViolation(segment, segmentIndex, maxPages, primary.end, expectedBoundary.end, boundaries);
1817
+ }
1818
+ const primary = rawMatches[0];
1819
+ const actualFromId = findBoundaryIdForOffset(primary.start, boundaries);
1820
+ const actualToId = findBoundaryIdForOffset(primary.end, boundaries);
1821
+ return [createIssue$1("page_attribution_mismatch", segment, segmentIndex, {
1822
+ actual: {
1823
+ from: segment.from,
1824
+ to: segment.to
1825
+ },
1826
+ evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
1827
+ expected: {
1828
+ from: actualFromId,
1829
+ to: actualToId
1830
+ },
1831
+ matchIndex: primary.start
1832
+ }, pageMap)];
1711
1833
  };
1712
1834
  /**
1713
- * Normalizes and validates a dictionary profile before runtime matching.
1835
+ * Calculates the search range end index based on segment.to or strict bounds.
1714
1836
  */
1715
- const normalizeDictionaryProfile = (profile) => {
1716
- const cached = normalizedProfileCache.get(profile);
1717
- if (cached) return cached;
1718
- const issues = validateDictionaryProfile(profile);
1719
- if (issues.length > 0) throw new DictionaryProfileValidationError(issues);
1720
- const normalized = {
1721
- version: 2,
1722
- zones: profile.zones.map(normalizeZone)
1723
- };
1724
- normalizedProfileCache.set(profile, normalized);
1725
- return normalized;
1837
+ const getSearchRange = (segment, expectedBoundary, boundaryMap, joinedLength) => {
1838
+ let searchEnd = expectedBoundary.end + 1;
1839
+ if (segment.to !== void 0) {
1840
+ const endBoundary = boundaryMap.get(segment.to);
1841
+ if (endBoundary) searchEnd = endBoundary.end + 1;
1842
+ else searchEnd = Math.min(joinedLength, expectedBoundary.end + 5e4);
1843
+ }
1844
+ return searchEnd;
1726
1845
  };
1727
- //#endregion
1728
- //#region src/types/rules.ts
1729
1846
  /**
1730
- * Pattern type key names for split rules.
1731
- *
1732
- * Use this array to dynamically iterate over pattern types in UIs,
1733
- * or use the `PatternTypeKey` type for type-safe string unions.
1734
- *
1735
- * @example
1736
- * // Build a dropdown/select in UI
1737
- * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
1847
+ * Validates attribution for a single segment by searching for its content in the joined text.
1848
+ * Returns issues if content is missing, mis-attributed, or violates page limits.
1849
+ */
1850
+ const getAttributionIssues = (segment, segmentIndex, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions) => {
1851
+ if (!segment.content) return [createIssue$1("content_not_found", segment, segmentIndex, { evidence: "Segment content is empty." }, pageMap)];
1852
+ const expectedBoundary = boundaryMap.get(segment.from);
1853
+ if (!expectedBoundary) return handleMissingBoundary(segment, segmentIndex, joined, boundaries, pageMap);
1854
+ const searchEnd = getSearchRange(segment, expectedBoundary, boundaryMap, joined.length);
1855
+ const searchStart = expectedBoundary.start;
1856
+ const idx = joined.indexOf(segment.content, searchStart);
1857
+ if (idx !== -1 && idx < searchEnd) return checkMaxPagesViolation(segment, segmentIndex, maxPages, idx + segment.content.length - 1, expectedBoundary.end, boundaries);
1858
+ return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
1859
+ };
1860
+ /**
1861
+ * Performs purely static checks on the segment metadata (Ids and spans) before expensive content searching.
1862
+ */
1863
+ const checkStaticMaxPages = (segment, index, maxPages) => {
1864
+ if (maxPages === void 0 || segment.to === void 0) return null;
1865
+ if (maxPages === 0) {
1866
+ if (segment.to !== segment.from) return createIssue$1("max_pages_violation", segment, index, {
1867
+ evidence: "maxPages=0 requires all segments to stay within one page.",
1868
+ expected: {
1869
+ from: segment.from,
1870
+ to: segment.from
1871
+ },
1872
+ hint: "Check boundary detection in breakpoint-utils.ts."
1873
+ });
1874
+ return null;
1875
+ }
1876
+ const span = segment.to - segment.from;
1877
+ if (span > maxPages) return createIssue$1("max_pages_violation", segment, index, {
1878
+ evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
1879
+ expected: {
1880
+ from: segment.from,
1881
+ to: segment.from + maxPages
1882
+ },
1883
+ hint: "Check breakpoint windowing and page attribution in breakpoint-processor.ts."
1884
+ });
1885
+ return null;
1886
+ };
1887
+ /**
1888
+ * Validates a list of segments against the source pages.
1889
+ * checks for:
1890
+ * - Page existence (invalid IDs)
1891
+ * - Content fidelity (content must exist in pages)
1892
+ * - Page attribution (from/to must match content location)
1893
+ * - Page constraints (maxPages violations)
1738
1894
  *
1739
- * @example
1740
- * // Type-safe pattern key validation
1741
- * const validateKey = (k: string): k is PatternTypeKey =>
1742
- * (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
1895
+ * @param pages Input pages used for segmentation
1896
+ * @param options Operations used during segmentation (for preprocessing/joining consistency)
1897
+ * @param segments The output segments to validate
1898
+ * @param validationOptions Optional settings for validation behavior
1899
+ * @returns A detailed validation report
1743
1900
  */
1744
- const PATTERN_TYPE_KEYS = [
1745
- "lineStartsWith",
1746
- "lineStartsAfter",
1747
- "lineEndsWith",
1748
- "template",
1749
- "regex",
1750
- "dictionaryEntry"
1751
- ];
1901
+ const validateSegments = (pages, options, segments, validationOptions) => {
1902
+ const normalizedPages = normalizePages(pages, options);
1903
+ const { boundaries, joined } = buildJoinedContent(normalizedPages, options.pageJoiner === "newline" ? "\n" : " ");
1904
+ const boundaryMap = /* @__PURE__ */ new Map();
1905
+ const pageMap = /* @__PURE__ */ new Map();
1906
+ for (const b of boundaries) boundaryMap.set(b.id, b);
1907
+ for (const p of normalizedPages) pageMap.set(p.id, p);
1908
+ const pageIds = new Set(normalizedPages.map((p) => p.id));
1909
+ const maxPages = options.maxPages;
1910
+ const issues = [];
1911
+ for (let i = 0; i < segments.length; i++) {
1912
+ const segment = segments[i];
1913
+ if (!pageIds.has(segment.from)) {
1914
+ issues.push(createIssue$1("page_not_found", segment, i));
1915
+ continue;
1916
+ }
1917
+ if (segment.to !== void 0 && !pageIds.has(segment.to)) issues.push(createIssue$1("page_not_found", segment, i, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
1918
+ const staticMaxPageIssue = checkStaticMaxPages(segment, i, maxPages);
1919
+ if (staticMaxPageIssue) issues.push(staticMaxPageIssue);
1920
+ const attributionIssues = getAttributionIssues(segment, i, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions);
1921
+ issues.push(...attributionIssues);
1922
+ }
1923
+ const errors = issues.filter((issue) => issue.severity === "error").length;
1924
+ const warnings = issues.filter((issue) => issue.severity === "warn").length;
1925
+ return {
1926
+ issues,
1927
+ ok: issues.length === 0,
1928
+ summary: {
1929
+ errors,
1930
+ issues: issues.length,
1931
+ pageCount: pages.length,
1932
+ segmentCount: segments.length,
1933
+ warnings
1934
+ }
1935
+ };
1936
+ };
1752
1937
  //#endregion
1753
1938
  //#region src/segmentation/debug-meta.ts
1754
1939
  const resolveDebugConfig = (debug) => {
@@ -1843,7 +2028,14 @@ const getSegmentDebugReason = (segment, options) => {
1843
2028
  return getDebugReason(segment.meta, options);
1844
2029
  };
1845
2030
  //#endregion
1846
- //#region src/dictionary/runtime.ts
2031
+ //#region src/dictionary/constants.ts
2032
+ /**
2033
+ * Shared constants used by the dictionary runtime: phrase lists, regex patterns,
2034
+ * keyword sets, and structural-leak detection data.
2035
+ *
2036
+ * Keeping these here allows both runtime.ts and heading-classifier.ts to import
2037
+ * from a single source of truth without circular dependencies.
2038
+ */
1847
2039
  const INTRO_PHRASES = [
1848
2040
  "وقال",
1849
2041
  "قال",
@@ -1994,7 +2186,16 @@ const CONTINUATION_PREV_WORDS = [
1994
2186
  "ثم",
1995
2187
  "وجل"
1996
2188
  ];
1997
- const AUTHORITY_RE = /^(?:(?:و)?قال\s+(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\b|(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\s+\S+)/u;
2189
+ const NORMALIZED_AUTHORITY_INTRO_PATTERN = [
2190
+ "أبو",
2191
+ "ابن",
2192
+ "ثعلب",
2193
+ "الليث",
2194
+ "الأزهري",
2195
+ "الجوهري",
2196
+ "الفراء"
2197
+ ].map((term) => escapeRegex(normalizeArabicForComparison(term))).join("|");
2198
+ const NORMALIZED_AUTHORITY_RE = new RegExp(`^(?:(?:و)?قال\\s+(?:${NORMALIZED_AUTHORITY_INTRO_PATTERN})(?=$|[\\s:،؛,.])|(?:${NORMALIZED_AUTHORITY_INTRO_PATTERN})\\s+\\S+)`, "u");
1998
2199
  const AUTHORITY_HEAD_WORDS = [
1999
2200
  "الأزهري",
2000
2201
  "الأصمعي",
@@ -2015,13 +2216,22 @@ const AUTHORITY_HEAD_WORDS = [
2015
2216
  "ثعلب",
2016
2217
  "شمر"
2017
2218
  ];
2219
+ /** Aggressive-precision authority terms (subset used for fast startsWith checks). */
2220
+ const AUTHORITY_AGGRESSIVE_TERMS = [
2221
+ "الليث",
2222
+ "الأزهري",
2223
+ "الأصمعي",
2224
+ "الجوهري",
2225
+ "الفراء",
2226
+ "ثعلب",
2227
+ "شمر"
2228
+ ];
2018
2229
  const STRONG_SENTENCE_TERMINATORS$1 = /[.!?؟؛۔…]$/u;
2019
- const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
2020
- const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
2230
+ const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»""'''()[\]{}<>]+$/u;
2231
+ const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»""'''()[\]{}<>.,!?؟؛،:]+$/u;
2021
2232
  const ARABIC_WORD_REGEX$1 = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
2022
- const HEADING_PREFIX = "## ";
2023
- const CODE_LINE_PATTERN = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
2024
- const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN})$`, "u");
2233
+ const CODE_LINE_PATTERN$1 = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
2234
+ const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1})$`, "u");
2025
2235
  const STATUS_TAIL_PATTERN = "(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)";
2026
2236
  const GATE_TOKEN_MAP = {
2027
2237
  bab: "باب",
@@ -2029,18 +2239,38 @@ const GATE_TOKEN_MAP = {
2029
2239
  kitab: "كتاب"
2030
2240
  };
2031
2241
  const GATE_DELIMITER_RE = /[\s:،؛()[\]{}\-–—]/u;
2032
- const assertNever$1 = (value) => {
2033
- throw new Error(`Unhandled dictionary runtime variant: ${JSON.stringify(value)}`);
2034
- };
2035
- const lineEntryRegexCache = /* @__PURE__ */ new WeakMap();
2036
- const inlineSubentryRegexCache = /* @__PURE__ */ new WeakMap();
2037
- const pairedFormsRegexCache = /* @__PURE__ */ new WeakMap();
2242
+ const normalizeStopLemmaWord = (text) => normalizeArabicForComparison(text).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
2243
+ /** Pre-normalized intro phrases for startsWith / endsWith checks. */
2244
+ const NORMALIZED_INTRO_PHRASES = INTRO_PHRASES.map(normalizeArabicForComparison);
2245
+ /** Pre-normalized intro tail phrases for endsWith checks. */
2246
+ const NORMALIZED_INTRO_TAIL_PHRASES = INTRO_TAIL_PHRASES.map(normalizeArabicForComparison);
2247
+ /** Pre-normalized authority head words as a Set for O(1) lookup. */
2248
+ const NORMALIZED_AUTHORITY_HEAD_WORDS_SET = new Set(AUTHORITY_HEAD_WORDS.map(normalizeStopLemmaWord));
2249
+ /** Pre-normalized aggressive authority terms for startsWith checks. */
2250
+ const NORMALIZED_AUTHORITY_AGGRESSIVE_TERMS = AUTHORITY_AGGRESSIVE_TERMS.map(normalizeArabicForComparison);
2251
+ /** Pre-normalized qualifier tail prefixes for startsWith checks. */
2252
+ const NORMALIZED_QUALIFIER_TAIL_PREFIXES = QUALIFIER_TAIL_PREFIXES.map(normalizeArabicForComparison);
2253
+ /** Pre-normalized structural lemma prefixes for startsWith checks. */
2254
+ const NORMALIZED_STRUCTURAL_LEMMA_PREFIXES = STRUCTURAL_LEMMA_PREFIXES.map(normalizeArabicForComparison);
2255
+ /** Pre-normalized structural line keywords for includes checks. */
2256
+ const NORMALIZED_STRUCTURAL_LINE_KEYWORDS = STRUCTURAL_LINE_KEYWORDS.map(normalizeArabicForComparison);
2257
+ /** Pre-normalized continuation prev words as a Set for O(1) lookup. */
2258
+ const NORMALIZED_CONTINUATION_PREV_WORDS_SET = new Set(CONTINUATION_PREV_WORDS.map(normalizeArabicForComparison));
2259
+ /** Pre-normalized 'ولل' prefix. */
2260
+ const NORMALIZED_WLAL_PREFIX = normalizeArabicForComparison("ولل");
2261
+ //#endregion
2262
+ //#region src/dictionary/dictionary-blockers.ts
2263
+ /**
2264
+ * Limit backwards scans to a small suffix; dictionary blockers only need the
2265
+ * immediate local context rather than an unbounded full-page search.
2266
+ */
2267
+ const LAST_ARABIC_WORD_LOOKBACK_CHARS = 256;
2268
+ const MAX_INTRO_CONTEXT_CHARS = 240;
2269
+ const IGNORABLE_BOUNDARY_CHAR_RE = /(?:\s|\u200B|\u200C|\u200D|\u200E|\u200F|\u061C)/u;
2038
2270
  const trimTrailingPageWrapNoise$1 = (text) => text.trimEnd().replace(TRAILING_PAGE_WRAP_NOISE$1, "");
2039
- const endsWithStrongSentenceTerminator$1 = (pageContent) => {
2040
- return STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
2041
- };
2271
+ const endsWithStrongSentenceTerminator$1 = (pageContent) => STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
2042
2272
  const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
2043
- const windowStart = Math.max(0, endExclusive - 256);
2273
+ const windowStart = Math.max(0, endExclusive - LAST_ARABIC_WORD_LOOKBACK_CHARS);
2044
2274
  const withoutTrailingDelimiters = trimTrailingPageWrapNoise$1(text.slice(windowStart, endExclusive)).replace(TRAILING_WORD_DELIMITERS$1, "");
2045
2275
  let lastMatch = "";
2046
2276
  ARABIC_WORD_REGEX$1.lastIndex = 0;
@@ -2050,105 +2280,339 @@ const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
2050
2280
  const previousNonWhitespaceChar = (text, endExclusive = text.length) => {
2051
2281
  for (let index = endExclusive - 1; index >= 0; index--) {
2052
2282
  const char = text[index];
2053
- if (char && !/\s/u.test(char)) return char;
2283
+ if (char && !IGNORABLE_BOUNDARY_CHAR_RE.test(char)) return char;
2054
2284
  }
2055
2285
  return "";
2056
2286
  };
2057
- const normalizedEquals = (left, right) => normalizeArabicForComparison(left) === normalizeArabicForComparison(right);
2058
- const normalizedStartsWith = (text, prefix) => normalizeArabicForComparison(text).startsWith(normalizeArabicForComparison(prefix));
2059
- const normalizeStopLemma = (text) => normalizeArabicForComparison(text).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
2060
- const getTrailingContext = (text, endExclusive, maxChars = 240) => text.slice(Math.max(0, endExclusive - maxChars), endExclusive);
2061
- const isDelimitedPrefixMatch = (text, prefix) => {
2287
+ const isAtPageStart = (text, endExclusive) => {
2288
+ for (let index = endExclusive - 1; index >= 0; index--) {
2289
+ const char = text[index];
2290
+ if (char && !IGNORABLE_BOUNDARY_CHAR_RE.test(char)) return false;
2291
+ }
2292
+ return true;
2293
+ };
2294
+ const normalizeStopLemma = normalizeStopLemmaWord;
2295
+ const getTrailingContext = (text, endExclusive, maxChars = MAX_INTRO_CONTEXT_CHARS) => text.slice(Math.max(0, endExclusive - maxChars), endExclusive);
2296
+ const normalizeIntroContextText = (text) => normalizeArabicForComparison(text).replace(/[/\\]+/gu, " ").replace(/[«»""'''()[\]{}]+/gu, " ").replace(/\s+/gu, " ").trim();
2297
+ const normalizeForIntroTailCheck = (text) => normalizeIntroContextText(text).replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
2298
+ const isIntroCandidate = (text) => {
2299
+ const normalized = normalizeIntroContextText(text);
2300
+ return NORMALIZED_INTRO_PHRASES.some((phrase) => normalized.startsWith(phrase));
2301
+ };
2302
+ const endsWithIntroContext = (text) => {
2303
+ const trimmed = text.trimEnd();
2304
+ if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
2305
+ const normalized = normalizeForIntroTailCheck(trimmed);
2306
+ if (!normalized) return false;
2307
+ if (NORMALIZED_INTRO_PHRASES.some((phrase) => normalized.endsWith(phrase))) return true;
2308
+ if (NORMALIZED_INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(phrase))) return true;
2309
+ return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
2310
+ };
2311
+ const isAuthorityCandidate = (text, precision) => {
2312
+ const head = normalizeStopLemma(text.split(":", 1)[0] ?? text);
2313
+ if (head && NORMALIZED_AUTHORITY_HEAD_WORDS_SET.has(head)) return true;
2314
+ const normalized = normalizeIntroContextText(text);
2315
+ if (NORMALIZED_AUTHORITY_RE.test(normalized)) return true;
2316
+ if (precision === "aggressive") return NORMALIZED_AUTHORITY_AGGRESSIVE_TERMS.some((term) => normalized.startsWith(term));
2317
+ return false;
2318
+ };
2319
+ const hasBlockedQualifierTail = (lemma) => {
2320
+ const parts = lemma.split(/[،,]/u).map((part) => part.trim()).filter(Boolean);
2321
+ if (parts.length < 2) return false;
2322
+ const tail = normalizeArabicForComparison(parts.slice(1).join(" "));
2323
+ return NORMALIZED_QUALIFIER_TAIL_PREFIXES.some((prefix) => tail.startsWith(prefix));
2324
+ };
2325
+ const looksLikeStructuralLeak = (candidate) => {
2326
+ if (!candidate.lemma) return false;
2327
+ const normalizedLemma = normalizeArabicForComparison(candidate.lemma);
2328
+ if (candidate.kind === "entry" && (/^[^\p{Script=Arabic}\d]+/u.test(candidate.lemma) || candidate.lemma.includes("{") || candidate.lemma.includes("}") || candidate.lemma.includes("##"))) return true;
2329
+ if (candidate.kind === "entry" && BARE_CODE_LEMMA_RE.test(candidate.lemma) && (candidate.text === candidate.lemma || candidate.text === `## ${candidate.lemma}` || candidate.text.startsWith(`## ${candidate.lemma}`) || candidate.text.startsWith(`${candidate.lemma}\n## `))) return true;
2330
+ if (candidate.family !== "pairedForms" && candidate.lemma.split(/\s+/u).filter(Boolean).length > 4) return true;
2331
+ if (NORMALIZED_STRUCTURAL_LEMMA_PREFIXES.some((prefix) => normalizedLemma.startsWith(prefix))) return true;
2332
+ if (normalizedLemma.startsWith(NORMALIZED_WLAL_PREFIX)) return true;
2333
+ const structuralText = candidate.text.startsWith("## ") ? candidate.text.slice(3).trim() : candidate.text;
2334
+ if (/^[\d\u0660-\u0669]+\s*-\s*\([^)]+\)(?:\s+##.*)?$/u.test(structuralText)) return true;
2335
+ const normalizedText = normalizeArabicForComparison(structuralText);
2336
+ if (STRUCTURAL_LINE_PATTERNS.some((pattern) => pattern.test(structuralText))) return NORMALIZED_STRUCTURAL_LINE_KEYWORDS.some((keyword) => normalizedText.includes(keyword));
2337
+ return false;
2338
+ };
2339
+ const blockerApplies = (blocker, family) => !blocker.appliesTo || blocker.appliesTo.includes(family);
2340
+ const rejectsViaIntroBlocker = (candidate, blocker, localBeforeCandidate) => {
2341
+ if (blocker.use !== "intro") return false;
2342
+ return isIntroCandidate(candidate.probeText) || endsWithIntroContext(localBeforeCandidate);
2343
+ };
2344
+ const rejectsViaAuthorityBlocker = (candidate, blocker) => blocker.use === "authorityIntro" && isAuthorityCandidate(candidate.probeText, blocker.precision);
2345
+ const rejectsViaStopLemmaBlocker = (candidate, blocker) => {
2346
+ if (blocker.use !== "stopLemma" || !candidate.lemma) return false;
2347
+ const normalizedLemma = normalizeStopLemma(candidate.lemma);
2348
+ return !!normalizedLemma && blocker.normalizedWords.has(normalizedLemma);
2349
+ };
2350
+ const previousWordIsBlocked = (blocker, word) => !!word && blocker.normalizedWords.has(normalizeArabicForComparison(word));
2351
+ const rejectsViaPageStartPreviousWord = (blocker, pageIndex, pages) => {
2352
+ if (pageIndex === 0) return false;
2353
+ const previousPage = pages[pageIndex - 1];
2354
+ if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
2355
+ return previousWordIsBlocked(blocker, extractLastArabicWord$1(previousPage.content));
2356
+ };
2357
+ const rejectsViaPreviousWordBlocker = (pageContent, localIndex, blocker, pageIndex, pages) => {
2358
+ if (blocker.use !== "previousWord") return false;
2359
+ if (isAtPageStart(pageContent, localIndex)) {
2360
+ if (blocker.scope === "pageStart") return rejectsViaPageStartPreviousWord(blocker, pageIndex, pages);
2361
+ if (blocker.scope === "any" && rejectsViaPageStartPreviousWord(blocker, pageIndex, pages)) return true;
2362
+ }
2363
+ if (blocker.scope === "pageStart") return false;
2364
+ return previousWordIsBlocked(blocker, extractLastArabicWord$1(pageContent, localIndex));
2365
+ };
2366
+ const rejectsViaPreviousCharBlocker = (pageContent, localIndex, blocker) => {
2367
+ if (blocker.use !== "previousChar") return false;
2368
+ const previousChar = previousNonWhitespaceChar(pageContent, localIndex);
2369
+ return !!previousChar && blocker.charSet.has(previousChar);
2370
+ };
2371
+ const rejectsViaPageContinuationBlocker = (candidate, blocker, pageContent, pageIndex, pages) => {
2372
+ if (blocker.use !== "pageContinuation") return false;
2373
+ if (!isAtPageStart(pageContent, candidate.localIndex) || pageIndex === 0) return false;
2374
+ const previousPage = pages[pageIndex - 1];
2375
+ if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
2376
+ const previousWord = extractLastArabicWord$1(previousPage.content);
2377
+ return !!previousWord && NORMALIZED_CONTINUATION_PREV_WORDS_SET.has(normalizeArabicForComparison(previousWord)) || endsWithIntroContext(previousPage.content) || isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, blocker.authorityPrecision);
2378
+ };
2379
+ const getBlockerRejectionReason = (blocker, candidate, localBeforeCandidate, pageContent, pageIndex, pages) => {
2380
+ if (rejectsViaIntroBlocker(candidate, blocker, localBeforeCandidate)) return "intro";
2381
+ if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
2382
+ if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
2383
+ if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker, pageIndex, pages)) return "previousWord";
2384
+ if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
2385
+ if (rejectsViaPageContinuationBlocker(candidate, blocker, pageContent, pageIndex, pages)) return "pageContinuation";
2386
+ return null;
2387
+ };
2388
+ /**
2389
+ * Evaluates candidate rejection in two phases:
2390
+ *
2391
+ * Phase 1: global safety checks (not configurable per profile)
2392
+ * - `qualifierTail`: rejects comma-tail qualifier fragments such as "أي" and "قال"
2393
+ * - `structuralLeak`: rejects markdown artifacts, structural headings, and other non-lexeme leaks
2394
+ *
2395
+ * These are hard safety invariants for the Shamela-style dictionary surface,
2396
+ * so diagnostics report them alongside configurable blocker reasons.
2397
+ *
2398
+ * Phase 2: zone blockers (configurable per zone)
2399
+ * - iterates `zone.blockers` in declaration order
2400
+ * - returns the first matching rejection reason
2401
+ */
2402
+ const getCandidateRejection = (candidate, zone, pageContext, pages) => {
2403
+ const hasQualifierTail = hasBlockedQualifierTail(candidate.lemma ?? "");
2404
+ if (hasQualifierTail || looksLikeStructuralLeak(candidate)) return { reason: hasQualifierTail ? "qualifierTail" : "structuralLeak" };
2405
+ const localBeforeCandidate = getTrailingContext(pageContext.content, candidate.localIndex);
2406
+ for (const blocker of zone.blockers) {
2407
+ if (!blockerApplies(blocker, candidate.family)) continue;
2408
+ const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
2409
+ if (reason) return { reason };
2410
+ }
2411
+ return null;
2412
+ };
2413
+ /**
2414
+ * Returns `true` when the candidate should be dropped (i.e. any rejection
2415
+ * reason exists). Convenience wrapper over `getCandidateRejection`.
2416
+ */
2417
+ const shouldRejectCandidate = (candidate, zone, pageContext, pages) => getCandidateRejection(candidate, zone, pageContext, pages) !== null;
2418
+ //#endregion
2419
+ //#region src/dictionary/heading-classifier.ts
2420
+ const HEADING_PREFIX = "## ";
2421
+ const CODE_LINE_PATTERN = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
2422
+ const ARABIC_WORD_PATTERN = ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN;
2423
+ const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
2424
+ const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
2425
+ const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN})(?:[)\\]])?$`, "u");
2426
+ const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
2427
+ const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
2428
+ const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
2429
+ const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
2430
+ const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
2431
+ const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
2432
+ const COLON_NOISE_RE = /^.+:\s*.+$/u;
2433
+ const CHAPTER_TERMS = [
2434
+ "باب",
2435
+ "فصل",
2436
+ "كتاب",
2437
+ "حرف",
2438
+ "أبواب"
2439
+ ];
2440
+ const MARKER_PREFIXES = [
2441
+ "بسم الله",
2442
+ "توكلت على الله",
2443
+ "آخر كتاب",
2444
+ "ويتلوه"
2445
+ ];
2446
+ const NOISE_TOKENS = [
2447
+ "قال",
2448
+ "وقيل",
2449
+ "ويقال",
2450
+ "وفي",
2451
+ "يعني",
2452
+ "فإذا"
2453
+ ];
2454
+ const NORMALIZED_CHAPTER_TERMS = CHAPTER_TERMS.map(normalizeArabicForComparison);
2455
+ const NORMALIZED_MARKER_PREFIXES = MARKER_PREFIXES.map(normalizeArabicForComparison);
2456
+ const NORMALIZED_NOISE_TOKENS = NOISE_TOKENS.map(normalizeArabicForComparison);
2457
+ const emptyCounts = () => ({
2458
+ chapter: 0,
2459
+ cluster: 0,
2460
+ codeLine: 0,
2461
+ entry: 0,
2462
+ inlineSubentry: 0,
2463
+ lineEntry: 0,
2464
+ marker: 0,
2465
+ noise: 0,
2466
+ pairedForms: 0
2467
+ });
2468
+ const extractWrappedLemma = (lemma) => lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim();
2469
+ const stripLeadingWrappers = (text) => text.replace(/^[[{(]+\s*/u, "").trim();
2470
+ const isDelimitedPrefixMatch$1 = (text, prefix) => {
2062
2471
  if (text === prefix) return true;
2063
2472
  if (!text.startsWith(prefix)) return false;
2064
2473
  const nextChar = text[prefix.length];
2065
- return nextChar === void 0 || GATE_DELIMITER_RE.test(nextChar);
2474
+ return nextChar === void 0 || /[\s:،؛()[\]{}\-–—]/u.test(nextChar);
2475
+ };
2476
+ const isCodeHeading = (text) => {
2477
+ if (CODE_LINE_RE.test(text)) return true;
2478
+ const words = text.trim().split(/\s+/u).filter(Boolean);
2479
+ return words.length === 1 && (words[0]?.length ?? 0) === 1;
2480
+ };
2481
+ const looksLikeNoiseHeading = (text, normalizedText) => {
2482
+ const wordCount = text.trim().split(/\s+/u).filter(Boolean).length;
2483
+ if (/(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\s])/u.test(text)) return false;
2484
+ if (wordCount >= 8 && COLON_NOISE_RE.test(text)) return true;
2485
+ return NORMALIZED_NOISE_TOKENS.some((token) => normalizedText.includes(token)) && wordCount >= 4;
2486
+ };
2487
+ /**
2488
+ * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
2489
+ */
2490
+ const classifyDictionaryHeading = (line) => {
2491
+ const text = line.startsWith(HEADING_PREFIX) ? line.slice(3).trim() : line.trim();
2492
+ const unwrapped = stripLeadingWrappers(text);
2493
+ const normalizedText = normalizeArabicForComparison(text);
2494
+ const normalizedUnwrapped = normalizeArabicForComparison(unwrapped);
2495
+ if (!text) return "noise";
2496
+ if (CHAPTER_HEADING_RE.test(text) || NORMALIZED_CHAPTER_TERMS.some((term) => isDelimitedPrefixMatch$1(normalizedUnwrapped, term))) return "chapter";
2497
+ if (looksLikeNoiseHeading(text, normalizedText)) return "noise";
2498
+ if (isCodeHeading(text)) return "marker";
2499
+ if (NORMALIZED_MARKER_PREFIXES.some((token) => normalizedUnwrapped.startsWith(token))) return "marker";
2500
+ if (STATUS_HEADING_RE.test(text) || CODE_NOTE_HEADING_RE.test(text)) return "marker";
2501
+ if (CLUSTER_HEADING_RE.test(text)) return "cluster";
2502
+ return "entry";
2503
+ };
2504
+ const createHeadingMatch = (kind, page, rawLine, lineNumber) => ({
2505
+ kind,
2506
+ lemma: kind === "entry" ? rawLine.slice(3).trim() : void 0,
2507
+ line: lineNumber,
2508
+ pageId: page.id,
2509
+ text: rawLine
2510
+ });
2511
+ const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => ({
2512
+ kind,
2513
+ lemma,
2514
+ line: lineNumber,
2515
+ pageId: page.id,
2516
+ text
2517
+ });
2518
+ const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
2519
+ if (!rawLine.startsWith(HEADING_PREFIX)) return false;
2520
+ const kind = classifyDictionaryHeading(rawLine);
2521
+ matches.push(createHeadingMatch(kind, page, rawLine, lineNumber));
2522
+ return true;
2523
+ };
2524
+ const scanLineEntry = (page, rawLine, lineNumber, matches) => {
2525
+ const lineEntry = rawLine.match(PLAIN_ENTRY_RE);
2526
+ if (!lineEntry?.groups?.lemma) return;
2527
+ matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lineEntry.groups.lemma)));
2528
+ };
2529
+ const scanPairedForms = (page, rawLine, lineNumber, matches) => {
2530
+ const pairedForms = rawLine.match(PAIRED_FORMS_RE);
2531
+ if (!pairedForms?.groups?.forms) return;
2532
+ matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, pairedForms.groups.forms));
2066
2533
  };
2067
- const createPageContexts = (pages, pageMap, normalizedPages) => {
2068
- if (normalizedPages && normalizedPages.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} normalized pages, received ${normalizedPages.length}`);
2069
- if (pageMap.boundaries.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} page boundaries, received ${pageMap.boundaries.length}`);
2070
- const contexts = [];
2071
- for (let index = 0; index < pages.length; index++) {
2072
- const page = pages[index];
2073
- const boundary = pageMap.boundaries[index];
2074
- if (!page || !boundary) throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
2075
- const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
2076
- contexts.push({
2077
- boundary,
2078
- content,
2079
- index,
2080
- lines: buildPageLines(content),
2081
- page
2082
- });
2083
- }
2084
- return contexts;
2534
+ const scanCodeLine = (page, rawLine, lineNumber, matches) => {
2535
+ const codeLine = rawLine.match(CODE_LINE_RE);
2536
+ if (!codeLine?.groups?.codes) return;
2537
+ matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codeLine.groups.codes));
2085
2538
  };
2086
- const normalizeIntroContextText = (text) => normalizeArabicForComparison(text).replace(/[\\/]+/gu, " ").replace(/[«»"“”'‘’()[\]{}]+/gu, " ").replace(/\s+/gu, " ").trim();
2087
- const startsWithConfiguredWord = (words, candidate) => words.some((word) => normalizedStartsWith(candidate, word));
2088
- const buildPageLines = (content) => {
2089
- const parts = content.split("\n");
2090
- const lines = [];
2091
- let offset = 0;
2092
- for (let index = 0; index < parts.length; index++) {
2093
- const text = parts[index] ?? "";
2094
- lines.push({
2095
- lineNumber: index + 1,
2096
- start: offset,
2097
- text
2098
- });
2099
- offset += text.length + 1;
2539
+ const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
2540
+ for (const match of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
2541
+ if (!match.groups?.lemma) continue;
2542
+ matches.push(createSurfaceMatch("inlineSubentry", page, match.groups.lemma, lineNumber, match.groups.lemma));
2100
2543
  }
2101
- return lines;
2102
2544
  };
2103
- const headingMatchesGate = (headingText, gate) => {
2104
- if (gate.use === "headingText") {
2105
- const useFuzzy = gate.fuzzy ?? false;
2106
- const source = useFuzzy ? normalizeArabicForComparison(headingText) : headingText.trim();
2107
- const match = useFuzzy ? normalizeArabicForComparison(gate.match) : gate.match.trim();
2108
- return !!match && isDelimitedPrefixMatch(source, match);
2545
+ /**
2546
+ * Extracts dictionary surface matches from a markdown page.
2547
+ */
2548
+ const scanDictionaryMarkdownPage = (page) => {
2549
+ const lines = page.content.split(/\n/u);
2550
+ const matches = [];
2551
+ for (let index = 0; index < lines.length; index++) {
2552
+ const rawLine = lines[index]?.trim() ?? "";
2553
+ if (!rawLine) continue;
2554
+ if (scanHeadingLine(page, rawLine, index + 1, matches)) continue;
2555
+ scanLineEntry(page, rawLine, index + 1, matches);
2556
+ scanPairedForms(page, rawLine, index + 1, matches);
2557
+ scanCodeLine(page, rawLine, index + 1, matches);
2558
+ scanInlineSubentries(page, rawLine, index + 1, matches);
2109
2559
  }
2110
- return normalizedStartsWith(headingText, GATE_TOKEN_MAP[gate.token]);
2111
- };
2112
- const pageMatchesAnyGate = (page, gates) => page.lines.some((line) => {
2113
- const trimmed = line.text.trim();
2114
- if (!trimmed.startsWith(HEADING_PREFIX)) return false;
2115
- const headingText = trimmed.replace(/^##\s+/u, "").trim();
2116
- return gates.some((gate) => headingMatchesGate(headingText, gate));
2117
- });
2118
- const pageWithinZoneBounds = (zone, pageId) => {
2119
- if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
2120
- if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
2121
- return true;
2560
+ return matches;
2122
2561
  };
2123
- const findActivationPageId = (zone, pages) => {
2562
+ /**
2563
+ * Aggregates dictionary surface counts across markdown pages.
2564
+ */
2565
+ const analyzeDictionaryMarkdownPages = (pages) => {
2566
+ const counts = emptyCounts();
2567
+ const matches = [];
2124
2568
  for (const page of pages) {
2125
- if (!pageWithinZoneBounds(zone, page.page.id)) continue;
2126
- if (pageMatchesAnyGate(page, zone.when?.activateAfter ?? [])) return page.page.id;
2127
- }
2128
- return null;
2129
- };
2130
- const createZoneActivationMap = (profile, pages) => {
2131
- const activation = /* @__PURE__ */ new Map();
2132
- for (const zone of profile.zones) {
2133
- if (!zone.when?.activateAfter?.length) {
2134
- activation.set(zone.name, null);
2135
- continue;
2569
+ const pageMatches = scanDictionaryMarkdownPage(page);
2570
+ for (const match of pageMatches) {
2571
+ counts[match.kind] += 1;
2572
+ matches.push(match);
2136
2573
  }
2137
- activation.set(zone.name, findActivationPageId(zone, pages));
2138
2574
  }
2139
- return activation;
2575
+ return {
2576
+ counts,
2577
+ matches
2578
+ };
2140
2579
  };
2141
- const pageMatchesZone = (zone, activationMap, pageId) => {
2142
- if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
2143
- if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
2144
- if (!zone.when?.activateAfter?.length) return true;
2145
- const activatedAt = activationMap.get(zone.name);
2146
- return activatedAt !== null && activatedAt !== void 0 && pageId >= activatedAt;
2580
+ //#endregion
2581
+ //#region src/dictionary/dictionary-candidates.ts
2582
+ const lineEntryRegexCache = /* @__PURE__ */ new WeakMap();
2583
+ const inlineSubentryRegexCache = /* @__PURE__ */ new WeakMap();
2584
+ const pairedFormsRegexCache = /* @__PURE__ */ new WeakMap();
2585
+ const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
2586
+ const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN$1}$`, "u");
2587
+ const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
2588
+ const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
2589
+ const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
2590
+ const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
2591
+ const createLineEntryRegex = (family) => {
2592
+ const cached = lineEntryRegexCache.get(family);
2593
+ if (cached) return cached;
2594
+ const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
2595
+ const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
2596
+ const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
2597
+ lineEntryRegexCache.set(family, regex);
2598
+ return regex;
2147
2599
  };
2148
- const resolveActiveZone = (profile, activationMap, pageId) => {
2149
- let activeZone = null;
2150
- for (const zone of profile.zones) if (pageMatchesZone(zone, activationMap, pageId)) activeZone = zone;
2151
- return activeZone;
2600
+ const parseWrappedCode = (text) => {
2601
+ const paired = text.match(/^(?<open>[[(])(?<inner>.+)(?<close>[)\]])$/u);
2602
+ if (!paired?.groups?.inner || !paired.groups.open || !paired.groups.close) return null;
2603
+ return {
2604
+ close: paired.groups.close,
2605
+ inner: paired.groups.inner.trim(),
2606
+ open: paired.groups.open,
2607
+ paired: paired.groups.open === "(" && paired.groups.close === ")" || paired.groups.open === "[" && paired.groups.close === "]"
2608
+ };
2609
+ };
2610
+ const collectHeadingCandidates = (pageStartOffset, line, nextLine, family, trimmed) => {
2611
+ if (!trimmed.startsWith("## ")) return [];
2612
+ const headingClass = classifyDictionaryHeading(trimmed);
2613
+ if (headingClass === "noise") return [];
2614
+ const candidate = createHeadingCandidate(pageStartOffset, line, nextLine, family, headingClass);
2615
+ return candidate ? [candidate] : [];
2152
2616
  };
2153
2617
  const createHeadingCandidate = (pageStartOffset, line, nextLine, family, headingClass) => {
2154
2618
  if (!family.classes.includes(headingClass)) return null;
@@ -2168,19 +2632,6 @@ const createHeadingCandidate = (pageStartOffset, line, nextLine, family, heading
2168
2632
  text: line.text.trim()
2169
2633
  };
2170
2634
  };
2171
- const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
2172
- const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
2173
- const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
2174
- const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
2175
- const createLineEntryRegex = (family) => {
2176
- const cached = lineEntryRegexCache.get(family);
2177
- if (cached) return cached;
2178
- const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
2179
- const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
2180
- const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
2181
- lineEntryRegexCache.set(family, regex);
2182
- return regex;
2183
- };
2184
2635
  const collectLineEntryCandidates = (pageStartOffset, line, family) => {
2185
2636
  const trimmed = line.text.trim();
2186
2637
  if (STATUS_LINE_RE.test(trimmed)) return [];
@@ -2198,17 +2649,22 @@ const collectLineEntryCandidates = (pageStartOffset, line, family) => {
2198
2649
  }];
2199
2650
  };
2200
2651
  const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
2201
- const cached = inlineSubentryRegexCache.get(family);
2202
- const prefixes = family.prefixes.length > 0 ? family.prefixes.map(escapeRegex).join("|") : escapeRegex("و");
2203
- const regex = cached ?? new RegExp(`(^|[\\s،؛,:.])(?<lemma>(?:${prefixes})${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})\\s*:`, "gu");
2204
- if (!cached) inlineSubentryRegexCache.set(family, regex);
2652
+ let cached = inlineSubentryRegexCache.get(family);
2653
+ if (!cached) {
2654
+ const prefixes = family.prefixes.length > 0 ? family.prefixes.map(escapeRegex).join("|") : escapeRegex("و");
2655
+ cached = {
2656
+ matchRegex: new RegExp(`(^|[\\s،؛,:.])(?<lemma>(?:${prefixes})${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})\\s*:`, "gu"),
2657
+ stripPrefixRegex: new RegExp(`^(?:${prefixes})`, "u")
2658
+ };
2659
+ inlineSubentryRegexCache.set(family, cached);
2660
+ }
2205
2661
  const candidates = [];
2206
- for (const match of line.text.matchAll(regex)) {
2662
+ for (const match of line.text.matchAll(cached.matchRegex)) {
2207
2663
  if (!match.groups?.lemma || match.index === void 0) continue;
2208
2664
  const lemmaIndex = match[0].indexOf(match.groups.lemma);
2209
2665
  if (lemmaIndex < 0) continue;
2210
2666
  const candidateStart = match.index + lemmaIndex;
2211
- const lemma = family.stripPrefixesFromLemma ? match.groups.lemma.replace(new RegExp(`^(?:${prefixes})`, "u"), "") : match.groups.lemma;
2667
+ const lemma = family.stripPrefixesFromLemma ? match.groups.lemma.replace(cached.stripPrefixRegex, "") : match.groups.lemma;
2212
2668
  candidates.push({
2213
2669
  absoluteIndex: pageStartOffset + line.start + candidateStart,
2214
2670
  family: "inlineSubentry",
@@ -2222,18 +2678,6 @@ const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
2222
2678
  }
2223
2679
  return candidates;
2224
2680
  };
2225
- const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN}$`, "u");
2226
- const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
2227
- const parseWrappedCode = (text) => {
2228
- const paired = text.match(/^(?<open>[[(])(?<inner>.+)(?<close>[\])])$/u);
2229
- if (!paired?.groups?.inner || !paired.groups.open || !paired.groups.close) return null;
2230
- return {
2231
- close: paired.groups.close,
2232
- inner: paired.groups.inner.trim(),
2233
- open: paired.groups.open,
2234
- paired: paired.groups.open === "(" && paired.groups.close === ")" || paired.groups.open === "[" && paired.groups.close === "]"
2235
- };
2236
- };
2237
2681
  const collectCodeLineCandidates = (pageStartOffset, line, family) => {
2238
2682
  const trimmed = line.text.trim();
2239
2683
  const bare = trimmed.replace(STATUS_SUFFIX_RE, "").trim();
@@ -2271,255 +2715,470 @@ const collectPairedFormsCandidates = (pageStartOffset, line, family) => {
2271
2715
  text: line.text.trim()
2272
2716
  }];
2273
2717
  };
2274
- const blockerApplies = (blocker, family) => !blocker.appliesTo || blocker.appliesTo.includes(family);
2275
- const isIntroCandidate = (text) => {
2276
- const normalized = normalizeIntroContextText(text);
2277
- return INTRO_PHRASES.some((phrase) => normalized.startsWith(normalizeArabicForComparison(phrase)));
2718
+ const assertNever$1 = (value) => {
2719
+ throw new Error(`Unhandled dictionary candidate family: ${JSON.stringify(value)}`);
2278
2720
  };
2279
- const endsWithIntroPhrase = (text) => {
2280
- const trimmed = text.trimEnd();
2281
- if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
2282
- const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
2283
- return INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)));
2721
+ const collectCandidatesForFamily = (pageStartOffset, line, nextLine, family, trimmed) => {
2722
+ switch (family.use) {
2723
+ case "heading": return collectHeadingCandidates(pageStartOffset, line, nextLine, family, trimmed);
2724
+ case "lineEntry": return collectLineEntryCandidates(pageStartOffset, line, family);
2725
+ case "inlineSubentry": return collectInlineSubentryCandidates(pageStartOffset, line, family);
2726
+ case "codeLine": return collectCodeLineCandidates(pageStartOffset, line, family);
2727
+ case "pairedForms": return collectPairedFormsCandidates(pageStartOffset, line, family);
2728
+ default: return assertNever$1(family);
2729
+ }
2730
+ };
2731
+ const familyMayMatchLine = (family, trimmed) => {
2732
+ switch (family.use) {
2733
+ case "heading": return trimmed.startsWith("## ");
2734
+ case "lineEntry":
2735
+ case "inlineSubentry":
2736
+ case "pairedForms": return trimmed.includes(":");
2737
+ case "codeLine": return /^(?:[[(])?\p{Script=Arabic}/u.test(trimmed);
2738
+ default: return assertNever$1(family);
2739
+ }
2740
+ };
2741
+ /**
2742
+ * Collects all family candidates for a single dictionary line within a zone.
2743
+ */
2744
+ const collectCandidatesForLine = (pageStartOffset, line, nextLine, zone) => {
2745
+ const trimmed = line.text.trim();
2746
+ if (!trimmed) return [];
2747
+ const candidates = [];
2748
+ for (const family of zone.families) {
2749
+ if (!familyMayMatchLine(family, trimmed)) continue;
2750
+ candidates.push(...collectCandidatesForFamily(pageStartOffset, line, nextLine, family, trimmed));
2751
+ }
2752
+ return candidates;
2753
+ };
2754
+ //#endregion
2755
+ //#region src/dictionary/dictionary-zones.ts
2756
+ const normalizedStartsWith = (text, prefix) => normalizeArabicForComparison(text).startsWith(normalizeArabicForComparison(prefix));
2757
+ const isDelimitedPrefixMatch = (text, prefix) => {
2758
+ if (text === prefix) return true;
2759
+ if (!text.startsWith(prefix)) return false;
2760
+ const nextChar = text[prefix.length];
2761
+ return nextChar === void 0 || GATE_DELIMITER_RE.test(nextChar);
2762
+ };
2763
+ const getHeadingTextGateMatch = (gate, useFuzzy) => {
2764
+ if (useFuzzy) return "normalizedMatch" in gate ? gate.normalizedMatch : normalizeArabicForComparison(gate.match);
2765
+ return "trimmedMatch" in gate ? gate.trimmedMatch : gate.match.trim();
2766
+ };
2767
+ const buildPageLines = (content) => {
2768
+ const parts = content.split("\n");
2769
+ const lines = [];
2770
+ let offset = 0;
2771
+ for (let index = 0; index < parts.length; index++) {
2772
+ const text = parts[index] ?? "";
2773
+ lines.push({
2774
+ lineNumber: index + 1,
2775
+ start: offset,
2776
+ text
2777
+ });
2778
+ offset += text.length + 1;
2779
+ }
2780
+ return lines;
2781
+ };
2782
+ const headingMatchesGate = (headingText, gate) => {
2783
+ if (gate.use === "headingText") {
2784
+ const useFuzzy = gate.fuzzy ?? false;
2785
+ const source = useFuzzy ? normalizeArabicForComparison(headingText) : headingText.trim();
2786
+ const match = getHeadingTextGateMatch(gate, useFuzzy);
2787
+ return !!match && isDelimitedPrefixMatch(source, match);
2788
+ }
2789
+ return normalizedStartsWith(headingText, GATE_TOKEN_MAP[gate.token]);
2790
+ };
2791
+ const createPageContext = (page, boundary, content, index) => {
2792
+ let cachedLines;
2793
+ const context = {
2794
+ boundary,
2795
+ content,
2796
+ index,
2797
+ page
2798
+ };
2799
+ Object.defineProperty(context, "lines", {
2800
+ configurable: true,
2801
+ enumerable: true,
2802
+ get: () => {
2803
+ cachedLines ??= buildPageLines(content);
2804
+ return cachedLines;
2805
+ }
2806
+ });
2807
+ return context;
2808
+ };
2809
+ const pageMatchesAnyGate = (page, gates) => page.lines.some((line) => {
2810
+ const trimmed = line.text.trim();
2811
+ if (!trimmed.startsWith("## ")) return false;
2812
+ const headingText = trimmed.slice(3).trim();
2813
+ return gates.some((gate) => headingMatchesGate(headingText, gate));
2814
+ });
2815
+ const pageWithinZoneBounds = (zone, pageId) => {
2816
+ if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
2817
+ if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
2818
+ return true;
2819
+ };
2820
+ const findActivationPageId = (zone, pages) => {
2821
+ for (const page of pages) {
2822
+ if (!pageWithinZoneBounds(zone, page.page.id)) continue;
2823
+ if (pageMatchesAnyGate(page, zone.when?.activateAfter ?? [])) return page.page.id;
2824
+ }
2825
+ return null;
2826
+ };
2827
+ const createZoneActivationMap = (profile, pages) => {
2828
+ const activation = /* @__PURE__ */ new Map();
2829
+ for (const zone of profile.zones) {
2830
+ if (!zone.when?.activateAfter?.length) {
2831
+ activation.set(zone.name, null);
2832
+ continue;
2833
+ }
2834
+ activation.set(zone.name, findActivationPageId(zone, pages));
2835
+ }
2836
+ return activation;
2837
+ };
2838
+ const pageMatchesZone = (zone, activationMap, pageId) => {
2839
+ if (zone.when?.minPageId !== void 0 && pageId < zone.when.minPageId) return false;
2840
+ if (zone.when?.maxPageId !== void 0 && pageId > zone.when.maxPageId) return false;
2841
+ if (!zone.when?.activateAfter?.length) return true;
2842
+ const activatedAt = activationMap.get(zone.name);
2843
+ return activatedAt !== null && activatedAt !== void 0 && pageId >= activatedAt;
2284
2844
  };
2285
- const endsWithIntroContext = (text) => {
2286
- const trimmed = text.trimEnd();
2287
- if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
2288
- const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
2289
- if (!normalized) return false;
2290
- if (INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
2291
- if (INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
2292
- return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
2845
+ const resolveActiveZone = (profile, activationMap, pageId) => {
2846
+ let activeZone = null;
2847
+ for (const zone of profile.zones) if (pageMatchesZone(zone, activationMap, pageId)) activeZone = zone;
2848
+ return activeZone;
2293
2849
  };
2294
- const isAuthorityCandidate = (text, precision) => {
2295
- const head = normalizeStopLemma(text.split(":", 1)[0] ?? text);
2296
- if (head && AUTHORITY_HEAD_WORDS.some((term) => normalizeStopLemma(term) === head)) return true;
2297
- if (AUTHORITY_RE.test(text)) return true;
2298
- if (precision === "aggressive") {
2299
- const normalized = normalizeIntroContextText(text);
2300
- return [
2301
- "الليث",
2302
- "الأزهري",
2303
- "الأصمعي",
2304
- "الجوهري",
2305
- "الفراء",
2306
- "ثعلب",
2307
- "شمر"
2308
- ].some((term) => normalized.startsWith(normalizeArabicForComparison(term)));
2850
+ const createPageContexts = (pages, pageMap, normalizedPages) => {
2851
+ if (normalizedPages && normalizedPages.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} normalized pages, received ${normalizedPages.length}`);
2852
+ if (pageMap.boundaries.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} page boundaries, received ${pageMap.boundaries.length}`);
2853
+ const contexts = [];
2854
+ for (let index = 0; index < pages.length; index++) {
2855
+ const page = pages[index];
2856
+ const boundary = pageMap.boundaries[index];
2857
+ if (!page || !boundary) throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
2858
+ const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
2859
+ contexts.push(createPageContext(page, boundary, content, index));
2309
2860
  }
2310
- return false;
2861
+ return contexts;
2311
2862
  };
2312
- const hasBlockedQualifierTail = (lemma) => {
2313
- const parts = lemma.split(/[،,]/u).map((part) => part.trim()).filter(Boolean);
2314
- if (parts.length < 2) return false;
2315
- return startsWithConfiguredWord(QUALIFIER_TAIL_PREFIXES, parts.slice(1).join(" "));
2863
+ //#endregion
2864
+ //#region src/dictionary/profile.ts
2865
+ const normalizedProfileCache = /* @__PURE__ */ new WeakMap();
2866
+ const PREVIOUS_WORD_SCOPES = [
2867
+ "samePage",
2868
+ "pageStart",
2869
+ "any"
2870
+ ];
2871
+ const BLOCKER_PRECISIONS = ["high", "aggressive"];
2872
+ const uniqueNormalizedSet = (values, normalize) => new Set(values.map(normalize).filter(Boolean));
2873
+ const assertNever = (value) => {
2874
+ throw new Error(`Unhandled dictionary profile variant: ${JSON.stringify(value)}`);
2316
2875
  };
2317
- const looksLikeStructuralLeak = (candidate) => {
2318
- if (!candidate.lemma) return false;
2319
- const normalizedLemma = normalizeArabicForComparison(candidate.lemma);
2320
- if (candidate.kind === "entry" && (/^[^\p{Script=Arabic}\d]+/u.test(candidate.lemma) || candidate.lemma.includes("{") || candidate.lemma.includes("}") || candidate.lemma.includes("##"))) return true;
2321
- if (candidate.kind === "entry" && BARE_CODE_LEMMA_RE.test(candidate.lemma) && (candidate.text === candidate.lemma || candidate.text === `${HEADING_PREFIX}${candidate.lemma}` || candidate.text.startsWith(`${HEADING_PREFIX}${candidate.lemma}`) || candidate.text.startsWith(`${candidate.lemma}\n${HEADING_PREFIX}`))) return true;
2322
- if (candidate.family !== "pairedForms" && candidate.lemma.split(/\s+/u).filter(Boolean).length > 4) return true;
2323
- if (startsWithConfiguredWord(STRUCTURAL_LEMMA_PREFIXES, candidate.lemma)) return true;
2324
- if (normalizedLemma.startsWith(normalizeArabicForComparison("ولل"))) return true;
2325
- const structuralText = candidate.text.startsWith(HEADING_PREFIX) ? candidate.text.slice(3).trim() : candidate.text;
2326
- if (/^[\d\u0660-\u0669]+\s*-\s*\([^)]+\)(?:\s+##.*)?$/u.test(structuralText)) return true;
2327
- const normalizedText = normalizeArabicForComparison(structuralText);
2328
- if (STRUCTURAL_LINE_PATTERNS.some((pattern) => pattern.test(structuralText))) return STRUCTURAL_LINE_KEYWORDS.some((keyword) => normalizedText.includes(normalizeArabicForComparison(keyword)));
2329
- return false;
2876
+ const normalizeFamily = (family) => {
2877
+ switch (family.use) {
2878
+ case "heading": return {
2879
+ ...family,
2880
+ allowNextLineColon: family.allowNextLineColon ?? false,
2881
+ allowSingleLetter: family.allowSingleLetter ?? false
2882
+ };
2883
+ case "lineEntry": return {
2884
+ ...family,
2885
+ allowMultiWord: family.allowMultiWord ?? false,
2886
+ allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false,
2887
+ wrappers: family.wrappers ?? "none"
2888
+ };
2889
+ case "inlineSubentry": return {
2890
+ ...family,
2891
+ prefixes: family.prefixes ?? ["و"],
2892
+ stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true
2893
+ };
2894
+ case "codeLine": return {
2895
+ ...family,
2896
+ wrappers: family.wrappers ?? "either"
2897
+ };
2898
+ case "pairedForms": return {
2899
+ ...family,
2900
+ requireStatusTail: family.requireStatusTail ?? false,
2901
+ separator: family.separator ?? "comma"
2902
+ };
2903
+ default: return assertNever(family);
2904
+ }
2330
2905
  };
2331
- const countLemma = (map, lemma) => {
2332
- if (!lemma) return;
2333
- map.set(lemma, (map.get(lemma) ?? 0) + 1);
2906
+ const normalizeBlocker = (blocker) => {
2907
+ switch (blocker.use) {
2908
+ case "authorityIntro": return {
2909
+ ...blocker,
2910
+ precision: blocker.precision ?? "high"
2911
+ };
2912
+ case "stopLemma": return {
2913
+ ...blocker,
2914
+ normalizedWords: uniqueNormalizedSet(blocker.words, normalizeStopLemmaWord)
2915
+ };
2916
+ case "previousWord": return {
2917
+ ...blocker,
2918
+ normalizedWords: uniqueNormalizedSet(blocker.words, normalizeArabicForComparison),
2919
+ scope: blocker.scope ?? "samePage"
2920
+ };
2921
+ case "previousChar": return {
2922
+ ...blocker,
2923
+ charSet: new Set(blocker.chars)
2924
+ };
2925
+ case "intro": return blocker;
2926
+ case "pageContinuation": return {
2927
+ ...blocker,
2928
+ authorityPrecision: blocker.authorityPrecision ?? "high"
2929
+ };
2930
+ default: return assertNever(blocker);
2931
+ }
2334
2932
  };
2335
- const createInitialKindCounts = () => ({
2336
- chapter: 0,
2337
- entry: 0,
2338
- marker: 0
2339
- });
2340
- const createInitialReasonCounts = () => ({
2341
- authorityIntro: 0,
2342
- intro: 0,
2343
- pageContinuation: 0,
2344
- previousChar: 0,
2345
- previousWord: 0,
2346
- qualifierTail: 0,
2347
- stopLemma: 0,
2348
- structuralLeak: 0
2933
+ const normalizeGate = (gate) => {
2934
+ if (gate.use === "headingToken") return gate;
2935
+ const trimmedMatch = gate.match.trim();
2936
+ return {
2937
+ ...gate,
2938
+ normalizedMatch: normalizeArabicForComparison(trimmedMatch),
2939
+ trimmedMatch
2940
+ };
2941
+ };
2942
+ const normalizeZone = (zone) => ({
2943
+ blockers: (zone.blockers ?? []).map(normalizeBlocker),
2944
+ families: zone.families.map(normalizeFamily),
2945
+ name: zone.name,
2946
+ when: zone.when ? {
2947
+ activateAfter: zone.when.activateAfter?.map(normalizeGate),
2948
+ maxPageId: zone.when.maxPageId,
2949
+ minPageId: zone.when.minPageId
2950
+ } : void 0
2349
2951
  });
2350
- const createInitialFamilyCounts = () => ({
2351
- codeLine: {
2352
- accepted: 0,
2353
- rejected: 0
2354
- },
2355
- heading: {
2356
- accepted: 0,
2357
- rejected: 0
2358
- },
2359
- inlineSubentry: {
2360
- accepted: 0,
2361
- rejected: 0
2362
- },
2363
- lineEntry: {
2364
- accepted: 0,
2365
- rejected: 0
2366
- },
2367
- pairedForms: {
2368
- accepted: 0,
2369
- rejected: 0
2370
- }
2952
+ const createIssue = (code, path, message, zoneName) => ({
2953
+ code,
2954
+ message,
2955
+ path,
2956
+ ...zoneName ? { zoneName } : {}
2371
2957
  });
2372
- const rejectsViaIntroBlocker = (candidate, blocker, localBeforeCandidate) => {
2373
- if (blocker.use !== "intro") return false;
2374
- return isIntroCandidate(candidate.probeText) || endsWithIntroPhrase(localBeforeCandidate) || endsWithIntroContext(localBeforeCandidate);
2958
+ const hasBlankString = (values) => values.length === 0 || values.some((value) => !value.trim());
2959
+ const pushBlockerIssue = (issues, code, path, message, zoneName) => {
2960
+ issues.push(createIssue(code, path, message, zoneName));
2375
2961
  };
2376
- const rejectsViaAuthorityBlocker = (candidate, blocker) => blocker.use === "authorityIntro" && isAuthorityCandidate(candidate.probeText, blocker.precision);
2377
- const rejectsViaStopLemmaBlocker = (candidate, blocker) => blocker.use === "stopLemma" && !!candidate.lemma && !!normalizeStopLemma(candidate.lemma) && blocker.normalizedWords.has(normalizeStopLemma(candidate.lemma));
2378
- const rejectsViaPreviousWordBlocker = (pageContent, localIndex, blocker) => {
2379
- if (blocker.use !== "previousWord") return false;
2380
- const lastWord = extractLastArabicWord$1(pageContent, localIndex);
2381
- return !!lastWord && blocker.normalizedWords.has(normalizeArabicForComparison(lastWord));
2962
+ const validateAuthorityPrecision = (issues, blockerPath, zoneName, code, fieldName, value, blockerUse) => {
2963
+ if (value === void 0 || BLOCKER_PRECISIONS.includes(value)) return;
2964
+ pushBlockerIssue(issues, code, `${blockerPath}.${fieldName}`, `${blockerUse} blocker in zone "${zoneName}" must use ${fieldName} "high" or "aggressive"`, zoneName);
2382
2965
  };
2383
- const rejectsViaPreviousCharBlocker = (pageContent, localIndex, blocker) => {
2384
- if (blocker.use !== "previousChar") return false;
2385
- const previousChar = previousNonWhitespaceChar(pageContent, localIndex);
2386
- return !!previousChar && blocker.charSet.has(previousChar);
2966
+ const validatePreviousWordBlocker = (blocker, blockerPath, zoneName, issues) => {
2967
+ if (hasBlankString(blocker.words)) pushBlockerIssue(issues, "invalid_previous_words", `${blockerPath}.words`, `previousWord blocker in zone "${zoneName}" must include non-empty words`, zoneName);
2968
+ if (blocker.scope !== void 0 && !PREVIOUS_WORD_SCOPES.includes(blocker.scope)) pushBlockerIssue(issues, "invalid_previous_word_scope", `${blockerPath}.scope`, `previousWord blocker in zone "${zoneName}" must use scope "samePage", "pageStart", or "any"`, zoneName);
2387
2969
  };
2388
- const rejectsViaPageContinuationBlocker = (candidate, blocker, localBeforeCandidate, pageIndex, pages) => {
2389
- if (blocker.use !== "pageContinuation") return false;
2390
- if (!(localBeforeCandidate.trim().length === 0) || pageIndex === 0) return false;
2391
- const previousPage = pages[pageIndex - 1];
2392
- if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
2393
- const previousWord = extractLastArabicWord$1(previousPage.content);
2394
- return !!previousWord && CONTINUATION_PREV_WORDS.some((word) => normalizedEquals(word, previousWord)) || endsWithIntroContext(previousPage.content) || isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, "high");
2970
+ const validatePreviousCharBlocker = (blocker, blockerPath, zoneName, issues) => {
2971
+ if (blocker.chars.length === 0 || blocker.chars.some((char) => !char)) pushBlockerIssue(issues, "invalid_previous_chars", `${blockerPath}.chars`, `previousChar blocker in zone "${zoneName}" must include chars`, zoneName);
2395
2972
  };
2396
- const getBlockerRejectionReason = (blocker, candidate, localBeforeCandidate, pageContent, pageIndex, pages) => {
2397
- if (rejectsViaIntroBlocker(candidate, blocker, localBeforeCandidate)) return "intro";
2398
- if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
2399
- if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
2400
- if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker)) return "previousWord";
2401
- if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
2402
- if (rejectsViaPageContinuationBlocker(candidate, blocker, localBeforeCandidate, pageIndex, pages)) return "pageContinuation";
2403
- return null;
2973
+ const validateStopLemmaBlocker = (blocker, blockerPath, zoneName, issues) => {
2974
+ if (hasBlankString(blocker.words)) pushBlockerIssue(issues, "invalid_stop_words", `${blockerPath}.words`, `stopLemma blocker in zone "${zoneName}" must include non-empty words`, zoneName);
2404
2975
  };
2405
- const getCandidateRejection = (candidate, zone, pageContext, pages) => {
2406
- const hasQualifierTail = hasBlockedQualifierTail(candidate.lemma ?? "");
2407
- if (hasQualifierTail || looksLikeStructuralLeak(candidate)) return { reason: hasQualifierTail ? "qualifierTail" : "structuralLeak" };
2408
- const localBeforeCandidate = getTrailingContext(pageContext.content, candidate.localIndex);
2409
- for (const blocker of zone.blockers) {
2410
- if (!blockerApplies(blocker, candidate.family)) continue;
2411
- const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
2412
- if (reason) return { reason };
2976
+ const validateGate = (gate, zone, gateIndex, seenActivateAfterKeys, issues) => {
2977
+ const gatePath = `zones[].when.activateAfter[${gateIndex}]`.replace("[]", `[${zone.name}]`);
2978
+ if (gate.use === "headingText") {
2979
+ if (!gate.match.trim()) issues.push(createIssue("invalid_gate_match", `${gatePath}.match`, `dictionary gate match must be non-empty`, zone.name));
2980
+ if (gate.fuzzy !== void 0 && typeof gate.fuzzy !== "boolean") issues.push(createIssue("invalid_gate_fuzzy", `${gatePath}.fuzzy`, `dictionary gate fuzzy must be a boolean when provided`, zone.name));
2413
2981
  }
2414
- return null;
2982
+ const dedupeKey = `${gate.use}:${JSON.stringify(gate)}`;
2983
+ if (seenActivateAfterKeys.has(dedupeKey)) issues.push(createIssue("duplicate_activate_after_gate", gatePath, `dictionary zone "${zone.name}" has duplicate activateAfter gates`, zone.name));
2984
+ seenActivateAfterKeys.add(dedupeKey);
2415
2985
  };
2416
- const shouldRejectCandidate = (candidate, zone, pageContext, pages) => {
2417
- return getCandidateRejection(candidate, zone, pageContext, pages) !== null;
2986
+ const validateFamily = (family, zone, familyIndex, issues) => {
2987
+ const familyPath = `zones[].families[${familyIndex}]`.replace("[]", `[${zone.name}]`);
2988
+ switch (family.use) {
2989
+ case "heading":
2990
+ if (family.classes.length === 0) issues.push(createIssue("empty_heading_classes", `${familyPath}.classes`, `dictionary heading family in zone "${zone.name}" must include at least one class`, zone.name));
2991
+ if (family.emit === "chapter" && !family.classes.includes("chapter")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "chapter" but never matches chapter headings`, zone.name));
2992
+ if (family.emit === "marker" && !family.classes.includes("marker")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "marker" but never matches marker headings`, zone.name));
2993
+ if (family.emit === "entry" && !family.classes.includes("entry")) issues.push(createIssue("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "entry" but never matches entry headings`, zone.name));
2994
+ break;
2995
+ case "lineEntry": break;
2996
+ case "inlineSubentry":
2997
+ if (family.prefixes?.some((prefix) => !prefix.trim())) issues.push(createIssue("empty_inline_prefixes", `${familyPath}.prefixes`, `inlineSubentry prefixes must be non-empty strings`, zone.name));
2998
+ break;
2999
+ case "codeLine": break;
3000
+ case "pairedForms": break;
3001
+ default: assertNever(family);
3002
+ }
2418
3003
  };
2419
- const collectHeadingCandidates = (pageStartOffset, line, nextLine, family, trimmed) => {
2420
- if (!trimmed.startsWith(HEADING_PREFIX)) return [];
2421
- const headingClass = classifyDictionaryHeading(trimmed);
2422
- if (headingClass === "noise") return [];
2423
- const candidate = createHeadingCandidate(pageStartOffset, line, nextLine, family, headingClass);
2424
- return candidate ? [candidate] : [];
3004
+ const validateBlocker = (blocker, zone, blockerIndex, issues) => {
3005
+ const blockerPath = `zones[].blockers[${blockerIndex}]`.replace("[]", `[${zone.name}]`);
3006
+ switch (blocker.use) {
3007
+ case "authorityIntro":
3008
+ validateAuthorityPrecision(issues, blockerPath, zone.name, "invalid_authority_intro_precision", "precision", blocker.precision, "authorityIntro");
3009
+ break;
3010
+ case "stopLemma":
3011
+ validateStopLemmaBlocker(blocker, blockerPath, zone.name, issues);
3012
+ break;
3013
+ case "previousWord":
3014
+ validatePreviousWordBlocker(blocker, blockerPath, zone.name, issues);
3015
+ break;
3016
+ case "previousChar":
3017
+ validatePreviousCharBlocker(blocker, blockerPath, zone.name, issues);
3018
+ break;
3019
+ case "intro": break;
3020
+ case "pageContinuation":
3021
+ validateAuthorityPrecision(issues, blockerPath, zone.name, "invalid_continuation_precision", "authorityPrecision", blocker.authorityPrecision, "pageContinuation");
3022
+ break;
3023
+ default: assertNever(blocker);
3024
+ }
2425
3025
  };
2426
- const collectCandidatesForFamily = (pageStartOffset, line, nextLine, family, trimmed) => {
2427
- switch (family.use) {
2428
- case "heading": return collectHeadingCandidates(pageStartOffset, line, nextLine, family, trimmed);
2429
- case "lineEntry": return collectLineEntryCandidates(pageStartOffset, line, family);
2430
- case "inlineSubentry": return collectInlineSubentryCandidates(pageStartOffset, line, family);
2431
- case "codeLine": return collectCodeLineCandidates(pageStartOffset, line, family);
2432
- case "pairedForms": return collectPairedFormsCandidates(pageStartOffset, line, family);
2433
- default: return assertNever$1(family);
3026
+ var DictionaryProfileValidationError = class extends Error {
3027
+ issues;
3028
+ constructor(issues) {
3029
+ super(issues.length === 1 ? issues[0].message : `Dictionary profile validation failed with ${issues.length} issues`);
3030
+ this.name = "DictionaryProfileValidationError";
3031
+ this.issues = issues;
2434
3032
  }
2435
3033
  };
2436
- const collectCandidatesForLine = (pageStartOffset, line, nextLine, zone) => {
2437
- const trimmed = line.text.trim();
2438
- const candidates = [];
2439
- if (!trimmed) return candidates;
2440
- for (const family of zone.families) candidates.push(...collectCandidatesForFamily(pageStartOffset, line, nextLine, family, trimmed));
2441
- return candidates;
3034
+ const validateZone = (zone, zoneIndex, seenZoneNames, issues) => {
3035
+ const zonePath = `zones[${zoneIndex}]`;
3036
+ const trimmedName = zone.name.trim();
3037
+ if (!trimmedName) issues.push(createIssue("empty_zone_name", `${zonePath}.name`, `dictionary zone name must be non-empty`));
3038
+ else if (seenZoneNames.has(trimmedName)) issues.push(createIssue("duplicate_zone_name", `${zonePath}.name`, `dictionary zone names must be unique; duplicated "${trimmedName}"`, trimmedName));
3039
+ else seenZoneNames.add(trimmedName);
3040
+ if (zone.families.length === 0) issues.push(createIssue("empty_zone_families", `${zonePath}.families`, `dictionary zone "${zone.name}" must declare at least one family`, zone.name));
3041
+ if (zone.when?.minPageId !== void 0 && zone.when?.maxPageId !== void 0 && zone.when.minPageId > zone.when.maxPageId) issues.push(createIssue("invalid_zone_page_range", `${zonePath}.when`, `dictionary zone "${zone.name}" has minPageId greater than maxPageId`, zone.name));
3042
+ const seenActivateAfterKeys = /* @__PURE__ */ new Set();
3043
+ for (let gateIndex = 0; gateIndex < (zone.when?.activateAfter?.length ?? 0); gateIndex++) validateGate(zone.when.activateAfter[gateIndex], zone, gateIndex, seenActivateAfterKeys, issues);
3044
+ for (let familyIndex = 0; familyIndex < zone.families.length; familyIndex++) validateFamily(zone.families[familyIndex], zone, familyIndex, issues);
3045
+ for (let blockerIndex = 0; blockerIndex < (zone.blockers?.length ?? 0); blockerIndex++) validateBlocker(zone.blockers[blockerIndex], zone, blockerIndex, issues);
2442
3046
  };
2443
- const candidateToSplitPoint = (candidate, debugMetaKey) => {
2444
- const baseMeta = candidate.lemma ? {
2445
- kind: candidate.kind,
2446
- lemma: candidate.lemma
2447
- } : { kind: candidate.kind };
2448
- const meta = debugMetaKey === void 0 ? baseMeta : mergeDebugIntoMeta(baseMeta, debugMetaKey, { dictionary: {
2449
- family: candidate.family,
2450
- ...candidate.headingClass ? { headingClass: candidate.headingClass } : {}
2451
- } });
2452
- return {
2453
- contentStartOffset: candidate.contentStartOffset,
2454
- index: candidate.absoluteIndex,
2455
- meta
3047
+ /**
3048
+ * Validates a dictionary profile without normalizing it.
3049
+ */
3050
+ const validateDictionaryProfile = (profile) => {
3051
+ const issues = [];
3052
+ if (profile.version !== 2) issues.push(createIssue("invalid_version", "version", `dictionary profile version must be 2, got ${profile.version}`));
3053
+ if (profile.zones.length === 0) {
3054
+ issues.push(createIssue("missing_zones", "zones", `dictionary profile must contain at least one zone`));
3055
+ return issues;
3056
+ }
3057
+ const seenZoneNames = /* @__PURE__ */ new Set();
3058
+ for (let zoneIndex = 0; zoneIndex < profile.zones.length; zoneIndex++) validateZone(profile.zones[zoneIndex], zoneIndex, seenZoneNames, issues);
3059
+ return issues;
3060
+ };
3061
+ /**
3062
+ * Normalizes and validates a dictionary profile before runtime matching.
3063
+ */
3064
+ const normalizeDictionaryProfile = (profile) => {
3065
+ const cached = normalizedProfileCache.get(profile);
3066
+ if (cached) return cached;
3067
+ const issues = validateDictionaryProfile(profile);
3068
+ if (issues.length > 0) throw new DictionaryProfileValidationError(issues);
3069
+ const normalized = {
3070
+ version: 2,
3071
+ zones: profile.zones.map(normalizeZone)
2456
3072
  };
3073
+ normalizedProfileCache.set(profile, normalized);
3074
+ return normalized;
3075
+ };
3076
+ //#endregion
3077
+ //#region src/dictionary/dictionary-diagnostics.ts
3078
+ const createInitialKindCounts = () => ({
3079
+ chapter: 0,
3080
+ entry: 0,
3081
+ marker: 0
3082
+ });
3083
+ const createInitialReasonCounts = () => ({
3084
+ authorityIntro: 0,
3085
+ intro: 0,
3086
+ pageContinuation: 0,
3087
+ previousChar: 0,
3088
+ previousWord: 0,
3089
+ qualifierTail: 0,
3090
+ stopLemma: 0,
3091
+ structuralLeak: 0
3092
+ });
3093
+ const createInitialFamilyCounts = () => ({
3094
+ codeLine: {
3095
+ accepted: 0,
3096
+ rejected: 0
3097
+ },
3098
+ heading: {
3099
+ accepted: 0,
3100
+ rejected: 0
3101
+ },
3102
+ inlineSubentry: {
3103
+ accepted: 0,
3104
+ rejected: 0
3105
+ },
3106
+ lineEntry: {
3107
+ accepted: 0,
3108
+ rejected: 0
3109
+ },
3110
+ pairedForms: {
3111
+ accepted: 0,
3112
+ rejected: 0
3113
+ }
3114
+ });
3115
+ const countLemma = (map, lemma) => {
3116
+ if (!lemma) return;
3117
+ map.set(lemma, (map.get(lemma) ?? 0) + 1);
2457
3118
  };
2458
3119
  const pushDiagnosticSample = (samples, sampleLimit, sample) => {
2459
3120
  if (samples.length < sampleLimit) samples.push(sample);
2460
3121
  };
2461
3122
  /**
2462
- * Collects dictionary-profile split points using the pages-only markdown surface.
3123
+ * Builds a minimal `PageMap` from a pages array for use inside
3124
+ * `diagnoseDictionaryProfile`, which does not receive one from the segmenter.
2463
3125
  */
2464
- const collectDictionarySplitPoints = (pages, profile, pageMap, normalizedPages, logger, debugMetaKey) => {
2465
- const normalizedProfile = normalizeDictionaryProfile(profile);
2466
- const pageContexts = createPageContexts(pages, pageMap, normalizedPages);
2467
- const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
2468
- const splitPoints = [];
2469
- logger?.debug?.("[dictionary] collecting split points", {
2470
- pageCount: pages.length,
2471
- zoneCount: normalizedProfile.zones.length
2472
- });
2473
- for (const pageContext of pageContexts) {
2474
- const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
2475
- if (!zone) continue;
2476
- for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
2477
- const line = pageContext.lines[lineIndex];
2478
- const nextLine = pageContext.lines[lineIndex + 1];
2479
- const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
2480
- for (const candidate of candidates) {
2481
- if (shouldRejectCandidate(candidate, zone, pageContext, pageContexts)) continue;
2482
- splitPoints.push(candidateToSplitPoint(candidate, debugMetaKey));
3126
+ const buildDiagnosticsPageMap = (pages, normalizedContents) => {
3127
+ const boundaries = [];
3128
+ const pageBreaks = [];
3129
+ let offset = 0;
3130
+ for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
3131
+ const normalized = normalizedContents[pageIndex];
3132
+ boundaries.push({
3133
+ end: offset + normalized.length,
3134
+ id: pages[pageIndex].id,
3135
+ start: offset
3136
+ });
3137
+ if (pageIndex < pages.length - 1) {
3138
+ pageBreaks.push(offset + normalized.length);
3139
+ offset += normalized.length + 1;
3140
+ } else offset += normalized.length;
3141
+ }
3142
+ const findBoundary = (off) => {
3143
+ let lo = 0;
3144
+ let hi = boundaries.length - 1;
3145
+ while (lo <= hi) {
3146
+ const mid = lo + hi >>> 1;
3147
+ const boundary = boundaries[mid];
3148
+ if (off < boundary.start) {
3149
+ hi = mid - 1;
3150
+ continue;
2483
3151
  }
3152
+ if (off > boundary.end) {
3153
+ lo = mid + 1;
3154
+ continue;
3155
+ }
3156
+ return boundary;
2484
3157
  }
2485
- }
2486
- logger?.debug?.("[dictionary] collected split points", { splitPointCount: splitPoints.length });
2487
- return splitPoints;
3158
+ return boundaries.at(-1);
3159
+ };
3160
+ return {
3161
+ boundaries,
3162
+ getId: (off) => findBoundary(off)?.id ?? 0,
3163
+ pageBreaks,
3164
+ pageIds: pages.map((page) => page.id)
3165
+ };
2488
3166
  };
2489
3167
  /**
2490
- * Collects authoring diagnostics for a dictionary profile without creating segments.
3168
+ * Collects tuning-oriented diagnostics for a dictionary profile without creating
3169
+ * segments. This output is intended for profile authoring workflows rather than
3170
+ * long-term compatibility guarantees.
2491
3171
  *
2492
3172
  * This is useful when tuning blockers and family choices for a new dictionary.
2493
3173
  */
2494
3174
  const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
2495
3175
  const normalizedProfile = normalizeDictionaryProfile(profile);
2496
- const pageMap = {
2497
- boundaries: [],
2498
- getId: (offset) => {
2499
- for (const boundary of pageMap.boundaries) if (offset >= boundary.start && offset <= boundary.end) return boundary.id;
2500
- return pageMap.boundaries.at(-1)?.id ?? 0;
2501
- },
2502
- pageBreaks: [],
2503
- pageIds: pages.map((page) => page.id)
2504
- };
2505
- let offset = 0;
2506
- const pageContexts = createPageContexts(pages, pageMap, pages.map((page, pageIndex) => {
2507
- const normalized = normalizeLineEndings(page.content);
2508
- pageMap.boundaries.push({
2509
- end: offset + normalized.length,
2510
- id: page.id,
2511
- start: offset
2512
- });
2513
- if (pageIndex < pages.length - 1) {
2514
- pageMap.pageBreaks.push(offset + normalized.length);
2515
- offset += normalized.length + 1;
2516
- } else offset += normalized.length;
2517
- return normalized;
2518
- }));
3176
+ const normalizedPages = pages.map((page) => normalizeLineEndings(page.content));
3177
+ const pageContexts = createPageContexts(pages, buildDiagnosticsPageMap(pages, normalizedPages), normalizedPages);
2519
3178
  const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
2520
3179
  const sampleLimit = options.sampleLimit ?? 50;
2521
3180
  const acceptedKinds = createInitialKindCounts();
2522
- const blockerHits = createInitialReasonCounts();
3181
+ const rejectionReasons = createInitialReasonCounts();
2523
3182
  const familyCounts = createInitialFamilyCounts();
2524
3183
  const zoneCounts = {};
2525
3184
  const rejectedLemmaCounts = /* @__PURE__ */ new Map();
@@ -2551,7 +3210,7 @@ const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
2551
3210
  };
2552
3211
  if (rejection) {
2553
3212
  rejectedCount += 1;
2554
- blockerHits[rejection.reason] += 1;
3213
+ rejectionReasons[rejection.reason] += 1;
2555
3214
  familyCounts[candidate.family].rejected += 1;
2556
3215
  zoneCounts[zone.name].rejected += 1;
2557
3216
  countLemma(rejectedLemmaCounts, candidate.lemma);
@@ -2580,186 +3239,59 @@ const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
2580
3239
  return {
2581
3240
  acceptedCount,
2582
3241
  acceptedKinds,
2583
- blockerHits,
2584
3242
  familyCounts,
2585
3243
  pageCount: pages.length,
2586
3244
  rejectedCount,
2587
3245
  rejectedLemmas,
3246
+ rejectionReasons,
2588
3247
  samples,
2589
3248
  zoneCounts
2590
3249
  };
2591
3250
  };
2592
3251
  //#endregion
2593
- //#region src/optimization/optimize-rules.ts
2594
- const MERGEABLE_KEYS = new Set([
2595
- "lineStartsWith",
2596
- "lineStartsAfter",
2597
- "lineEndsWith"
2598
- ]);
2599
- /**
2600
- * Get the pattern type key for a rule.
2601
- */
2602
- const getPatternKey = (rule) => PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
2603
- const getPatternArray = (rule, key) => {
2604
- const value = rule[key];
2605
- return Array.isArray(value) ? value : [];
2606
- };
2607
- const getPatternString = (rule, key) => {
2608
- const value = rule[key];
2609
- return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
2610
- };
2611
- const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
2612
- const getDictionaryEntrySpecificityScore = (rule) => {
2613
- if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
2614
- const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
2615
- return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
2616
- };
2617
- const getSpecificityScore = (rule) => {
2618
- const key = getPatternKey(rule);
2619
- if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
2620
- return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
2621
- };
2622
- const createMergeKey = (rule) => {
2623
- const key = getPatternKey(rule);
2624
- const { [key]: _, ...rest } = rule;
2625
- return `${key}|${JSON.stringify(rest)}`;
2626
- };
2627
- const optimizeRules = (rules) => {
2628
- const output = [];
2629
- const indexByMergeKey = /* @__PURE__ */ new Map();
2630
- let mergedCount = 0;
2631
- for (const rule of rules) {
2632
- const key = getPatternKey(rule);
2633
- if (!MERGEABLE_KEYS.has(key)) {
2634
- output.push(rule);
2635
- continue;
2636
- }
2637
- const mergeKey = createMergeKey(rule);
2638
- const existingIndex = indexByMergeKey.get(mergeKey);
2639
- if (existingIndex === void 0) {
2640
- indexByMergeKey.set(mergeKey, output.length);
2641
- output.push({
2642
- ...rule,
2643
- [key]: normalizePatterns(getPatternArray(rule, key))
2644
- });
2645
- } else {
2646
- const existing = output[existingIndex];
2647
- existing[key] = normalizePatterns([...getPatternArray(existing, key), ...getPatternArray(rule, key)]);
2648
- mergedCount++;
2649
- }
2650
- }
3252
+ //#region src/dictionary/runtime.ts
3253
+ const candidateToSplitPoint = (candidate, debugMetaKey) => {
3254
+ const baseMeta = candidate.lemma ? {
3255
+ kind: candidate.kind,
3256
+ lemma: candidate.lemma
3257
+ } : { kind: candidate.kind };
3258
+ const meta = debugMetaKey === void 0 ? baseMeta : mergeDebugIntoMeta(baseMeta, debugMetaKey, { dictionary: {
3259
+ family: candidate.family,
3260
+ ...candidate.headingClass ? { headingClass: candidate.headingClass } : {}
3261
+ } });
2651
3262
  return {
2652
- mergedCount,
2653
- rules: output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a))
2654
- };
2655
- };
2656
- //#endregion
2657
- //#region src/preprocessing/transforms.ts
2658
- /** Helper for exhaustive switch checking - TypeScript will error if a case is missed */
2659
- const assertNever = (x) => {
2660
- throw new Error(`Unknown preprocess transform type: ${JSON.stringify(x)}`);
2661
- };
2662
- /** Check if a character is whitespace (space, newline, tab, etc.) */
2663
- const isWhitespace = (char) => /\s/.test(char);
2664
- /**
2665
- * Check if a character code is a zero-width control character.
2666
- *
2667
- * Covers:
2668
- * - U+200B–U+200F (Zero Width Space, Joiners, Direction Marks)
2669
- * - U+202A–U+202E (Bidirectional Formatting)
2670
- * - U+2060–U+2064 (Word Joiner, Invisible Operators)
2671
- * - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
2672
- */
2673
- const isZeroWidth = (code) => code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
2674
- /**
2675
- * Remove zero-width control characters from text.
2676
- *
2677
- * @param text - Input text
2678
- * @param mode - 'strip' (default) removes entirely, 'space' replaces with space
2679
- * @returns Text with zero-width characters removed or replaced
2680
- */
2681
- const removeZeroWidth = (text, mode = "strip") => {
2682
- if (mode === "space") {
2683
- const parts = [];
2684
- let lastWasWhitespace = true;
2685
- for (let i = 0; i < text.length; i++) if (isZeroWidth(text.charCodeAt(i))) {
2686
- if (!lastWasWhitespace && parts.length > 0) {
2687
- parts.push(" ");
2688
- lastWasWhitespace = true;
2689
- }
2690
- } else {
2691
- const char = text[i];
2692
- parts.push(char);
2693
- lastWasWhitespace = isWhitespace(char);
2694
- }
2695
- return parts.join("");
2696
- }
2697
- return text.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/g, "");
2698
- };
2699
- /**
2700
- * Condense multiple periods (...) into ellipsis character (…).
2701
- *
2702
- * Prevents `{{tarqim}}` from false-matching inside ellipsis since
2703
- * the `.` in tarqim matches individual periods.
2704
- *
2705
- * @param text - Input text
2706
- * @returns Text with period sequences replaced by ellipsis
2707
- */
2708
- const condenseEllipsis = (text) => text.replace(/\.{2,}/g, "…");
2709
- /**
2710
- * Join trailing و (waw) to the next word.
2711
- *
2712
- * Fixes OCR/digitization artifacts: ' و ' → ' و' (waw joined to next word)
2713
- *
2714
- * @param text - Input text
2715
- * @returns Text with trailing waw joined to following word
2716
- */
2717
- const fixTrailingWaw = (text) => text.replace(/ و /g, " و");
2718
- /**
2719
- * Check if a page ID is within a constraint range.
2720
- */
2721
- const isInRange = (pageId, constraint) => {
2722
- if (constraint.min !== void 0 && pageId < constraint.min) return false;
2723
- if (constraint.max !== void 0 && pageId > constraint.max) return false;
2724
- return true;
2725
- };
2726
- /**
2727
- * Normalize a transform to its object form.
2728
- */
2729
- const normalizeTransform = (transform) => {
2730
- if (typeof transform === "string") return { type: transform };
2731
- return transform;
3263
+ contentStartOffset: candidate.contentStartOffset,
3264
+ index: candidate.absoluteIndex,
3265
+ meta
3266
+ };
2732
3267
  };
2733
3268
  /**
2734
- * Apply preprocessing transforms to a page's content.
2735
- *
2736
- * Transforms run in array order. Each can be limited to specific pages
2737
- * via `min`/`max` constraints.
2738
- *
2739
- * @param content - Page content to transform
2740
- * @param pageId - Page ID for constraint checking
2741
- * @param transforms - Array of transforms to apply
2742
- * @returns Transformed content
3269
+ * Collects dictionary-profile split points using the pages-only markdown surface.
2743
3270
  */
2744
- const applyPreprocessToPage = (content, pageId, transforms) => {
2745
- let result = content;
2746
- for (const transform of transforms) {
2747
- const rule = normalizeTransform(transform);
2748
- if (!isInRange(pageId, rule)) continue;
2749
- switch (rule.type) {
2750
- case "removeZeroWidth":
2751
- result = removeZeroWidth(result, rule.mode ?? "strip");
2752
- break;
2753
- case "condenseEllipsis":
2754
- result = condenseEllipsis(result);
2755
- break;
2756
- case "fixTrailingWaw":
2757
- result = fixTrailingWaw(result);
2758
- break;
2759
- default: assertNever(rule.type);
3271
+ const collectDictionarySplitPoints = (pages, profile, pageMap, normalizedPages, logger, debugMetaKey) => {
3272
+ const normalizedProfile = normalizeDictionaryProfile(profile);
3273
+ const pageContexts = createPageContexts(pages, pageMap, normalizedPages);
3274
+ const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
3275
+ const splitPoints = [];
3276
+ logger?.debug?.("[dictionary] collecting split points", {
3277
+ pageCount: pages.length,
3278
+ zoneCount: normalizedProfile.zones.length
3279
+ });
3280
+ for (const pageContext of pageContexts) {
3281
+ const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
3282
+ if (!zone) continue;
3283
+ for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
3284
+ const line = pageContext.lines[lineIndex];
3285
+ const nextLine = pageContext.lines[lineIndex + 1];
3286
+ const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
3287
+ for (const candidate of candidates) {
3288
+ if (shouldRejectCandidate(candidate, zone, pageContext, pageContexts)) continue;
3289
+ splitPoints.push(candidateToSplitPoint(candidate, debugMetaKey));
3290
+ }
2760
3291
  }
2761
3292
  }
2762
- return result;
3293
+ logger?.debug?.("[dictionary] collected split points", { splitPointCount: splitPoints.length });
3294
+ return splitPoints;
2763
3295
  };
2764
3296
  const WINDOW_PREFIX_LENGTHS = [
2765
3297
  80,
@@ -3656,219 +4188,16 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
3656
4188
  *
3657
4189
  * @param content The text content
3658
4190
  * @param targetPosition The desired split position (hard limit)
3659
- * @param lookbackChars How far back to search for a safe break
3660
- * @returns The new split position (index), or -1 if no safe break found
3661
- */
3662
- const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
3663
- const startSearch = Math.max(0, targetPosition - lookbackChars);
3664
- for (let i = targetPosition - 1; i >= startSearch; i--) {
3665
- const char = content[i];
3666
- if (STOP_CHARACTERS.test(char)) return i + 1;
3667
- }
3668
- return -1;
3669
- };
3670
- //#endregion
3671
- //#region src/segmentation/pattern-validator.ts
3672
- const KNOWN_TOKENS = new Set(getAvailableTokens());
3673
- const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
3674
- const BARE_TOKEN_REGEX = (() => {
3675
- const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
3676
- return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
3677
- })();
3678
- const createMalformedTokenIssue = (tokenLiteral, side) => {
3679
- const token = tokenLiteral.split(":", 1)[0] || void 0;
3680
- return {
3681
- message: `Token "${tokenLiteral || "unknown"}" appears to be missing ${side} braces.`,
3682
- suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
3683
- token,
3684
- type: "missing_braces"
3685
- };
3686
- };
3687
- const detectMalformedLeftToken = (pattern) => {
3688
- for (let index = 0; index < pattern.length - 1; index++) {
3689
- if (pattern.slice(index, index + 2) !== "{{") continue;
3690
- const closeIndex = pattern.indexOf("}}", index + 2);
3691
- if (closeIndex === -1) return createMalformedTokenIssue(pattern.slice(index + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "", "closing");
3692
- index = closeIndex + 1;
3693
- }
3694
- };
3695
- const detectMalformedRightToken = (pattern) => {
3696
- for (let index = 0; index < pattern.length - 1; index++) {
3697
- if (pattern.slice(index, index + 2) !== "}}") continue;
3698
- if (pattern.lastIndexOf("{{", index) === -1) return createMalformedTokenIssue(pattern.slice(0, index).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "", "opening");
3699
- }
3700
- };
3701
- const detectMalformedToken = (pattern) => detectMalformedLeftToken(pattern) ?? detectMalformedRightToken(pattern);
3702
- /**
3703
- * Validates a single pattern for common issues.
3704
- */
3705
- const validatePattern = (pattern, seenPatterns) => {
3706
- if (!pattern.trim()) return {
3707
- message: "Empty pattern is not allowed",
3708
- type: "empty_pattern"
3709
- };
3710
- if (seenPatterns.has(pattern)) return {
3711
- message: `Duplicate pattern: "${pattern}"`,
3712
- pattern,
3713
- type: "duplicate"
3714
- };
3715
- seenPatterns.add(pattern);
3716
- TOKEN_INSIDE_BRACES.lastIndex = 0;
3717
- for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
3718
- const name = match[1];
3719
- if (name && !KNOWN_TOKENS.has(name)) return {
3720
- message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
3721
- suggestion: "Check spelling or use a known token",
3722
- token: name,
3723
- type: "unknown_token"
3724
- };
3725
- }
3726
- const malformed = detectMalformedToken(pattern);
3727
- if (malformed) return malformed;
3728
- for (const match of pattern.matchAll(BARE_TOKEN_REGEX)) {
3729
- const [full, name] = match;
3730
- const idx = match.index;
3731
- if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
3732
- message: `Token "${name}" appears to be missing {{}}. Did you mean "{{${full}}}"?`,
3733
- suggestion: `{{${full}}}`,
3734
- token: name,
3735
- type: "missing_braces"
3736
- };
3737
- }
3738
- };
3739
- /**
3740
- * Validates an array of patterns, returning parallel array of issues.
3741
- */
3742
- const validatePatternArray = (patterns) => {
3743
- const seen = /* @__PURE__ */ new Set();
3744
- const issues = patterns.map((p) => validatePattern(p, seen));
3745
- return issues.some(Boolean) ? issues : void 0;
3746
- };
3747
- const applyRulePatternValidation = (result, key, patterns) => {
3748
- if (!patterns) return false;
3749
- const issues = validatePatternArray(patterns);
3750
- if (!issues) return false;
3751
- result[key] = issues;
3752
- return true;
3753
- };
3754
- const validateTemplateRule = (rule, result) => {
3755
- if (!("template" in rule)) return false;
3756
- const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
3757
- if (!issue) return false;
3758
- result.template = issue;
3759
- return true;
3760
- };
3761
- const validateRegexRule = (rule, result) => {
3762
- if (!("regex" in rule)) return false;
3763
- if (!rule.regex.trim()) {
3764
- result.regex = {
3765
- message: "Empty pattern is not allowed",
3766
- type: "empty_pattern"
3767
- };
3768
- return true;
3769
- }
3770
- try {
3771
- new RegExp(rule.regex, "u");
3772
- return false;
3773
- } catch (error) {
3774
- result.regex = {
3775
- message: error instanceof Error ? error.message : String(error),
3776
- pattern: rule.regex,
3777
- type: "invalid_regex"
3778
- };
3779
- return true;
3780
- }
3781
- };
3782
- const invalidDictionaryEntryIssue = (message) => ({
3783
- message,
3784
- type: "invalid_option"
3785
- });
3786
- const addBooleanDictionaryEntryIssue = (issues, key, value) => {
3787
- if (value !== void 0 && typeof value !== "boolean") issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
3788
- };
3789
- const addCaptureNameIssue = (issues, captureName) => {
3790
- if (captureName !== void 0 && !/^[A-Za-z_]\w*$/.test(captureName)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
3791
- };
3792
- const addMinLettersIssue = (issues, minLetters) => {
3793
- if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
3794
- };
3795
- const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
3796
- const min = minLetters ?? 2;
3797
- if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < min)) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${min}`);
3798
- };
3799
- const validateDictionaryEntryRule = (rule, result) => {
3800
- if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
3801
- const issues = {};
3802
- const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
3803
- if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
3804
- addBooleanDictionaryEntryIssue(issues, "allowCommaSeparated", allowCommaSeparated);
3805
- addBooleanDictionaryEntryIssue(issues, "allowParenthesized", allowParenthesized);
3806
- addBooleanDictionaryEntryIssue(issues, "allowWhitespaceBeforeColon", allowWhitespaceBeforeColon);
3807
- addBooleanDictionaryEntryIssue(issues, "midLineSubentries", midLineSubentries);
3808
- addCaptureNameIssue(issues, captureName);
3809
- addMinLettersIssue(issues, minLetters);
3810
- addMaxLettersIssue(issues, maxLetters, minLetters);
3811
- if (Object.keys(issues).length === 0) return false;
3812
- result.dictionaryEntry = issues;
3813
- return true;
3814
- };
3815
- const formatValidationIssue = (_type, issue, loc) => {
3816
- if (!issue) return null;
3817
- if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
3818
- if (issue.type === "unknown_token") return `${loc}: Unknown token "{{${issue.token}}}"`;
3819
- if (issue.type === "duplicate") return `${loc}: Duplicate pattern "${issue.pattern}"`;
3820
- if (issue.type === "invalid_regex") return `${loc}: Invalid regex (${issue.message})`;
3821
- return `${loc}: ${issue.message || issue.type}`;
3822
- };
3823
- /**
3824
- * Validates split rules for common pattern issues.
3825
- *
3826
- * Checks for:
3827
- * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
3828
- * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
3829
- * - Duplicate patterns within the same rule
3830
- *
3831
- * @param rules - Array of split rules to validate
3832
- * @returns Array parallel to input with validation results (undefined if no issues)
3833
- *
3834
- * @example
3835
- * const issues = validateRules([
3836
- * { lineStartsAfter: ['raqms:num'] }, // Missing braces
3837
- * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
3838
- * ]);
3839
- * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
3840
- * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
3841
- */
3842
- const validateRules = (rules) => rules.map((rule) => {
3843
- const result = {};
3844
- const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", "lineStartsWith" in rule ? rule.lineStartsWith : void 0);
3845
- const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0);
3846
- const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", "lineEndsWith" in rule ? rule.lineEndsWith : void 0);
3847
- const templateIssues = validateTemplateRule(rule, result);
3848
- const regexIssues = validateRegexRule(rule, result);
3849
- const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
3850
- return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
3851
- });
3852
- /**
3853
- * Formats a validation result array into a list of human-readable error messages.
3854
- *
3855
- * Useful for displaying validation errors in UIs.
3856
- *
3857
- * @param results - The result array from `validateRules()`
3858
- * @returns Array of formatted error strings
3859
- *
3860
- * @example
3861
- * const issues = validateRules(rules);
3862
- * const errors = formatValidationReport(issues);
3863
- * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
4191
+ * @param lookbackChars How far back to search for a safe break
4192
+ * @returns The new split position (index), or -1 if no safe break found
3864
4193
  */
3865
- const formatValidationReport = (results) => results.flatMap((result, i) => {
3866
- if (!result) return [];
3867
- return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
3868
- });
3869
- const formatValidationIssues = (type, issues, ruleNumber) => {
3870
- if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
3871
- return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
4194
+ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) => {
4195
+ const startSearch = Math.max(0, targetPosition - lookbackChars);
4196
+ for (let i = targetPosition - 1; i >= startSearch; i--) {
4197
+ const char = content[i];
4198
+ if (STOP_CHARACTERS.test(char)) return i + 1;
4199
+ }
4200
+ return -1;
3872
4201
  };
3873
4202
  //#endregion
3874
4203
  //#region src/segmentation/breakpoint-processor.ts
@@ -4130,7 +4459,6 @@ const computeIterationWindow = (fullContent, cursorPos, currentFromIdx, fromIdx,
4130
4459
  const sliceEnd = Math.max(cursorPos + 1, Math.min(sliceEndByPages, sliceEndByLength));
4131
4460
  return {
4132
4461
  remainingContent: fullContent.slice(cursorPos, sliceEnd),
4133
- sliceEnd,
4134
4462
  windowEndIdx
4135
4463
  };
4136
4464
  };
@@ -4161,31 +4489,87 @@ const updateLastBreakpointFromFound = (found, lastBreakpoint) => {
4161
4489
  };
4162
4490
  return lastBreakpoint;
4163
4491
  };
4164
- const appendPieceAndAdvance = (fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, originalMeta, lastBreakpoint, result, logger, contentLengthSplit) => {
4165
- let { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
4166
- if (actualStartIdx < currentFromIdx) {
4167
- logger?.warn?.("[breakpoints] Page attribution drift detected; clamping actualStartIdx", {
4492
+ const buildIterativeContext = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
4493
+ const fullContent = segment.content;
4494
+ const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
4495
+ logger?.debug?.("[breakpoints] boundaryPositions built", {
4496
+ boundaryPositions,
4497
+ fromIdx,
4498
+ fullContentLength: fullContent.length,
4499
+ toIdx
4500
+ });
4501
+ return {
4502
+ boundaryPositions,
4503
+ cumulativeOffsets,
4504
+ debugMetaKey,
4505
+ expandedBreakpoints,
4506
+ fromIdx,
4507
+ fullContent,
4508
+ logger,
4509
+ maxContentLength,
4510
+ maxPages,
4511
+ normalizedPages,
4512
+ pageIds,
4513
+ prefer,
4514
+ segment,
4515
+ toIdx
4516
+ };
4517
+ };
4518
/**
 * Creates the starting state for the iterative splitter: cursor at the start
 * of the content, positioned on page index `fromIdx`, with no piece emitted
 * and no breakpoint recorded yet.
 */
const createInitialIterativeState = (fromIdx) => {
	const initialState = {
		currentFromIdx: fromIdx,
		cursorPos: 0,
		isFirstPiece: true,
		lastBreakpoint: null
	};
	return initialState;
};
4524
/**
 * True while the cursor is still inside the content AND the current page
 * index has not moved past the last page of the segment.
 */
const hasIterationWorkRemaining = (state, context) => {
	if (!(state.cursorPos < context.fullContent.length)) return false;
	return state.currentFromIdx <= context.toIdx;
};
4525
+ const prepareIteration = (context, state) => {
4526
+ if (!hasIterationWorkRemaining(state, context)) return null;
4527
+ const { remainingContent, windowEndIdx } = computeIterationWindow(context.fullContent, state.cursorPos, state.currentFromIdx, context.fromIdx, context.toIdx, context.pageIds, context.boundaryPositions, context.maxPages, context.maxContentLength);
4528
+ if (!remainingContent.trim()) return null;
4529
+ const actualRemainingContent = context.fullContent.slice(state.cursorPos);
4530
+ const actualEndPos = Math.max(state.cursorPos, context.fullContent.length - 1);
4531
+ return {
4532
+ actualRemainingContent,
4533
+ actualRemainingEndIdx: Math.min(findPageIndexForPosition(actualEndPos, context.boundaryPositions, context.fromIdx), context.toIdx),
4534
+ remainingContent,
4535
+ windowEndIdx,
4536
+ windowEndPosition: computeWindowEndPositionForIteration(remainingContent, state.cursorPos, state.currentFromIdx, context.fromIdx, windowEndIdx, context.toIdx, context.pageIds, context.boundaryPositions, context.normalizedPages, context.cumulativeOffsets, context.maxPages, context.maxContentLength, context.logger)
4537
+ };
4538
+ };
4539
/**
 * Returns a breakpoint descriptor for the page-boundary breakpoint (the first
 * expanded breakpoint whose `regex` is `null`). When there is none, falls back
 * to the state's last recorded breakpoint.
 */
const buildPageBoundaryBreakpoint = (context, state) => {
	const boundaryIndex = context.expandedBreakpoints.findIndex((candidate) => candidate.regex === null);
	if (boundaryIndex < 0) return state.lastBreakpoint;
	return {
		breakpointIndex: boundaryIndex,
		rule: { pattern: "" }
	};
};
4546
/**
 * Emits one piece (`pieceContent`, ending at `breakPos`) into `result` and
 * returns the next loop state with the cursor and page index advanced.
 *
 * The piece's page range comes from `computePiecePages`, then is clamped so it
 * never starts before the current page and never exceeds the `maxPages` limit.
 */
const appendPieceAndAdvance = (context, state, breakPos, pieceContent, result, contentLengthSplit) => {
	const { boundaryPositions, fromIdx, maxPages, pageIds, toIdx } = context;
	const pageFloor = state.currentFromIdx;
	const piecePages = computePiecePages(state.cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
	let startIdx = piecePages.actualStartIdx;
	let endIdx = piecePages.actualEndIdx;
	if (startIdx < pageFloor) {
		// Attribution may not move backwards relative to pages already consumed.
		context.logger?.warn?.("[breakpoints] Page attribution drift detected; clamping actualStartIdx", {
			actualStartIdx: startIdx,
			currentFromIdx: pageFloor
		});
		startIdx = pageFloor;
	}
	if (maxPages === 0) {
		// maxPages=0: a piece is pinned to the current page.
		endIdx = Math.min(endIdx, pageFloor);
		startIdx = Math.min(startIdx, pageFloor);
	} else if (maxPages > 0) {
		const endLimit = computeWindowEndIdx(startIdx, toIdx, pageIds, maxPages);
		endIdx = Math.min(endIdx, endLimit);
	}
	const pieceMeta = getSegmentMetaWithDebug(state.isFirstPiece, context.debugMetaKey, context.segment.meta, state.lastBreakpoint, contentLengthSplit);
	const piece = createPieceSegment(pieceContent, startIdx, endIdx, pageIds, pieceMeta, true);
	if (piece) result.push(piece);
	const advanced = advanceCursorAndIndex(context.fullContent, breakPos, endIdx, toIdx, pageIds, context.normalizedPages);
	// With maxPages=0 the next page index is re-derived from the new cursor position.
	const followingFromIdx = maxPages === 0 ? findPageIndexForPosition(advanced.cursorPos, boundaryPositions, fromIdx) : advanced.currentFromIdx;
	return {
		...state,
		currentFromIdx: followingFromIdx,
		cursorPos: advanced.cursorPos,
		isFirstPiece: false
	};
};
4191
4575
  const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, logger, debugMetaKey, maxContentLength) => {
@@ -4201,109 +4585,84 @@ const tryProcessOversizedSegmentFastPath = (segment, fromIdx, toIdx, pageIds, no
4201
4585
  * For maxPages=0 with maxContentLength: if current page's remaining content fits,
4202
4586
  * create a segment and advance to next page without applying breakpoints.
4203
4587
  */
4204
- const tryHandleCurrentPageFit = (fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segmentMeta, lastBreakpoint, result) => {
4205
- if (maxPages !== 0 || !maxContentLength || currentFromIdx >= actualRemainingEndIdx) return { handled: false };
4206
- const currentPageEndPos = boundaryPositions[currentFromIdx - fromIdx + 1] ?? fullContent.length;
4207
- const currentPageRemainingContent = fullContent.slice(cursorPos, currentPageEndPos).trim();
4208
- if (!currentPageRemainingContent) return { handled: false };
4209
- const currentPageFitsInLength = currentPageRemainingContent.length <= maxContentLength;
4210
- const currentPageHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, currentFromIdx);
4211
- if (!currentPageFitsInLength || currentPageHasExclusions) return { handled: false };
4212
- const pageBoundaryIdx = expandedBreakpoints.findIndex((bp) => bp.regex === null);
4213
- const pageBoundaryBreakpoint = pageBoundaryIdx >= 0 ? {
4214
- breakpointIndex: pageBoundaryIdx,
4215
- rule: { pattern: "" }
4216
- } : lastBreakpoint;
4217
- const includeMeta = isFirstPiece || Boolean(debugMetaKey);
4218
- const meta = getSegmentMetaWithDebug(isFirstPiece, debugMetaKey, segmentMeta, pageBoundaryBreakpoint);
4219
- const seg = createSegment(currentPageRemainingContent, pageIds[currentFromIdx], void 0, includeMeta ? meta : void 0);
4588
+ const tryHandleCurrentPageFit = (context, state, actualRemainingEndIdx, result) => {
4589
+ if (context.maxPages !== 0 || !context.maxContentLength || state.currentFromIdx >= actualRemainingEndIdx) return null;
4590
+ const boundaryIdx = state.currentFromIdx - context.fromIdx + 1;
4591
+ const currentPageEndPos = context.boundaryPositions[boundaryIdx] ?? context.fullContent.length;
4592
+ const currentPageRemainingContent = context.fullContent.slice(state.cursorPos, currentPageEndPos).trim();
4593
+ if (!currentPageRemainingContent) return null;
4594
+ const currentPageFitsInLength = currentPageRemainingContent.length <= context.maxContentLength;
4595
+ const currentPageHasExclusions = hasAnyExclusionsInRange(context.expandedBreakpoints, context.pageIds, state.currentFromIdx, state.currentFromIdx);
4596
+ if (!currentPageFitsInLength || currentPageHasExclusions) return null;
4597
+ const pageBoundaryBreakpoint = buildPageBoundaryBreakpoint(context, state);
4598
+ const includeMeta = state.isFirstPiece || Boolean(context.debugMetaKey);
4599
+ const meta = getSegmentMetaWithDebug(state.isFirstPiece, context.debugMetaKey, context.segment.meta, pageBoundaryBreakpoint);
4600
+ const seg = createSegment(currentPageRemainingContent, context.pageIds[state.currentFromIdx], void 0, includeMeta ? meta : void 0);
4220
4601
  if (seg) result.push(seg);
4221
- let newCursorPos = currentPageEndPos;
4222
- while (newCursorPos < fullContent.length && /\s/.test(fullContent[newCursorPos])) newCursorPos++;
4223
4602
  return {
4224
- handled: true,
4225
- newCursorPos,
4226
- newFromIdx: currentFromIdx + 1,
4227
- newLastBreakpoint: pageBoundaryBreakpoint
4603
+ ...state,
4604
+ currentFromIdx: state.currentFromIdx + 1,
4605
+ cursorPos: skipWhitespace(context.fullContent, currentPageEndPos),
4606
+ isFirstPiece: false,
4607
+ lastBreakpoint: pageBoundaryBreakpoint
4608
+ };
4609
+ };
4610
+ const tryFinalizeIteration = (context, state, prepared, result) => handleOversizedSegmentFit(prepared.actualRemainingContent, state.currentFromIdx, prepared.actualRemainingEndIdx, context.pageIds, context.expandedBreakpoints, context.maxPages, context.maxContentLength, state.isFirstPiece, context.debugMetaKey, context.segment.meta, state.lastBreakpoint, result);
4611
/**
 * Runs one breakpoint search over the prepared window, emits the resulting
 * piece (if it has non-whitespace content) and returns the next loop state.
 */
const applyBreakpointToIteration = (context, state, prepared, iteration, result) => {
	const { currentFromIdx, cursorPos } = state;
	const { remainingContent, windowEndIdx, windowEndPosition } = prepared;
	context.logger?.trace?.(`[breakpoints] iteration=${iteration}`, {
		currentFromIdx,
		cursorPos,
		windowEndIdx,
		windowEndPosition
	});
	const match = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, context.toIdx, windowEndPosition, context.pageIds, context.expandedBreakpoints, context.cumulativeOffsets, context.normalizedPages, context.prefer, context.maxContentLength);
	const offset = ensureProgressingBreakOffset(match.breakOffset, remainingContent, cursorPos, context.maxContentLength, context.logger);
	const stateWithBreakpoint = {
		...state,
		lastBreakpoint: updateLastBreakpointFromFound(match, state.lastBreakpoint)
	};
	const breakPos = cursorPos + offset;
	const piece = context.fullContent.slice(cursorPos, breakPos).trim();
	if (piece) return appendPieceAndAdvance(context, stateWithBreakpoint, breakPos, piece, result, match.contentLengthSplit);
	// Whitespace-only piece: skip it without emitting a segment.
	return {
		...stateWithBreakpoint,
		cursorPos: breakPos,
		isFirstPiece: false
	};
};
4230
4633
  const processOversizedSegmentIterative = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength) => {
4231
4634
  const result = [];
4232
- const fullContent = segment.content;
4233
4635
  const pageCount = toIdx - fromIdx + 1;
4234
4636
  logger?.debug?.("[breakpoints] processOversizedSegment: Using iterative path", {
4235
- contentLength: fullContent.length,
4637
+ contentLength: segment.content.length,
4236
4638
  fromIdx,
4237
4639
  maxContentLength,
4238
4640
  maxPages,
4239
4641
  pageCount,
4240
4642
  toIdx
4241
4643
  });
4242
- let cursorPos = 0;
4243
- let currentFromIdx = fromIdx;
4244
- let isFirstPiece = true;
4245
- let lastBreakpoint = null;
4246
- const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
4247
- logger?.debug?.("[breakpoints] boundaryPositions built", {
4248
- boundaryPositions,
4249
- fromIdx,
4250
- fullContentLength: fullContent.length,
4251
- toIdx
4252
- });
4253
- const MAX_SAFE_ITERATIONS = 1e5;
4254
- let didHitMaxIterations = true;
4255
- for (let i = 1; i <= MAX_SAFE_ITERATIONS; i++) {
4256
- if (cursorPos >= fullContent.length || currentFromIdx > toIdx) {
4257
- didHitMaxIterations = false;
4258
- break;
4259
- }
4260
- const { remainingContent, windowEndIdx } = computeIterationWindow(fullContent, cursorPos, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, maxPages, maxContentLength);
4261
- if (!remainingContent.trim()) {
4262
- didHitMaxIterations = false;
4263
- break;
4264
- }
4265
- const actualRemainingContent = fullContent.slice(cursorPos);
4266
- const actualEndPos = Math.max(cursorPos, fullContent.length - 1);
4267
- const actualRemainingEndIdx = Math.min(findPageIndexForPosition(actualEndPos, boundaryPositions, fromIdx), toIdx);
4268
- const currentPageFit = tryHandleCurrentPageFit(fullContent, cursorPos, currentFromIdx, fromIdx, actualRemainingEndIdx, boundaryPositions, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result);
4269
- if (currentPageFit.handled) {
4270
- cursorPos = currentPageFit.newCursorPos;
4271
- currentFromIdx = currentPageFit.newFromIdx;
4272
- lastBreakpoint = currentPageFit.newLastBreakpoint;
4273
- isFirstPiece = false;
4644
+ const context = buildIterativeContext(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey, maxContentLength);
4645
+ let state = createInitialIterativeState(fromIdx);
4646
+ for (let iteration = 1;; iteration++) {
4647
+ const prepared = prepareIteration(context, state);
4648
+ if (!prepared) break;
4649
+ const currentPageFitState = tryHandleCurrentPageFit(context, state, prepared.actualRemainingEndIdx, result);
4650
+ if (currentPageFitState) {
4651
+ state = currentPageFitState;
4274
4652
  continue;
4275
4653
  }
4276
- if (handleOversizedSegmentFit(actualRemainingContent, currentFromIdx, actualRemainingEndIdx, pageIds, expandedBreakpoints, maxPages, maxContentLength, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result)) {
4277
- didHitMaxIterations = false;
4654
+ if (tryFinalizeIteration(context, state, prepared, result)) break;
4655
+ const nextState = applyBreakpointToIteration(context, state, prepared, iteration, result);
4656
+ if (nextState.cursorPos <= state.cursorPos) {
4657
+ context.logger?.error?.("[breakpoints] Iterative splitting stalled; aborting to avoid an infinite loop", {
4658
+ cursorPos: state.cursorPos,
4659
+ iteration,
4660
+ nextCursorPos: nextState.cursorPos
4661
+ });
4278
4662
  break;
4279
4663
  }
4280
- const windowEndPosition = computeWindowEndPositionForIteration(remainingContent, cursorPos, currentFromIdx, fromIdx, windowEndIdx, toIdx, pageIds, boundaryPositions, normalizedPages, cumulativeOffsets, maxPages, maxContentLength, logger);
4281
- logger?.trace?.(`[breakpoints] iteration=${i}`, {
4282
- currentFromIdx,
4283
- cursorPos,
4284
- windowEndIdx,
4285
- windowEndPosition
4286
- });
4287
- const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer, maxContentLength);
4288
- const breakOffset = ensureProgressingBreakOffset(found.breakOffset, remainingContent, cursorPos, maxContentLength, logger);
4289
- lastBreakpoint = updateLastBreakpointFromFound(found, lastBreakpoint);
4290
- const breakPos = cursorPos + breakOffset;
4291
- const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
4292
- if (!pieceContent) {
4293
- cursorPos = breakPos;
4294
- isFirstPiece = false;
4295
- continue;
4296
- }
4297
- const next = appendPieceAndAdvance(fullContent, cursorPos, breakPos, pieceContent, currentFromIdx, fromIdx, toIdx, pageIds, boundaryPositions, normalizedPages, maxPages, isFirstPiece, debugMetaKey, segment.meta, lastBreakpoint, result, logger, found.contentLengthSplit);
4298
- cursorPos = next.cursorPos;
4299
- currentFromIdx = next.currentFromIdx;
4300
- isFirstPiece = false;
4664
+ state = nextState;
4301
4665
  }
4302
- if (didHitMaxIterations) logger?.error?.("[breakpoints] Stopped processing oversized segment: reached MAX_SAFE_ITERATIONS", {
4303
- cursorPos,
4304
- fullContentLength: fullContent.length,
4305
- iterations: MAX_SAFE_ITERATIONS
4306
- });
4307
4666
  logger?.debug?.("[breakpoints] processOversizedSegment: Complete", { resultCount: result.length });
4308
4667
  return result;
4309
4668
  };
@@ -4377,6 +4736,120 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
4377
4736
  return result;
4378
4737
  };
4379
4738
  //#endregion
4739
+ //#region src/dictionary/arabic-dictionary-rule.ts
4740
+ const uniqueCanonicalWords = (words) => {
4741
+ const seen = /* @__PURE__ */ new Set();
4742
+ const result = [];
4743
+ for (const word of words) {
4744
+ const normalized = normalizeArabicForComparison(word);
4745
+ if (!normalized || seen.has(normalized)) continue;
4746
+ seen.add(normalized);
4747
+ result.push(word);
4748
+ }
4749
+ return result;
4750
+ };
4751
+ const buildStopAlternation = (stopWords) => {
4752
+ const unique = uniqueCanonicalWords(stopWords);
4753
+ if (unique.length === 0) return "";
4754
+ return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
4755
+ };
4756
/**
 * Builds the regex source for a headword.
 *
 * With a stop-word alternation, each unit is guarded by a negative lookahead
 * that rejects a stop word directly followed by the colon (or, in
 * comma-separated mode, by a comma separator). In comma-separated mode the
 * unit may repeat, separated by Arabic or Latin commas.
 */
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
	const commaSeparator = "\\s*[،,]\\s*";
	let headword = unit;
	if (stopAlternation) {
		const terminator = allowCommaSeparated ? `(?:${commaSeparator}|${colonPattern})` : colonPattern;
		headword = `(?!(?:${stopwordBody})${terminator})${unit}`;
	}
	if (!allowCommaSeparated) return headword;
	return `${headword}(?:${commaSeparator}${headword})*`;
};
4761
/**
 * Wraps the headword in a named capture followed by the colon marker.
 * When `allowParenthesized` is set, the headword may alternatively appear
 * inside a balanced `( ... )` pair before the colon.
 *
 * NOTE(review): in the parenthesized form the same group name is emitted in
 * both alternatives; confirm the target runtimes accept duplicate named groups.
 */
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
	const namedHeadword = `(?<${captureName}>${headwordBody})`;
	const colonSuffix = allowWhitespaceBeforeColon ? "\\s*:" : ":";
	const marker = allowParenthesized ? `(?:\\(\\s*${namedHeadword}\\s*\\)|${namedHeadword})` : namedHeadword;
	return `${marker}${colonSuffix}`;
};
4767
/**
 * Validates the numeric bounds and capture name of a dictionary-entry rule.
 *
 * @throws Error when `minLetters` is not an integer >= 1, when `maxLetters` is
 * not an integer >= `minLetters`, or when `captureName` is not an identifier
 * made of letters, digits and underscores that does not start with a digit.
 */
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
	const reject = (detail) => {
		throw new Error(`createArabicDictionaryEntryRule: ${detail}`);
	};
	const minIsValid = Number.isInteger(minLetters) && minLetters >= 1;
	if (!minIsValid) reject(`minLetters must be an integer >= 1, got ${minLetters}`);
	const maxIsValid = Number.isInteger(maxLetters) && maxLetters >= minLetters;
	if (!maxIsValid) reject(`maxLetters must be an integer >= minLetters, got ${maxLetters}`);
	const nameIsValid = /^[A-Za-z_]\w*$/.test(captureName);
	if (!nameIsValid) reject(`invalid captureName "${captureName}"`);
};
4772
+ const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
4773
+ validateDictionaryEntryOptions({
4774
+ captureName,
4775
+ maxLetters,
4776
+ minLetters
4777
+ });
4778
+ const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
4779
+ const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
4780
+ const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
4781
+ const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
4782
+ const stopAlternation = buildStopAlternation(stopWords);
4783
+ const lemmaBody = buildHeadwordBody({
4784
+ allowCommaSeparated,
4785
+ colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
4786
+ stopAlternation,
4787
+ stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
4788
+ unit: lemmaUnit
4789
+ });
4790
+ const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
4791
+ const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
4792
+ const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
4793
+ const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
4794
+ allowParenthesized,
4795
+ allowWhitespaceBeforeColon,
4796
+ captureName: prefixedCaptureName,
4797
+ headwordBody: lemmaBody
4798
+ });
4799
+ return {
4800
+ captureNames: [prefixedCaptureName],
4801
+ regex
4802
+ };
4803
+ };
4804
+ /**
4805
+ * Creates a reusable split rule for Arabic dictionary entries.
4806
+ *
4807
+ * The returned rule preserves authoring intent as a serializable
4808
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
4809
+ * regex string.
4810
+ *
4811
+ * @example
4812
+ * createArabicDictionaryEntryRule({
4813
+ * stopWords: ['وقيل', 'ويقال', 'قال'],
4814
+ * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
4815
+ * })
4816
+ *
4817
+ * @example
4818
+ * createArabicDictionaryEntryRule({
4819
+ * allowParenthesized: true,
4820
+ * allowWhitespaceBeforeColon: true,
4821
+ * allowCommaSeparated: true,
4822
+ * stopWords: ['الليث', 'العجاج'],
4823
+ * })
4824
+ */
4825
+ /**
4826
+ * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
4827
+ * whole-book dictionary segmentation. Keep this helper for advanced single-rule
4828
+ * composition inside a broader `SplitRule[]` pipeline.
4829
+ */
4830
+ const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
4831
+ validateDictionaryEntryOptions({
4832
+ captureName,
4833
+ maxLetters,
4834
+ minLetters
4835
+ });
4836
+ return {
4837
+ dictionaryEntry: {
4838
+ allowCommaSeparated,
4839
+ allowParenthesized,
4840
+ allowWhitespaceBeforeColon,
4841
+ captureName,
4842
+ maxLetters,
4843
+ midLineSubentries,
4844
+ minLetters,
4845
+ stopWords: uniqueCanonicalWords(stopWords)
4846
+ },
4847
+ meta,
4848
+ pageStartPrevWordStoplist,
4849
+ samePagePrevWordStoplist
4850
+ };
4851
+ };
4852
+ //#endregion
4380
4853
  //#region src/segmentation/rule-regex.ts
4381
4854
  /**
4382
4855
  * Checks if a regex pattern contains standard (anonymous) capturing groups.
@@ -5319,425 +5792,566 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner, hasDict
5319
5792
  const createSegmentsFromSplitPoints = () => {
5320
5793
  const result = [];
5321
5794
  for (let i = 0; i < splitPoints.length; i++) {
5322
- const sp = splitPoints[i];
5323
- const end = splitPoints[i + 1]?.index ?? content.length;
5324
- const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
5325
- if (s) result.push(s);
5326
- }
5327
- return result;
5328
- };
5329
- const segments = [];
5330
- if (!splitPoints.length) {
5331
- const firstId = pageMap.getId(0);
5332
- if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
5333
- const s = createSegment(0, content.length);
5334
- if (s) segments.push(s);
5335
- }
5336
- return segments;
5337
- }
5338
- if (splitPoints[0].index > 0) {
5339
- const firstId = pageMap.getId(0);
5340
- if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
5341
- const s = createSegment(0, splitPoints[0].index);
5342
- if (s) segments.push(s);
5343
- }
5344
- }
5345
- return [...segments, ...createSegmentsFromSplitPoints()];
5346
- };
5347
- //#endregion
5348
- //#region src/validation/validate-segments.ts
5349
- /**
5350
- * Creates a short preview string of text content for error reporting.
5351
- * Truncates content exceeding PREVIEW_LIMIT.
5352
- */
5353
- const buildPreview = (text) => {
5354
- const normalized = text.replace(/\s+/g, " ").trim();
5355
- if (normalized.length <= 140) return normalized;
5356
- return `${normalized.slice(0, 140)}...`;
5357
- };
5358
- /**
5359
- * Creates a lightweight snapshot of a segment for inclusion in validation checks.
5360
- */
5361
- const buildSegmentSnapshot = (segment) => ({
5362
- contentPreview: buildPreview(segment.content),
5363
- from: segment.from,
5364
- to: segment.to
5365
- });
5366
- /**
5367
- * Normalizes page content by applying preprocessing transforms and standardizing line endings.
5368
- */
5369
- const normalizePages = (pages, options) => {
5370
- const transforms = options.preprocess ?? [];
5371
- return pages.map((page) => {
5372
- return {
5373
- content: normalizeLineEndings(transforms.length ? applyPreprocessToPage(page.content, page.id, transforms) : page.content),
5374
- id: page.id
5375
- };
5376
- });
5377
- };
5378
- /**
5379
- * Joins all page content into a single string with boundary tracking.
5380
- * Returns the joined string and a list of boundary mappings (start/end indices for each page).
5381
- */
5382
- const buildJoinedContent = (pages, joiner) => {
5383
- const boundaries = [];
5384
- const joined = pages.map((p) => p.content).join(joiner);
5385
- let offset = 0;
5386
- for (let i = 0; i < pages.length; i++) {
5387
- const content = pages[i].content;
5388
- const start = offset;
5389
- const end = start + content.length;
5390
- boundaries.push({
5391
- end,
5392
- id: pages[i].id,
5393
- start
5394
- });
5395
- offset += content.length + (i < pages.length - 1 ? joiner.length : 0);
5795
+ const sp = splitPoints[i];
5796
+ const end = splitPoints[i + 1]?.index ?? content.length;
5797
+ const s = createSegment(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
5798
+ if (s) result.push(s);
5799
+ }
5800
+ return result;
5801
+ };
5802
+ const segments = [];
5803
+ if (!splitPoints.length) {
5804
+ const firstId = pageMap.getId(0);
5805
+ if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
5806
+ const s = createSegment(0, content.length);
5807
+ if (s) segments.push(s);
5808
+ }
5809
+ return segments;
5810
+ }
5811
+ if (splitPoints[0].index > 0) {
5812
+ const firstId = pageMap.getId(0);
5813
+ if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
5814
+ const s = createSegment(0, splitPoints[0].index);
5815
+ if (s) segments.push(s);
5816
+ }
5396
5817
  }
5818
+ return [...segments, ...createSegmentsFromSplitPoints()];
5819
+ };
5820
+ //#endregion
5821
+ //#region src/analysis/segmentation-advisor.ts
5822
// Invisible bidi / zero-width code points.
const ZERO_WIDTH_REGEX = /[\u061C\u200B-\u200F\u202A-\u202E\u2060-\u2064\uFEFF]/gu;
// Runs of three or more periods.
const ELLIPSIS_REGEX = /\.{3,}/g;
// A standalone waw surrounded by whitespace and followed by an Arabic-script character.
const TRAILING_WAW_REGEX = /\sو\s+(?=[\p{Script=Arabic}])/gu;
// Structural template tokens mapped to the metadata value they imply.
const STRUCTURAL_META_BY_TOKEN = {
	bab: "chapter",
	basmalah: "basmalah",
	fasl: "section",
	kitab: "book"
};
// Number-like tokens, in the order they are tried.
const NUMBER_TOKENS = ["numbered", "raqms", "raqm", "nums", "num"];
// Default breakpoints: split after `{{tarqim}}` plus whitespace, then the empty pattern.
const DEFAULT_BREAKPOINTS = [
	{ pattern: "{{tarqim}}\\s*", split: "after" },
	""
];
5842
+ const resolveOptions = (pages, options = {}) => {
5843
+ const minCount = pages.length >= 25 ? 3 : 2;
5397
5844
  return {
5398
- boundaries,
5399
- joined
5845
+ maxRules: options.maxRules ?? 4,
5846
+ minLineStartCount: options.minLineStartCount ?? minCount,
5847
+ minRepeatingCount: options.minRepeatingCount ?? minCount,
5848
+ sampleSegments: options.sampleSegments ?? 5,
5849
+ topLineStarts: options.topLineStarts ?? 12,
5850
+ topRepeatingSequences: options.topRepeatingSequences ?? 8
5400
5851
  };
5401
5852
  };
5402
- /**
5403
- * Binary search to find which page ID corresponds to a character offset in the joined content.
5404
- * Returns undefined if the offset falls within a joiner gap or outside bounds.
5405
- */
5406
- const findBoundaryIdForOffset = (offset, boundaries) => {
5407
- let lo = 0;
5408
- let hi = boundaries.length - 1;
5409
- while (lo <= hi) {
5410
- const mid = lo + hi >>> 1;
5411
- const boundary = boundaries[mid];
5412
- if (offset < boundary.start) hi = mid - 1;
5413
- else if (offset > boundary.end) lo = mid + 1;
5414
- else return boundary.id;
5853
/** Number of matches of `regex` in `text`; 0 when `match` finds nothing. */
const countMatches = (text, regex) => {
	const hits = text.match(regex);
	return hits == null ? 0 : (hits.length ?? 0);
};
5854
+ const getDetections = (pages) => pages.reduce((acc, page) => ({
5855
+ ellipsisCount: acc.ellipsisCount + countMatches(page.content, ELLIPSIS_REGEX),
5856
+ trailingWawCount: acc.trailingWawCount + countMatches(page.content, TRAILING_WAW_REGEX),
5857
+ zeroWidthCount: acc.zeroWidthCount + countMatches(page.content, ZERO_WIDTH_REGEX)
5858
+ }), {
5859
+ ellipsisCount: 0,
5860
+ trailingWawCount: 0,
5861
+ zeroWidthCount: 0
5862
+ });
5863
/**
 * Turns detection counts into preprocess suggestions. A transform is suggested
 * only when its count is positive; order is zero-width, ellipsis, trailing waw.
 */
const getPreprocessSuggestions = (detections) => {
	const candidates = [
		["zeroWidthCount", "Invisible directional/zero-width marks can break anchors and token matching.", "removeZeroWidth"],
		["ellipsisCount", "Repeated periods often cause noisy punctuation breakpoints.", "condenseEllipsis"],
		["trailingWawCount", "Separated waw prefixes are a common digitization artifact in Arabic corpora.", "fixTrailingWaw"]
	];
	return candidates
		.filter(([countKey]) => detections[countKey] > 0)
		.map(([countKey, reason, transform]) => ({
			count: detections[countKey],
			reason,
			transform
		}));
};
5882
/**
 * Lists the names of every `{{token}}` or `{{token:capture}}` in the pattern,
 * in order of appearance (duplicates included).
 */
const extractTokenNames = (pattern) => {
	const tokenMatches = pattern.matchAll(/\{\{(\w+)(?::[^}]+)?\}\}/g);
	return Array.from(tokenMatches, (tokenMatch) => tokenMatch[1]);
};
5883
+ const getStructuralMeta = (tokens) => {
5884
+ for (const token of tokens) if (token in STRUCTURAL_META_BY_TOKEN) return STRUCTURAL_META_BY_TOKEN[token];
5885
+ };
5886
/**
 * Replaces the first plain `{{token}}` occurrence in `pattern` with
 * `replacement`; returns `pattern` unchanged when the token is absent.
 *
 * The replacement is inserted literally. A function replacer is used because a
 * string replacement would have `$&`, `$1`, `$$` etc. expanded by
 * `String.prototype.replace`.
 */
const applyFirstTokenReplacement = (pattern, token, replacement) => {
	const target = `{{${token}}}`;
	if (!pattern.includes(target)) return pattern;
	return pattern.replace(target, () => replacement);
};
5890
+ const addNamedCaptures = (pattern) => {
5891
+ let next = pattern;
5892
+ if (next.includes("{{numbered}}")) next = next.replace("{{numbered}}", "{{raqms:num}} {{dash}} ");
5893
+ else for (const token of NUMBER_TOKENS) {
5894
+ const replacement = token === "num" ? "{{num:num}}" : `{{${token}:num}}`;
5895
+ const replaced = applyFirstTokenReplacement(next, token, replacement);
5896
+ if (replaced !== next) {
5897
+ next = replaced;
5898
+ break;
5899
+ }
5415
5900
  }
5416
- if (boundaries.length === 0) return;
5417
- const last = boundaries.at(-1);
5418
- return offset > last.end ? last.id : void 0;
5419
- };
5420
- /**
5421
- * Helper to construct a standardized validation issue object.
5422
- */
5423
- const createIssue = (type, segment, segmentIndex, overrides = {}, pageMap) => {
5424
- const segmentSnapshot = buildSegmentSnapshot(segment);
5425
- const page = pageMap?.get(segment.from);
5426
- const matchIndex = overrides.matchIndex;
5427
- const { matchIndex: _ignored, ...restOverrides } = overrides;
5428
- const base = {
5429
- actual: {
5430
- from: segment.from,
5431
- to: segment.to
5432
- },
5433
- segment: segmentSnapshot,
5434
- segmentIndex,
5435
- ...restOverrides
5436
- };
5437
- switch (type) {
5438
- case "page_not_found": return {
5439
- ...base,
5440
- evidence: overrides.evidence ?? `Segment.from=${segment.from} does not exist in input pages.`,
5441
- hint: "Check page IDs passed into segmentPages() and validateSegments().",
5442
- severity: "error",
5443
- type
5444
- };
5445
- case "content_not_found": return {
5446
- ...base,
5447
- evidence: overrides.evidence ?? "Segment content not found in any page content.",
5448
- hint: overrides.hint ?? "Check preprocessing options, joiner settings, or whitespace normalization.",
5449
- pageContext: page ? {
5450
- pageId: page.id,
5451
- pagePreview: buildPreview(page.content)
5452
- } : void 0,
5453
- severity: "error",
5454
- type
5901
+ if (next.includes("{{rumuz}}")) next = next.replace("{{rumuz}}", "{{rumuz:source}}");
5902
+ return next;
5903
+ };
5904
/**
 * Locate the first occurrence of a token placeholder in a pattern.
 *
 * Both the plain form `{{token}}` and the named form `{{token:...` are
 * considered; the smaller of the two positions wins.
 *
 * @param pattern - Template string to search.
 * @param token - Token name (without braces).
 * @returns Zero-based index of the earliest placeholder, or -1 when neither form occurs.
 */
const findTokenIndex = (pattern, token) => {
	const positions = [pattern.indexOf(`{{${token}}}`), pattern.indexOf(`{{${token}:`)].filter((position) => position !== -1);
	return positions.length === 0 ? -1 : Math.min(...positions);
};
5911
/**
 * Cut a pattern off just before the earliest "stop" token.
 *
 * The stop tokens are `naql`, `bab`, `basmalah`, `fasl` and `kitab` (plain or
 * named form, as resolved by `findTokenIndex`). Everything from the earliest
 * one onward is dropped and trailing whitespace is trimmed.
 *
 * @param pattern - Template string to trim.
 * @returns The leading part of the pattern, or the whole pattern (right-trimmed) when no stop token occurs.
 */
const trimNumberBoundaryPattern = (pattern) => {
	const cutPoints = [
		"naql",
		"bab",
		"basmalah",
		"fasl",
		"kitab"
	].map((token) => findTokenIndex(pattern, token)).filter((position) => position >= 0);
	const end = Math.min(pattern.length, ...cutPoints);
	return pattern.slice(0, end).trimEnd();
};
5926
/**
 * Derive the `meta` object to attach to a drafted rule.
 *
 * A structural token takes precedence and supplies the type directly; otherwise
 * a `naql` token or any token listed in `NUMBER_TOKENS` yields type `"entry"`.
 *
 * @param tokens - Token names extracted from the pattern.
 * @returns `{ type }`, or `undefined` when no token suggests a meta type.
 */
const getRuleMeta = (tokens) => {
	const structuralType = getStructuralMeta(tokens);
	if (structuralType) return { type: structuralType };
	const marksEntry = tokens.includes("naql") || tokens.some((name) => NUMBER_TOKENS.includes(name));
	return marksEntry ? { type: "entry" } : void 0;
};
5931
/**
 * Grade how trustworthy a drafted rule suggestion is.
 *
 * Structural, numbering and `naql` tokens rate "high"; any other tokenized
 * pattern rates "medium"; a pattern without tokens rates "low".
 *
 * @param tokens - Token names extracted from the pattern.
 * @param shape - Either "line-start" or "sequence".
 * @returns "high", "medium" or "low".
 */
const getSuggestionConfidence = (tokens, shape) => {
	const hasStrongSignal = Boolean(getStructuralMeta(tokens)) || tokens.some((name) => NUMBER_TOKENS.includes(name)) || tokens.includes("naql");
	if (hasStrongSignal) return "high";
	// Inline rumuz sequences are called out explicitly, although they share the generic "medium" grade.
	const isRumuzSequence = shape === "sequence" && tokens.includes("rumuz");
	if (isRumuzSequence) return "medium";
	if (tokens.length > 0) return "medium";
	return "low";
};
5937
/**
 * Produce the human-readable justification shown with a rule suggestion.
 *
 * Precedence: structural marker, numbering marker, `naql` phrase, then a
 * generic message. The last two vary with where the pattern was observed.
 *
 * @param tokens - Token names extracted from the pattern.
 * @param source - "line-start" or any other value (treated as an inline sequence).
 * @returns Explanation sentence.
 */
const getSuggestionReason = (tokens, source) => {
	const atLineStart = source === "line-start";
	const structuralType = getStructuralMeta(tokens);
	if (structuralType) return `Repeated structural marker suggests ${structuralType}-style boundaries.`;
	const hasNumbering = tokens.some((name) => NUMBER_TOKENS.includes(name));
	if (hasNumbering) return "Repeated numbering marker is a strong candidate for entry boundaries.";
	if (tokens.includes("naql")) {
		if (atLineStart) return "Repeated transmission phrase appears at line starts and can anchor segments.";
		return "Repeated transmission phrase inside prose is a good candidate for template-based splitting.";
	}
	if (atLineStart) return "Frequent line-start signature is worth trying as a structural boundary.";
	return "Frequent tokenized sequence may help split continuous prose.";
};
5944
+ const createRule = (pattern, tokens, shape) => {
5945
+ const fuzzy = shouldDefaultToFuzzy(pattern);
5946
+ const meta = getRuleMeta(tokens);
5947
+ if (shape === "line-start") {
5948
+ if (getStructuralMeta(tokens)) return meta ? {
5949
+ fuzzy,
5950
+ lineStartsWith: [pattern],
5951
+ meta,
5952
+ split: "at"
5953
+ } : {
5954
+ fuzzy,
5955
+ lineStartsWith: [pattern],
5956
+ split: "at"
5455
5957
  };
5456
- case "page_attribution_mismatch": {
5457
- const matchedFromId = overrides.expected?.from ?? overrides.actual?.from ?? segment.from;
5458
- const actualPage = pageMap?.get(matchedFromId);
5459
- return {
5460
- ...base,
5461
- evidence: overrides.evidence ?? `Content found in joined content at page ${matchedFromId}, but segment.from=${segment.from}.`,
5462
- hint: overrides.hint ?? "Check duplicate content handling and boundary detection rules.",
5463
- pageContext: actualPage ? {
5464
- matchIndex: matchIndex ?? -1,
5465
- pageId: actualPage.id,
5466
- pagePreview: buildPreview(actualPage.content)
5467
- } : void 0,
5468
- severity: "error",
5469
- type
5958
+ if (tokens.some((token) => NUMBER_TOKENS.includes(token))) {
5959
+ const captured = addNamedCaptures(trimNumberBoundaryPattern(pattern));
5960
+ return meta ? {
5961
+ fuzzy,
5962
+ lineStartsAfter: [captured],
5963
+ meta,
5964
+ split: "at"
5965
+ } : {
5966
+ fuzzy,
5967
+ lineStartsAfter: [captured],
5968
+ split: "at"
5470
5969
  };
5471
5970
  }
5472
- case "max_pages_violation": return {
5473
- ...base,
5474
- evidence: overrides.evidence ?? `Segment spans pages ${segment.from}-${overrides.actual?.to}.`,
5475
- hint: overrides.hint ?? "Check maxPages windowing in breakpoint-processor.ts and page constraints.",
5476
- severity: "error",
5477
- type
5478
- };
5479
- default: return {
5480
- ...base,
5481
- severity: "error",
5482
- type
5971
+ return meta ? {
5972
+ fuzzy,
5973
+ lineStartsWith: [pattern],
5974
+ meta,
5975
+ split: "at"
5976
+ } : {
5977
+ fuzzy,
5978
+ lineStartsWith: [pattern],
5979
+ split: "at"
5483
5980
  };
5484
5981
  }
5982
+ const captured = addNamedCaptures(pattern);
5983
+ return meta ? {
5984
+ fuzzy,
5985
+ meta,
5986
+ split: "at",
5987
+ template: captured
5988
+ } : {
5989
+ fuzzy,
5990
+ split: "at",
5991
+ template: captured
5992
+ };
5485
5993
  };
5486
- /**
5487
- * Finds all occurrences of a content string within the joined text.
5488
- * Respects search limits to avoid performance cliffs on highly repetitive content.
5489
- */
5490
- const findJoinedMatches = (content, joined, searchStart, searchEnd, limit = Infinity) => {
5491
- const matches = [];
5492
- if (!content || searchStart >= searchEnd) return matches;
5493
- let idx = joined.indexOf(content, searchStart);
5494
- let count = 0;
5495
- while (idx >= 0 && idx < searchEnd && count < limit) {
5496
- matches.push({
5497
- end: idx + content.length - 1,
5498
- start: idx
5499
- });
5500
- idx = joined.indexOf(content, idx + 1);
5501
- if (idx >= searchEnd) break;
5502
- count++;
5503
- }
5504
- return matches;
5994
/**
 * Convert one line-start analysis result into a rule suggestion.
 *
 * @param pattern - Analysis entry exposing `pattern`, `count` and `examples` (each example has `pageId` and `line`).
 * @returns Suggestion with confidence, example, reason and a drafted rule; the
 * example falls back to page id -1 and empty text when no example exists.
 */
const createLineStartSuggestion = (pattern) => {
	const source = "line-start";
	const signature = pattern.pattern;
	const tokens = extractTokenNames(signature);
	const firstExample = pattern.examples[0];
	return {
		confidence: getSuggestionConfidence(tokens, source),
		count: pattern.count,
		example: {
			pageId: firstExample?.pageId ?? -1,
			text: firstExample?.line ?? ""
		},
		pattern: signature,
		reason: getSuggestionReason(tokens, source),
		rule: createRule(signature, tokens, source),
		source
	};
};
5506
- /**
5507
- * Verifies that a matched segment falls within the allowed maxTerms/maxPages constraints.
5508
- * Checks both implicit spans (calculated from match end) and explicit segment.to claims.
5509
- */
5510
- const checkMaxPagesViolation = (segment, segmentIndex, maxPages, matchEnd, _expectedBoundaryEnd, boundaries) => {
5511
- const actualToId = findBoundaryIdForOffset(matchEnd, boundaries);
5512
- if (actualToId === void 0) return [];
5513
- if (maxPages === 0) {
5514
- if (actualToId !== segment.from) return [createIssue("max_pages_violation", segment, segmentIndex, {
5515
- actual: {
5516
- from: segment.from,
5517
- to: actualToId
5518
- },
5519
- evidence: `Segment spans pages ${segment.from}-${actualToId} in joined content (maxPages=0).`,
5520
- expected: {
5521
- from: segment.from,
5522
- to: segment.from
5523
- }
5524
- })];
6009
/**
 * Convert one repeating-sequence analysis result into a rule suggestion.
 *
 * @param pattern - Analysis entry exposing `pattern`, `count` and `examples` (each example has `pageId` and `text`).
 * @returns Suggestion with confidence, example, reason and a drafted rule; the
 * example falls back to page id -1 and empty text when no example exists.
 */
const createRepeatingSuggestion = (pattern) => {
	const source = "repeating-sequence";
	const shape = "sequence";
	const signature = pattern.pattern;
	const tokens = extractTokenNames(signature);
	const firstExample = pattern.examples[0];
	return {
		confidence: getSuggestionConfidence(tokens, shape),
		count: pattern.count,
		example: {
			pageId: firstExample?.pageId ?? -1,
			text: firstExample?.text ?? ""
		},
		pattern: signature,
		reason: getSuggestionReason(tokens, source),
		rule: createRule(signature, tokens, shape),
		source
	};
};
6024
/**
 * Map a confidence label to a sortable number.
 *
 * @param confidence - "high", "medium", or anything else.
 * @returns 3 for "high", 2 for "medium", 1 otherwise.
 */
const confidenceScore = (confidence) => {
	switch (confidence) {
		case "high": return 3;
		case "medium": return 2;
		default: return 1;
	}
};
6025
/**
 * Weight a suggestion source according to the assessed book mode.
 *
 * @param mode - "structured", "continuous", or any other value (treated as mixed).
 * @param source - "line-start" or "repeating-sequence".
 * @returns 3 for the preferred source of the mode; otherwise 1 (structured/continuous) or 2 (mixed).
 */
const sourceScore = (mode, source) => {
	const isLineStart = source === "line-start";
	switch (mode) {
		case "structured": return isLineStart ? 3 : 1;
		case "continuous": return source === "repeating-sequence" ? 3 : 1;
		default: return isLineStart ? 3 : 2;
	}
};
6030
/**
 * Sort comparator for rule suggestions.
 *
 * Ordering keys, each used only when the previous ones tie: source weight for
 * the given mode (descending), confidence (descending), count (descending),
 * then pattern text via `localeCompare` (ascending).
 *
 * @param mode - Assessed book mode, forwarded to `sourceScore`.
 * @param left - First suggestion.
 * @param right - Second suggestion.
 * @returns Negative when `left` sorts first, positive when `right` does.
 */
const compareSuggestions = (mode, left, right) => {
	const bySource = sourceScore(mode, right.source) - sourceScore(mode, left.source);
	if (bySource) return bySource;
	const byConfidence = confidenceScore(right.confidence) - confidenceScore(left.confidence);
	if (byConfidence) return byConfidence;
	const byCount = right.count - left.count;
	if (byCount) return byCount;
	return left.pattern.localeCompare(right.pattern);
};
6031
+ const dedupeSuggestions = (suggestions) => {
6032
+ const seen = /* @__PURE__ */ new Set();
6033
+ const deduped = [];
6034
+ for (const suggestion of suggestions) {
6035
+ const key = JSON.stringify(suggestion.rule);
6036
+ if (seen.has(key)) continue;
6037
+ seen.add(key);
6038
+ deduped.push(suggestion);
5525
6039
  }
5526
- if (segment.to !== void 0) {
5527
- if (actualToId > segment.to) return [createIssue("max_pages_violation", segment, segmentIndex, {
5528
- actual: {
5529
- from: segment.from,
5530
- to: actualToId
5531
- },
5532
- evidence: `Segment content ends on page ${actualToId} but segment.to is ${segment.to}.`,
5533
- expected: {
5534
- from: segment.from,
5535
- to: segment.to
5536
- }
5537
- })];
5538
- } else if (maxPages !== void 0) {
5539
- const span = actualToId - segment.from;
5540
- if (span > maxPages) return [createIssue("max_pages_violation", segment, segmentIndex, {
5541
- actual: {
5542
- from: segment.from,
5543
- to: actualToId
6040
+ return deduped;
6041
+ };
6042
/**
 * Classify the book as "structured", "continuous" or "mixed".
 *
 * Line structure counts as dense when the pages hold more newline-separated
 * lines than there are pages. The top counts of both analyses are compared to
 * decide which signal dominates.
 *
 * @param pages - Pages with a string `content`.
 * @param lineStarts - Line-start analysis results; only the first entry's `count` is read.
 * @param repeatingSequences - Repeating-sequence results; only the first entry's `count` is read.
 * @returns `{ mode, reason }`.
 */
const chooseAssessment = (pages, lineStarts, repeatingSequences) => {
	let totalLines = 0;
	for (const page of pages) totalLines += page.content.split("\n").length;
	const lineBreaksAreDense = totalLines > pages.length;
	const strongestLineStart = lineStarts[0]?.count ?? 0;
	const strongestSequence = repeatingSequences[0]?.count ?? 0;
	const lineStartsDominate = strongestLineStart >= Math.max(2, strongestSequence);
	if (lineStartsDominate && lineBreaksAreDense) {
		return {
			mode: "structured",
			reason: "Frequent repeated line-start markers dominate and the text has strong line structure."
		};
	}
	const sequencesDominate = strongestSequence > strongestLineStart;
	if (sequencesDominate && !lineBreaksAreDense) {
		return {
			mode: "continuous",
			reason: "Tokenized prose sequences are stronger than line-start signals and the pages are mostly continuous text."
		};
	}
	return {
		mode: "mixed",
		reason: "The book shows both structural line markers and inline recurring sequences."
	};
};
6060
/**
 * Assemble the recommended segmentation options from ranked suggestions.
 *
 * Suggestions from the mode's primary source are preferred ("repeating-sequence"
 * for continuous books, "line-start" otherwise); if none exist, all suggestions
 * are used. The first `maxRules` rules are passed through `optimizeRules`.
 *
 * @param mode - Assessed book mode.
 * @param suggestions - Ranked rule suggestions.
 * @param maxRules - Maximum number of rules to keep before optimization.
 * @param preprocess - Transform names; included in the options only when non-empty.
 * @returns `{ optimization, options }`, where `options` carries `pageJoiner: "newline"` for line-start based recommendations.
 */
const getRecommendedOptions = (mode, suggestions, maxRules, preprocess) => {
	const prefersLineStarts = mode !== "continuous";
	const primarySource = prefersLineStarts ? "line-start" : "repeating-sequence";
	const preferred = suggestions.filter(({ source }) => source === primarySource);
	const pool = preferred.length > 0 ? preferred : suggestions;
	const selectedRules = pool.slice(0, maxRules).map(({ rule }) => rule);
	const { mergedCount, rules } = optimizeRules(selectedRules);
	const baseOptions = prefersLineStarts ? {
		pageJoiner: "newline",
		rules
	} : { rules };
	const options = preprocess.length > 0 ? {
		...baseOptions,
		preprocess
	} : baseOptions;
	return {
		optimization: {
			mergedCount,
			optimizedRuleCount: rules.length,
			originalRuleCount: selectedRules.length
		},
		options
	};
};
6081
+ const evaluateRecommendation = (pages, options, sampleSegments) => {
6082
+ if ((options.rules?.length ?? 0) === 0) return { segmentSamples: [] };
6083
+ try {
6084
+ const segments = segmentPages(pages, options);
6085
+ const validation = validateSegments(pages, options, segments);
6086
+ const totalLength = segments.reduce((sum, segment) => sum + segment.content.length, 0);
6087
+ const multiPageSegments = segments.filter((segment) => segment.to !== void 0 && segment.to !== segment.from).length;
6088
+ return {
6089
+ evaluation: {
6090
+ averageSegmentLength: segments.length === 0 ? 0 : totalLength / segments.length,
6091
+ maxSegmentLength: Math.max(0, ...segments.map((segment) => segment.content.length)),
6092
+ multiPageSegments,
6093
+ segmentCount: segments.length,
6094
+ validation
5544
6095
  },
5545
- evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
5546
- expected: {
5547
- from: segment.from,
5548
- to: segment.from + maxPages
5549
- }
5550
- })];
6096
+ segmentSamples: segments.slice(0, sampleSegments)
6097
+ };
6098
+ } catch {
6099
+ return { segmentSamples: [] };
5551
6100
  }
5552
- return [];
6101
+ };
6102
/**
 * Rewrite a single-pattern `lineStartsAfter` rule as an anchored template rule.
 *
 * Only `split`, the optional `meta`, and the pattern (prefixed with `^`) are
 * carried over to the result.
 *
 * @param rule - Rule object to convert.
 * @returns The template rule, or `null` when `rule.lineStartsAfter` is not an array of exactly one pattern.
 */
const toTemplateFallbackRule = (rule) => {
	const hasSinglePattern = "lineStartsAfter" in rule && Array.isArray(rule.lineStartsAfter) && rule.lineStartsAfter.length === 1;
	if (!hasSinglePattern) return null;
	const anchored = {
		split: rule.split,
		template: `^${rule.lineStartsAfter[0]}`
	};
	return rule.meta ? {
		meta: rule.meta,
		...anchored
	} : anchored;
};
6113
/**
 * Build an alternative options object in which every rule is expressed as an
 * anchored template (see `toTemplateFallbackRule`).
 *
 * @param options - Recommended options; `rules` and `preprocess` are read.
 * @returns Options with `pageJoiner: "newline"` and the converted rules (plus
 * `preprocess` when present), or `null` when there are no rules or any rule
 * cannot be converted.
 */
const getTemplateFallbackOptions = (options) => {
	const sourceRules = options.rules ?? [];
	if (sourceRules.length === 0) return null;
	const fallbackRules = sourceRules.map(toTemplateFallbackRule).filter((rule) => rule !== null);
	// All-or-nothing: a partially converted rule set is not a usable fallback.
	if (fallbackRules.length !== sourceRules.length) return null;
	if (options.preprocess) return {
		pageJoiner: "newline",
		preprocess: options.preprocess,
		rules: fallbackRules
	};
	return {
		pageJoiner: "newline",
		rules: fallbackRules
	};
};
6126
+ const shouldUseTemplateFallback = (primary, fallback) => {
6127
+ if (!fallback) return false;
6128
+ if (!primary) return true;
6129
+ return fallback.segmentCount > primary.segmentCount && fallback.validation.summary.issues <= primary.validation.summary.issues;
6130
+ };
6131
/**
 * Suggest a breakpoint configuration when segments are likely to get large.
 *
 * A single suggestion is returned when the evaluation reports multi-page
 * segments, a maximum segment length above 4000 characters, or when the
 * average page length exceeds 2500 characters.
 *
 * @param pages - Pages with a string `content`.
 * @param evaluation - Optional evaluation exposing `multiPageSegments` and `maxSegmentLength`.
 * @returns An array with zero or one breakpoint suggestion.
 */
const getBreakpointSuggestions = (pages, evaluation) => {
	let totalChars = 0;
	for (const page of pages) totalChars += page.content.length;
	const averagePageLength = pages.length === 0 ? 0 : totalChars / pages.length;
	const spansPages = (evaluation?.multiPageSegments ?? 0) > 0;
	const hasOversizedSegment = (evaluation?.maxSegmentLength ?? 0) > 4e3;
	const hasLongPages = averagePageLength > 2500;
	if (!spansPages && !hasOversizedSegment && !hasLongPages) return [];
	return [{
		breakpoints: DEFAULT_BREAKPOINTS,
		maxPages: 1,
		prefer: "longer",
		reason: "Some segments are likely to grow large enough that sentence punctuation plus page-boundary fallback is worth testing."
	}];
};
5554
6141
  /**
5555
- * Handles validation when content is not found in the expected boundary window.
5556
- * Fallback strategy: search entire document if segment matches existing content elsewhere.
6142
+ * Generate a machine-readable draft segmentation report for AI agents.
6143
+ *
6144
+ * This helper is intentionally deterministic: it inspects pages, drafts
6145
+ * candidate rules, validates them, and evaluates its own recommendation.
5557
6146
  */
5558
- const handleMissingBoundary = (segment, segmentIndex, joined, boundaries, pageMap) => {
5559
- const matches = findJoinedMatches(segment.content, joined, 0, joined.length, 1);
5560
- if (matches.length === 0) return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content not found in any page content." }, pageMap)];
5561
- const match = matches[0];
5562
- const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
5563
- const actualToId = findBoundaryIdForOffset(match.end, boundaries);
5564
- return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
5565
- actual: {
5566
- from: segment.from,
5567
- to: segment.to
5568
- },
5569
- evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
5570
- expected: {
5571
- from: actualFromId,
5572
- to: actualToId
6147
+ const suggestSegmentationOptions = (pages, options = {}) => {
6148
+ const resolved = resolveOptions(pages, options);
6149
+ const detections = getDetections(pages);
6150
+ const preprocessSuggestions = getPreprocessSuggestions(detections);
6151
+ const preprocess = preprocessSuggestions.map((suggestion) => suggestion.transform);
6152
+ const lineStarts = analyzeCommonLineStarts(pages, {
6153
+ minCount: resolved.minLineStartCount,
6154
+ sortBy: "count",
6155
+ topK: resolved.topLineStarts
6156
+ });
6157
+ const repeatingSequences = analyzeRepeatingSequences(pages, {
6158
+ maxElements: 3,
6159
+ minCount: resolved.minRepeatingCount,
6160
+ minElements: 1,
6161
+ topK: resolved.topRepeatingSequences
6162
+ });
6163
+ const assessment = chooseAssessment(pages, lineStarts, repeatingSequences);
6164
+ const lineSuggestions = lineStarts.map(createLineStartSuggestion);
6165
+ const repeatingSuggestions = repeatingSequences.map(createRepeatingSuggestion);
6166
+ const ruleSuggestions = dedupeSuggestions([...lineSuggestions, ...repeatingSuggestions]).sort((left, right) => compareSuggestions(assessment.mode, left, right));
6167
+ const { optimization, options: recommendedOptions } = getRecommendedOptions(assessment.mode, ruleSuggestions, resolved.maxRules, preprocess);
6168
+ const primary = evaluateRecommendation(pages, recommendedOptions, resolved.sampleSegments);
6169
+ const fallbackOptions = getTemplateFallbackOptions(recommendedOptions);
6170
+ const fallback = fallbackOptions ? evaluateRecommendation(pages, fallbackOptions, resolved.sampleSegments) : void 0;
6171
+ const finalOptions = shouldUseTemplateFallback(primary.evaluation, fallback?.evaluation) && fallbackOptions ? fallbackOptions : recommendedOptions;
6172
+ const finalEvaluation = finalOptions === fallbackOptions && fallback ? fallback : primary;
6173
+ const ruleValidation = validateRules(finalOptions.rules ?? []).filter((result) => result !== void 0);
6174
+ const ruleValidationErrors = formatValidationReport(ruleValidation);
6175
+ return {
6176
+ assessment,
6177
+ breakpointSuggestions: getBreakpointSuggestions(pages, finalEvaluation.evaluation),
6178
+ evaluation: finalEvaluation.evaluation,
6179
+ lineStarts,
6180
+ optimization,
6181
+ preprocess: {
6182
+ detections,
6183
+ suggestions: preprocessSuggestions
5573
6184
  },
5574
- matchIndex: match.start
5575
- }, pageMap)];
6185
+ recommendedOptions: finalOptions,
6186
+ repeatingSequences,
6187
+ ruleSuggestions,
6188
+ ruleValidation,
6189
+ ruleValidationErrors,
6190
+ segmentSamples: finalEvaluation.segmentSamples
6191
+ };
5576
6192
  };
6193
+ //#endregion
6194
+ //#region src/detection.ts
6195
/**
 * Order in which token patterns are tried during detection.
 *
 * More specific tokens come first so that a broader token cannot claim part of
 * their match (e.g. `raqms` precedes `raqm`, so a multi-digit number is matched
 * as a whole rather than digit by digit).
 *
 * Tokens missing from this list are appended afterwards in alphabetical order.
 */
const TOKEN_PRIORITY_ORDER = [
	// Structural headings and phrases
	"basmalah", "kitab", "bab", "fasl", "naql", "rumuz",
	// Numbering
	"numbered", "raqms", "raqm",
	// Punctuation and single letters
	"tarqim", "bullet", "dash", "harf"
];
5577
6216
  /**
5578
- * Performs a widened search when the direct check fails.
5579
- * Includes a small buffer around the expected position, and optionally a full-document search for short segments.
6217
+ * Gets the token detection priority order.
6218
+ * Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
5580
6219
  */
5581
- const handleFallbackSearch = (segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions) => {
5582
- const content = segment.content;
5583
- const bufferSize = 1e3;
5584
- const rawMatches = findJoinedMatches(content, joined, Math.max(0, searchStart - bufferSize), Math.min(joined.length, searchEnd + bufferSize), 5);
5585
- if (rawMatches.length === 0) {
5586
- const threshold = validationOptions?.fullSearchThreshold ?? 500;
5587
- if (content.length < threshold) {
5588
- const fullMatches = findJoinedMatches(content, joined, 0, joined.length, 50);
5589
- const validMatch = fullMatches.find((m) => {
5590
- return findBoundaryIdForOffset(m.start, boundaries) === segment.from;
5591
- });
5592
- if (validMatch) return checkMaxPagesViolation(segment, segmentIndex, maxPages, validMatch.end, expectedBoundary.end, boundaries);
5593
- if (fullMatches.length > 0) {
5594
- const match = fullMatches[0];
5595
- const actualFromId = findBoundaryIdForOffset(match.start, boundaries);
5596
- const actualToId = findBoundaryIdForOffset(match.end, boundaries);
5597
- return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
5598
- actual: {
5599
- from: segment.from,
5600
- to: segment.to
5601
- },
5602
- evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
5603
- expected: {
5604
- from: actualFromId,
5605
- to: actualToId
5606
- },
5607
- matchIndex: match.start
5608
- }, pageMap)];
5609
- }
5610
- }
5611
- return [createIssue("content_not_found", segment, segmentIndex, {
5612
- evidence: `Segment content (${content.length} chars) not found in expected window.`,
5613
- hint: "Check page boundary attribution in segmenter.ts."
5614
- }, pageMap)];
5615
- }
5616
- const alignedMatches = rawMatches.filter((m) => m.start >= expectedBoundary.start && m.start <= expectedBoundary.end);
5617
- if (alignedMatches.length > 0) {
5618
- const primary = alignedMatches[0];
5619
- return checkMaxPagesViolation(segment, segmentIndex, maxPages, primary.end, expectedBoundary.end, boundaries);
5620
- }
5621
- const primary = rawMatches[0];
5622
- const actualFromId = findBoundaryIdForOffset(primary.start, boundaries);
5623
- const actualToId = findBoundaryIdForOffset(primary.end, boundaries);
5624
- return [createIssue("page_attribution_mismatch", segment, segmentIndex, {
5625
- actual: {
5626
- from: segment.from,
5627
- to: segment.to
5628
- },
5629
- evidence: `Content found in joined content at page ${actualFromId}, but segment.from=${segment.from}.`,
5630
- expected: {
5631
- from: actualFromId,
5632
- to: actualToId
5633
- },
5634
- matchIndex: primary.start
5635
- }, pageMap)];
6220
+ const getTokenPriority = () => {
6221
+ const allTokens = getAvailableTokens();
6222
+ const prioritized = TOKEN_PRIORITY_ORDER.filter((t) => allTokens.includes(t));
6223
+ const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
6224
+ return [...prioritized, ...remaining];
6225
+ };
6226
/**
 * Check that a rumuz match is not glued to surrounding Arabic text.
 *
 * The character before the match must be absent, whitespace, an opening
 * bracket, or outside the U+0600–U+06FF block. The character after it must be
 * absent, whitespace, one of the listed closing delimiters, or outside that
 * block.
 *
 * @param text - Text containing the match.
 * @param startIndex - Index of the first matched character.
 * @param endIndex - Index just past the last matched character.
 * @returns `true` when both neighbours satisfy the conditions above.
 */
const isRumuzStandalone = (text, startIndex, endIndex) => {
	const arabicBlock = /[\u0600-\u06FF]/u;
	const whitespace = /\s/u;
	const previous = startIndex > 0 ? text[startIndex - 1] : "";
	const following = endIndex < text.length ? text[endIndex] : "";
	const leftIsBoundary = !previous || whitespace.test(previous) || /[([{]/u.test(previous) || !arabicBlock.test(previous);
	if (!leftIsBoundary) return false;
	return !following || whitespace.test(following) || /[::\-–—ـ،؛.?!؟)\]}]/u.test(following) || !arabicBlock.test(following);
};
5637
6237
  /**
5638
- * Calculates the search range end index based on segment.to or strict bounds.
6238
+ * Analyzes text and returns all detected token patterns with their positions.
6239
+ * Patterns are detected in priority order to avoid partial matches.
6240
+ *
6241
+ * @param text - The text to analyze for token patterns
6242
+ * @returns Array of detected patterns sorted by position
6243
+ *
6244
+ * @example
6245
+ * detectTokenPatterns("٣٤ - حدثنا")
6246
+ * // Returns: [
6247
+ * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
6248
+ * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
6249
+ * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
6250
+ * // ]
5639
6251
  */
5640
- const getSearchRange = (segment, expectedBoundary, boundaryMap, joinedLength) => {
5641
- let searchEnd = expectedBoundary.end + 1;
5642
- if (segment.to !== void 0) {
5643
- const endBoundary = boundaryMap.get(segment.to);
5644
- if (endBoundary) searchEnd = endBoundary.end + 1;
5645
- else searchEnd = Math.min(joinedLength, expectedBoundary.end + 5e4);
6252
+ const detectTokenPatterns = (text) => {
6253
+ if (!text) return [];
6254
+ const results = [];
6255
+ const coveredRanges = [];
6256
+ const isPositionCovered = (start, end) => {
6257
+ return coveredRanges.some(([s, e]) => start >= s && start < e || end > s && end <= e || start <= s && end >= e);
6258
+ };
6259
+ for (const tokenName of getTokenPriority()) {
6260
+ const pattern = TOKEN_PATTERNS[tokenName];
6261
+ if (!pattern) continue;
6262
+ try {
6263
+ const regex = new RegExp(`(${pattern})`, "gu");
6264
+ let match;
6265
+ while ((match = regex.exec(text)) !== null) {
6266
+ const startIndex = match.index;
6267
+ const endIndex = startIndex + match[0].length;
6268
+ if (tokenName === "rumuz" && !isRumuzStandalone(text, startIndex, endIndex)) continue;
6269
+ if (isPositionCovered(startIndex, endIndex)) continue;
6270
+ results.push({
6271
+ endIndex,
6272
+ index: startIndex,
6273
+ match: match[0],
6274
+ token: tokenName
6275
+ });
6276
+ coveredRanges.push([startIndex, endIndex]);
6277
+ }
6278
+ } catch {}
5646
6279
  }
5647
- return searchEnd;
6280
+ return results.sort((a, b) => a.index - b.index);
5648
6281
  };
5649
6282
  /**
5650
- * Validates attribution for a single segment by searching for its content in the joined text.
5651
- * Returns issues if content is missing, mis-attributed, or violates page limits.
6283
+ * Generates a template pattern from text using detected tokens.
6284
+ * Replaces matched portions with {{token}} syntax.
6285
+ *
6286
+ * @param text - Original text
6287
+ * @param detected - Array of detected patterns from detectTokenPatterns
6288
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
6289
+ *
6290
+ * @example
6291
+ * const detected = detectTokenPatterns("٣٤ - ");
6292
+ * generateTemplateFromText("٣٤ - ", detected);
6293
+ * // Returns: "{{raqms}} {{dash}} "
5652
6294
  */
5653
- const getAttributionIssues = (segment, segmentIndex, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions) => {
5654
- if (!segment.content) return [createIssue("content_not_found", segment, segmentIndex, { evidence: "Segment content is empty." }, pageMap)];
5655
- const expectedBoundary = boundaryMap.get(segment.from);
5656
- if (!expectedBoundary) return handleMissingBoundary(segment, segmentIndex, joined, boundaries, pageMap);
5657
- const searchEnd = getSearchRange(segment, expectedBoundary, boundaryMap, joined.length);
5658
- const searchStart = expectedBoundary.start;
5659
- const idx = joined.indexOf(segment.content, searchStart);
5660
- if (idx !== -1 && idx < searchEnd) return checkMaxPagesViolation(segment, segmentIndex, maxPages, idx + segment.content.length - 1, expectedBoundary.end, boundaries);
5661
- return handleFallbackSearch(segment, segmentIndex, joined, searchStart, searchEnd, expectedBoundary, boundaries, pageMap, maxPages, validationOptions);
6295
/**
 * Build a template by swapping each detected span for its `{{token}}` placeholder.
 *
 * Spans are substituted from the highest start index down to the lowest, so the
 * offsets of spans that are still pending stay valid while the string changes length.
 *
 * @param text - Original text.
 * @param detected - Detected spans with `index`, `endIndex` and `token`.
 * @returns The templated text, or `text` unchanged when it is empty or nothing was detected.
 */
const generateTemplateFromText = (text, detected) => {
	if (!text || detected.length === 0) return text;
	const rightToLeft = [...detected].sort((a, b) => b.index - a.index);
	return rightToLeft.reduce((partial, span) => {
		const head = partial.slice(0, span.index);
		const tail = partial.slice(span.endIndex);
		return `${head}{{${span.token}}}${tail}`;
	}, text);
};
5663
6302
  /**
5664
- * Performs purely static checks on the segment metadata (Ids and spans) before expensive content searching.
6303
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
6304
+ *
6305
+ * @param detected - Array of detected patterns
6306
+ * @returns Suggested pattern type and whether to use fuzzy matching
5665
6307
  */
5666
- const checkStaticMaxPages = (segment, index, maxPages) => {
5667
- if (maxPages === void 0 || segment.to === void 0) return null;
5668
- if (maxPages === 0) {
5669
- if (segment.to !== segment.from) return createIssue("max_pages_violation", segment, index, {
5670
- evidence: "maxPages=0 requires all segments to stay within one page.",
5671
- expected: {
5672
- from: segment.from,
5673
- to: segment.from
5674
- },
5675
- hint: "Check boundary detection in breakpoint-utils.ts."
5676
- });
5677
- return null;
5678
- }
5679
- const span = segment.to - segment.from;
5680
- if (span > maxPages) return createIssue("max_pages_violation", segment, index, {
5681
- evidence: `Segment spans ${span} pages (maxPages=${maxPages}).`,
5682
- expected: {
5683
- from: segment.from,
5684
- to: segment.from + maxPages
5685
- },
5686
- hint: "Check breakpoint windowing and page attribution in breakpoint-processor.ts."
5687
- });
5688
- return null;
6308
/**
 * Pick a pattern type, fuzzy flag and meta type for an auto-generated rule.
 *
 * - Any of `basmalah`, `kitab`, `bab`, `fasl` → fuzzy `lineStartsWith`, with the
 *   meta type taken from the first detected `kitab`/`bab`/`fasl` token, or
 *   "chapter" when there is none.
 * - Otherwise any of `raqms`, `raqm`, `numbered` → non-fuzzy `lineStartsAfter`
 *   with meta type "hadith".
 * - Otherwise non-fuzzy `lineStartsAfter` without a meta type.
 *
 * @param detected - Detected patterns; only each entry's `token` is read.
 * @returns Suggested configuration object.
 */
const suggestPatternConfig = (detected) => {
	const headingTokens = ["kitab", "bab", "fasl"];
	const numberTokens = ["raqms", "raqm", "numbered"];
	const names = detected.map((entry) => entry.token);
	const isStructural = names.some((name) => name === "basmalah" || headingTokens.includes(name));
	if (isStructural) {
		const heading = names.find((name) => headingTokens.includes(name));
		return {
			fuzzy: true,
			metaType: heading || "chapter",
			patternType: "lineStartsWith"
		};
	}
	const isNumbered = names.some((name) => numberTokens.includes(name));
	if (isNumbered) {
		return {
			fuzzy: false,
			metaType: "hadith",
			patternType: "lineStartsAfter"
		};
	}
	return {
		fuzzy: false,
		patternType: "lineStartsAfter"
	};
};
5690
6339
  /**
5691
- * Validates a list of segments against the source pages.
5692
- * checks for:
5693
- * - Page existence (invalid IDs)
5694
- * - Content fidelity (content must exist in pages)
5695
- * - Page attribution (from/to must match content location)
5696
- * - Page constraints (maxPages violations)
6340
+ * Analyzes text and generates a complete suggested rule configuration.
5697
6341
  *
5698
- * @param pages Input pages used for segmentation
5699
- * @param options Operations used during segmentation (for preprocessing/joining consistency)
5700
- * @param segments The output segments to validate
5701
- * @param validationOptions Optional settings for validation behavior
5702
- * @returns A detailed validation report
6342
+ * @param text - Highlighted text from the page
6343
+ * @returns Suggested rule configuration or null if no patterns detected
5703
6344
  */
5704
- const validateSegments = (pages, options, segments, validationOptions) => {
5705
- const normalizedPages = normalizePages(pages, options);
5706
- const { boundaries, joined } = buildJoinedContent(normalizedPages, options.pageJoiner === "newline" ? "\n" : " ");
5707
- const boundaryMap = /* @__PURE__ */ new Map();
5708
- const pageMap = /* @__PURE__ */ new Map();
5709
- for (const b of boundaries) boundaryMap.set(b.id, b);
5710
- for (const p of normalizedPages) pageMap.set(p.id, p);
5711
- const pageIds = new Set(normalizedPages.map((p) => p.id));
5712
- const maxPages = options.maxPages;
5713
- const issues = [];
5714
- for (let i = 0; i < segments.length; i++) {
5715
- const segment = segments[i];
5716
- if (!pageIds.has(segment.from)) {
5717
- issues.push(createIssue("page_not_found", segment, i));
5718
- continue;
5719
- }
5720
- if (segment.to !== void 0 && !pageIds.has(segment.to)) issues.push(createIssue("page_not_found", segment, i, { evidence: `Segment.to=${segment.to} does not exist in input pages.` }));
5721
- const staticMaxPageIssue = checkStaticMaxPages(segment, i, maxPages);
5722
- if (staticMaxPageIssue) issues.push(staticMaxPageIssue);
5723
- const attributionIssues = getAttributionIssues(segment, i, maxPages, joined, boundaries, boundaryMap, pageMap, validationOptions);
5724
- issues.push(...attributionIssues);
5725
- }
5726
- const errors = issues.filter((issue) => issue.severity === "error").length;
5727
- const warnings = issues.filter((issue) => issue.severity === "warn").length;
6345
+ const analyzeTextForRule = (text) => {
6346
+ const detected = detectTokenPatterns(text);
6347
+ if (detected.length === 0) return null;
5728
6348
  return {
5729
- issues,
5730
- ok: issues.length === 0,
5731
- summary: {
5732
- errors,
5733
- issues: issues.length,
5734
- pageCount: pages.length,
5735
- segmentCount: segments.length,
5736
- warnings
5737
- }
6349
+ detected,
6350
+ template: generateTemplateFromText(text, detected),
6351
+ ...suggestPatternConfig(detected)
5738
6352
  };
5739
6353
  };
5740
6354
  //#endregion
5741
- export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
6355
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
5742
6356
 
5743
6357
  //# sourceMappingURL=index.mjs.map