flappa-doormal 2.19.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +40 -11
- package/README.md +292 -11
- package/dist/index.d.mts +295 -70
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1386 -251
- package/dist/index.mjs.map +1 -1
- package/package.json +7 -3
package/dist/index.mjs
CHANGED
|
@@ -148,7 +148,7 @@ numbered: "{{raqms}} {{dash}} " };
|
|
|
148
148
|
const expandCompositeTokensInTemplate = (template) => {
|
|
149
149
|
let out = template;
|
|
150
150
|
for (let i = 0; i < 10; i++) {
|
|
151
|
-
const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => COMPOSITE_TOKENS[tokenName]
|
|
151
|
+
const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => tokenName in COMPOSITE_TOKENS ? COMPOSITE_TOKENS[tokenName] : m);
|
|
152
152
|
if (next === out) break;
|
|
153
153
|
out = next;
|
|
154
154
|
}
|
|
@@ -162,7 +162,8 @@ const expandCompositeTokensInTemplate = (template) => {
|
|
|
162
162
|
* @returns Expanded pattern with base tokens replaced
|
|
163
163
|
* @internal
|
|
164
164
|
*/
|
|
165
|
-
const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => BASE_TOKENS[tokenName]
|
|
165
|
+
// Expands known `{{name}}` base tokens; unknown token names are left as-is.
const expandBaseTokens = (template) =>
	template.replace(/\{\{(\w+)\}\}/g, (_, name) => {
		return name in BASE_TOKENS ? BASE_TOKENS[name] : `{{${name}}}`;
	});
|
|
166
|
+
// Composite tokens with their base-token placeholders expanded once at module load.
const EXPANDED_COMPOSITE_TOKENS = Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([name, template]) => [name, expandBaseTokens(template)]));
|
|
166
167
|
/**
|
|
167
168
|
* Token definitions mapping human-readable token names to regex patterns.
|
|
168
169
|
*
|
|
@@ -190,7 +191,7 @@ const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, to
|
|
|
190
191
|
*/
|
|
191
192
|
const TOKEN_PATTERNS = {
|
|
192
193
|
...BASE_TOKENS,
|
|
193
|
-
...
|
|
194
|
+
...EXPANDED_COMPOSITE_TOKENS
|
|
194
195
|
};
|
|
195
196
|
/**
|
|
196
197
|
* Regex pattern for matching tokens with optional named capture syntax.
|
|
@@ -283,8 +284,8 @@ const expandTokenLiteral = (literal, opts) => {
|
|
|
283
284
|
if (!parsed) return literal;
|
|
284
285
|
const { tokenName, captureName } = parsed;
|
|
285
286
|
if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
|
|
287
|
+
if (!(tokenName in TOKEN_PATTERNS)) return literal;
|
|
286
288
|
let tokenPattern = TOKEN_PATTERNS[tokenName];
|
|
287
|
-
if (!tokenPattern) return literal;
|
|
288
289
|
tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
|
|
289
290
|
if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
|
|
290
291
|
return tokenPattern;
|
|
@@ -490,7 +491,7 @@ const applyTokenMappings = (template, mappings) => {
|
|
|
490
491
|
* // → '{{raqms}} {{dash}}'
|
|
491
492
|
*/
|
|
492
493
|
// Strips `:mapping` suffixes from `{{token:value}}` placeholders, keeping `{{token}}`.
const stripTokenMappings = (template) => {
	const mappingPattern = /\{\{([^:}]*)?:[^}]+\}\}/g;
	return template.replace(mappingPattern, (_match, tokenName) => {
		const bareName = tokenName ?? "";
		return `{{${bareName}}}`;
	});
};
|
|
495
496
|
//#endregion
|
|
496
497
|
//#region src/utils/textUtils.ts
|
|
@@ -1279,30 +1280,1315 @@ const analyzeTextForRule = (text) => {
|
|
|
1279
1280
|
};
|
|
1280
1281
|
};
|
|
1281
1282
|
//#endregion
|
|
1283
|
+
//#region src/dictionary/arabic-dictionary-rule.ts
|
|
1284
|
+
// Deduplicates words by their canonical (normalized) Arabic form, keeping the
// first original spelling of each and dropping words that normalize to "".
const uniqueCanonicalWords = (words) => {
	const byCanonical = new Map();
	for (const word of words) {
		const canonical = normalizeArabicForComparison(word);
		if (canonical && !byCanonical.has(canonical)) byCanonical.set(canonical, word);
	}
	return [...byCanonical.values()];
};
|
|
1295
|
+
// Builds a regex alternation of diacritic-insensitive, canonicalized stop words
// (empty string when no usable stop words remain after deduplication).
const buildStopAlternation = (stopWords) => {
	const canonicalWords = uniqueCanonicalWords(stopWords);
	return canonicalWords.length === 0
		? ""
		: canonicalWords.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
};
|
|
1300
|
+
// Assembles the headword body pattern: a lemma unit, optionally repeated across
// comma separators, with a negative lookahead guarding against stop words.
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
	const repeated = (body) => (allowCommaSeparated ? `${body}(?:\\s*[،,]\\s*${body})*` : body);
	if (!stopAlternation) return repeated(unit);
	const stopTail = allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern;
	return repeated(`(?!(?:${stopwordBody})${stopTail})${unit}`);
};
|
|
1305
|
+
// Wraps the headword body in a named capture followed by the colon marker,
// optionally allowing the headword to be parenthesized.
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
	const colonPart = allowWhitespaceBeforeColon ? "\\s*:" : ":";
	const captured = `(?<${captureName}>${headwordBody})`;
	const head = allowParenthesized ? `(?:\\(\\s*${captured}\\s*\\)|${captured})` : captured;
	return `${head}${colonPart}`;
};
|
|
1311
|
+
// Validates shared dictionary-entry options; throws on the first invalid value.
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
	const minIsValid = Number.isInteger(minLetters) && minLetters >= 1;
	if (!minIsValid) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
	const maxIsValid = Number.isInteger(maxLetters) && maxLetters >= minLetters;
	if (!maxIsValid) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
	// Capture names must be valid JS regex group identifiers.
	if (!/^[A-Za-z_]\w*$/.test(captureName)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
};
|
|
1316
|
+
// Compiles a dictionaryEntry option set into regex source plus its capture names.
// The pattern anchors either at a line start (after optional zero-width marks) or,
// when midLineSubentries is enabled, mid-line before a waw-prefixed headword.
const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
	validateDictionaryEntryOptions({
		captureName,
		maxLetters,
		minLetters
	});
	// Directional/zero-width characters tolerated at the start of a line.
	const zeroWidthRun = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
	const conjunction = `و${ARABIC_MARKS_CLASS}*`;
	const definiteArticle = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
	const coreLetters = `${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`;
	const headwordUnit = `(?:${conjunction})?(?:${definiteArticle})?${coreLetters}`;
	const stopAlternation = buildStopAlternation(stopWords);
	const headwordBody = buildHeadwordBody({
		allowCommaSeparated,
		colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
		stopAlternation,
		stopwordBody: stopAlternation ? `(?:${conjunction})?(?:${stopAlternation})` : "",
		unit: headwordUnit
	});
	const atLineStart = `(?:(?<=^)|(?<=\\n))${zeroWidthRun}`;
	const midLineLookahead = allowParenthesized
		? `(?<=\\s)(?=(?:\\(\\s*)?${conjunction}(?:${definiteArticle})?)`
		: `(?<=\\s)(?=${conjunction}(?:${definiteArticle})?)`;
	const finalCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
	const anchor = `(?:${atLineStart}${midLineSubentries ? `|${midLineLookahead}` : ""})`;
	const marker = buildBalancedMarker({
		allowParenthesized,
		allowWhitespaceBeforeColon,
		captureName: finalCaptureName,
		headwordBody
	});
	return {
		captureNames: [finalCaptureName],
		regex: anchor + marker
	};
};
|
|
1348
|
+
/**
 * Creates a reusable split rule for Arabic dictionary entries.
 *
 * The returned rule preserves authoring intent as a serializable
 * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
 * regex string. Stop words are canonicalized and deduplicated up front.
 *
 * @example
 * createArabicDictionaryEntryRule({
 *   stopWords: ['وقيل', 'ويقال', 'قال'],
 *   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
 * })
 *
 * @example
 * createArabicDictionaryEntryRule({
 *   allowParenthesized: true,
 *   allowWhitespaceBeforeColon: true,
 *   allowCommaSeparated: true,
 *   stopWords: ['الليث', 'العجاج'],
 * })
 *
 * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
 * whole-book dictionary segmentation. Keep this helper for advanced single-rule
 * composition inside a broader `SplitRule[]` pipeline.
 */
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
	// Fail fast on bad options so errors surface at authoring time, not match time.
	validateDictionaryEntryOptions({
		captureName,
		maxLetters,
		minLetters
	});
	const dictionaryEntry = {
		allowCommaSeparated,
		allowParenthesized,
		allowWhitespaceBeforeColon,
		captureName,
		maxLetters,
		midLineSubentries,
		minLetters,
		stopWords: uniqueCanonicalWords(stopWords)
	};
	return {
		dictionaryEntry,
		meta,
		pageStartPrevWordStoplist,
		samePagePrevWordStoplist
	};
};
|
|
1396
|
+
//#endregion
|
|
1397
|
+
//#region src/dictionary/heading-classifier.ts
|
|
1398
|
+
// Markdown heading marker emitted by convertContentToMarkdown().
const HEADING_PREFIX$1 = "## ";
// "harfs" token pattern with whitespace restricted to spaces/tabs (single-line).
const CODE_LINE_PATTERN$1 = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
const ARABIC_WORD_PATTERN = ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN;
// `lemma:` line opener — one or two Arabic words, optionally bracket-wrapped.
const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
// Waw-prefixed `وlemma:` sub-entries inside running text.
const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN$1})(?:[)\\]])?$`, "u");
const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
const COLON_NOISE_RE = /^.+:\s*.+$/u;
// Terms that introduce chapter-level headings.
const CHAPTER_TERMS = ["باب", "فصل", "كتاب", "حرف", "أبواب"];
// Prefixes marking structural opener/closer lines rather than entries.
const MARKER_PREFIXES = ["بسم الله", "توكلت على الله", "آخر كتاب", "ويتلوه"];
// Narration/quotation tokens that indicate prose noise, not headwords.
const NOISE_TOKENS = ["قال", "وقيل", "ويقال", "وفي", "يعني", "فإذا"];
|
|
1432
|
+
// Returns a fresh all-zero counter for every dictionary surface-match kind.
const emptyCounts = () => {
	const kinds = ["chapter", "cluster", "codeLine", "entry", "inlineSubentry", "lineEntry", "marker", "noise", "pairedForms"];
	return Object.fromEntries(kinds.map((kind) => [kind, 0]));
};
|
|
1443
|
+
// Removes wrapping brackets/braces/parens around a lemma and trims whitespace.
const extractWrappedLemma = (lemma) => {
	const unwrapped = lemma.replace(/^[[{(]+|[\])}]+$/gu, "");
	return unwrapped.trim();
};
|
|
1444
|
+
// Removes only leading bracket/brace/paren wrappers (and following spaces), then trims.
const stripLeadingWrappers = (text) => {
	const stripped = text.replace(/^[[{(]+\s*/u, "");
	return stripped.trim();
};
|
|
1445
|
+
// True when `text` equals `prefix` or starts with it followed by a delimiter
// (whitespace, punctuation, brackets, or a dash variant).
const isDelimitedPrefixMatch$1 = (text, prefix) => {
	if (!text.startsWith(prefix)) return false;
	const boundary = text[prefix.length];
	if (boundary === void 0) return true;
	return /[\s:،؛()[\]{}\-–—]/u.test(boundary);
};
|
|
1451
|
+
// A heading is a "code" heading if it matches the letter-code line pattern,
// or consists of exactly one single-character word.
const isCodeHeading = (text) => {
	if (CODE_LINE_RE.test(text)) return true;
	const tokens = text.trim().split(/\s+/u).filter(Boolean);
	if (tokens.length !== 1) return false;
	return (tokens[0]?.length ?? 0) === 1;
};
|
|
1456
|
+
// Heuristic: long colon-bearing lines or narration-token lines are prose noise;
// status headings (the musta'mal/muhmal family) are never treated as noise.
const looksLikeNoiseHeading = (text) => {
	const normalized = normalizeArabicForComparison(text);
	const wordCount = text.trim().split(/\s+/u).filter(Boolean).length;
	if (/(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\s])/u.test(text)) return false;
	if (wordCount >= 8 && COLON_NOISE_RE.test(text)) return true;
	const mentionsNoiseToken = NOISE_TOKENS.some((token) => normalized.includes(normalizeArabicForComparison(token)));
	return mentionsNoiseToken && wordCount >= 4;
};
|
|
1463
|
+
/**
 * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
 * Checks run in priority order: chapter > noise > code/marker > status/code-note
 * marker > cluster; anything left is an entry.
 */
const classifyDictionaryHeading = (line) => {
	const text = line.startsWith(HEADING_PREFIX$1) ? line.slice(3).trim() : line.trim();
	const unwrapped = stripLeadingWrappers(text);
	if (!text) return "noise";
	if (CHAPTER_HEADING_RE.test(text) || CHAPTER_TERMS.some((term) => isDelimitedPrefixMatch$1(normalizeArabicForComparison(unwrapped), normalizeArabicForComparison(term)))) return "chapter";
	if (looksLikeNoiseHeading(text)) return "noise";
	if (isCodeHeading(text)) return "marker";
	if (MARKER_PREFIXES.some((token) => normalizeArabicForComparison(unwrapped).startsWith(normalizeArabicForComparison(token)))) return "marker";
	if (STATUS_HEADING_RE.test(text) || CODE_NOTE_HEADING_RE.test(text)) return "marker";
	return CLUSTER_HEADING_RE.test(text) ? "cluster" : "entry";
};
|
|
1478
|
+
// Builds a heading match record; only "entry" headings carry a lemma
// (the heading text with the "## " prefix removed).
const createHeadingMatch = (kind, page, rawLine, lineNumber) => {
	const lemma = kind === "entry" ? rawLine.slice(3).trim() : void 0;
	return {
		kind,
		lemma,
		line: lineNumber,
		pageId: page.id,
		text: rawLine
	};
};
|
|
1485
|
+
// Builds a surface match record for non-heading matches (line entries,
// paired forms, code lines, inline sub-entries).
const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => {
	return {
		kind,
		lemma,
		line: lineNumber,
		pageId: page.id,
		text
	};
};
|
|
1492
|
+
// Records a classified heading match when the line is a markdown heading.
// Returns true when the line was consumed as a heading.
const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
	const isHeading = rawLine.startsWith(HEADING_PREFIX$1);
	if (isHeading) {
		matches.push(createHeadingMatch(classifyDictionaryHeading(rawLine), page, rawLine, lineNumber));
	}
	return isHeading;
};
|
|
1498
|
+
// Records a `lemma:` line entry when the line matches the plain-entry pattern.
const scanLineEntry = (page, rawLine, lineNumber, matches) => {
	const lemma = rawLine.match(PLAIN_ENTRY_RE)?.groups?.lemma;
	if (!lemma) return;
	matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lemma)));
};
|
|
1503
|
+
// Records a comma-separated paired-forms entry (e.g. two variant headwords).
const scanPairedForms = (page, rawLine, lineNumber, matches) => {
	const forms = rawLine.match(PAIRED_FORMS_RE)?.groups?.forms;
	if (!forms) return;
	matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, forms));
};
|
|
1508
|
+
// Records a letter-code line (optionally bracket-wrapped).
const scanCodeLine = (page, rawLine, lineNumber, matches) => {
	const codes = rawLine.match(CODE_LINE_RE)?.groups?.codes;
	if (!codes) return;
	matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codes));
};
|
|
1513
|
+
// Records every waw-prefixed inline sub-entry found within the line.
const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
	for (const found of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
		const lemma = found.groups?.lemma;
		if (!lemma) continue;
		matches.push(createSurfaceMatch("inlineSubentry", page, lemma, lineNumber, lemma));
	}
};
|
|
1519
|
+
/**
 * Extracts dictionary surface matches from a markdown page.
 * Heading lines are consumed exclusively; other lines may yield several kinds.
 */
const scanDictionaryMarkdownPage = (page) => {
	const matches = [];
	page.content.split(/\n/u).forEach((line, index) => {
		const trimmed = line?.trim() ?? "";
		if (!trimmed) return;
		const lineNumber = index + 1;
		if (scanHeadingLine(page, trimmed, lineNumber, matches)) return;
		scanLineEntry(page, trimmed, lineNumber, matches);
		scanPairedForms(page, trimmed, lineNumber, matches);
		scanCodeLine(page, trimmed, lineNumber, matches);
		scanInlineSubentries(page, trimmed, lineNumber, matches);
	});
	return matches;
};
|
|
1536
|
+
/**
 * Aggregates dictionary surface counts across markdown pages.
 * Returns per-kind totals together with the flat list of matches.
 */
const analyzeDictionaryMarkdownPages = (pages) => {
	const counts = emptyCounts();
	const matches = [];
	for (const page of pages) {
		for (const match of scanDictionaryMarkdownPage(page)) {
			counts[match.kind] += 1;
			matches.push(match);
		}
	}
	return {
		counts,
		matches
	};
};
|
|
1554
|
+
//#endregion
|
|
1555
|
+
//#region src/dictionary/profile.ts
|
|
1556
|
+
// Memoizes normalized profiles per input object (weakly, so inputs can be GC'd).
const normalizedProfileCache = /* @__PURE__ */ new WeakMap();
// Canonicalizes a stop-lemma word and strips surrounding punctuation/quotes.
const normalizeStopLemmaWord = (word) => {
	const canonical = normalizeArabicForComparison(word);
	return canonical.replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
};
// Builds a Set of normalized, non-empty values.
const uniqueNormalizedSet = (values, normalize) => {
	const normalized = values.map(normalize).filter(Boolean);
	return new Set(normalized);
};
// Exhaustiveness guard for profile variant switches.
const assertNever$2 = (value) => {
	throw new Error(`Unhandled dictionary profile variant: ${JSON.stringify(value)}`);
};
|
|
1562
|
+
// Fills in per-variant defaults for a zone family; unknown variants throw.
const normalizeFamily = (family) => {
	switch (family.use) {
		case "heading":
			return { ...family, allowNextLineColon: family.allowNextLineColon ?? false, allowSingleLetter: family.allowSingleLetter ?? false };
		case "lineEntry":
			return { ...family, allowMultiWord: family.allowMultiWord ?? false, allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false, wrappers: family.wrappers ?? "none" };
		case "inlineSubentry":
			return { ...family, prefixes: family.prefixes ?? ["و"], stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true };
		case "codeLine":
			return { ...family, wrappers: family.wrappers ?? "either" };
		case "pairedForms":
			return { ...family, requireStatusTail: family.requireStatusTail ?? false, separator: family.separator ?? "comma" };
		default:
			return assertNever$2(family);
	}
};
|
|
1592
|
+
// Pre-computes lookup structures (normalized word sets / char sets) per blocker
// variant; pass-through variants are returned unchanged.
const normalizeBlocker = (blocker) => {
	switch (blocker.use) {
		case "authorityIntro":
			return { ...blocker, precision: blocker.precision ?? "high" };
		case "stopLemma":
			return { ...blocker, normalizedWords: uniqueNormalizedSet(blocker.words, normalizeStopLemmaWord) };
		case "previousWord":
			return { ...blocker, normalizedWords: uniqueNormalizedSet(blocker.words, normalizeArabicForComparison) };
		case "previousChar":
			return { ...blocker, charSet: new Set(blocker.chars) };
		case "intro":
		case "pageContinuation":
			return blocker;
		default:
			return assertNever$2(blocker);
	}
};
|
|
1615
|
+
const normalizeZone = (zone) => ({
|
|
1616
|
+
blockers: (zone.blockers ?? []).map(normalizeBlocker),
|
|
1617
|
+
families: zone.families.map(normalizeFamily),
|
|
1618
|
+
name: zone.name,
|
|
1619
|
+
when: zone.when ? {
|
|
1620
|
+
activateAfter: zone.when.activateAfter,
|
|
1621
|
+
maxPageId: zone.when.maxPageId,
|
|
1622
|
+
minPageId: zone.when.minPageId
|
|
1623
|
+
} : void 0
|
|
1624
|
+
});
|
|
1625
|
+
// Builds a validation issue record; zoneName is only attached when truthy.
const createIssue$1 = (code, path, message, zoneName) => {
	const issue = {
		code,
		message,
		path
	};
	if (zoneName) issue.zoneName = zoneName;
	return issue;
};
|
|
1631
|
+
// Validates a single activateAfter gate and tracks duplicates via a serialized key.
const validateGate = (gate, zone, gateIndex, seenActivateAfterKeys, issues) => {
	const gatePath = `zones[].when.activateAfter[${gateIndex}]`.replace("[]", `[${zone.name}]`);
	const report = (code, path, message) => issues.push(createIssue$1(code, path, message, zone.name));
	if (gate.use === "headingText") {
		if (!gate.match.trim()) report("invalid_gate_match", `${gatePath}.match`, `dictionary gate match must be non-empty`);
		const fuzzyIsValid = gate.fuzzy === void 0 || typeof gate.fuzzy === "boolean";
		if (!fuzzyIsValid) report("invalid_gate_fuzzy", `${gatePath}.fuzzy`, `dictionary gate fuzzy must be a boolean when provided`);
	}
	const dedupeKey = `${gate.use}:${JSON.stringify(gate)}`;
	if (seenActivateAfterKeys.has(dedupeKey)) report("duplicate_activate_after_gate", gatePath, `dictionary zone "${zone.name}" has duplicate activateAfter gates`);
	seenActivateAfterKeys.add(dedupeKey);
};
|
|
1641
|
+
// Validates a single zone family; heading families must be able to produce
// what they claim to emit, inlineSubentry prefixes must be non-empty.
const validateFamily = (family, zone, familyIndex, issues) => {
	const familyPath = `zones[].families[${familyIndex}]`.replace("[]", `[${zone.name}]`);
	const report = (code, path, message) => issues.push(createIssue$1(code, path, message, zone.name));
	switch (family.use) {
		case "heading": {
			if (family.classes.length === 0) report("empty_heading_classes", `${familyPath}.classes`, `dictionary heading family in zone "${zone.name}" must include at least one class`);
			for (const emit of ["chapter", "marker", "entry"]) {
				if (family.emit === emit && !family.classes.includes(emit)) report("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "${emit}" but never matches ${emit} headings`);
			}
			break;
		}
		case "inlineSubentry":
			if (family.prefixes?.some((prefix) => !prefix.trim())) report("empty_inline_prefixes", `${familyPath}.prefixes`, `inlineSubentry prefixes must be non-empty strings`);
			break;
		case "lineEntry":
		case "codeLine":
		case "pairedForms":
			break;
		default: assertNever$2(family);
	}
};
|
|
1659
|
+
// Validates a single zone blocker: word-based blockers need non-empty words,
// previousChar needs at least one non-empty char.
const validateBlocker = (blocker, zone, blockerIndex, issues) => {
	const blockerPath = `zones[].blockers[${blockerIndex}]`.replace("[]", `[${zone.name}]`);
	const hasInvalidWords = (words) => words.length === 0 || words.some((word) => !word.trim());
	switch (blocker.use) {
		case "stopLemma":
			if (hasInvalidWords(blocker.words)) issues.push(createIssue$1("invalid_stop_words", `${blockerPath}.words`, `stopLemma blocker in zone "${zone.name}" must include non-empty words`, zone.name));
			break;
		case "previousWord":
			if (hasInvalidWords(blocker.words)) issues.push(createIssue$1("invalid_previous_words", `${blockerPath}.words`, `previousWord blocker in zone "${zone.name}" must include non-empty words`, zone.name));
			break;
		case "previousChar":
			if (blocker.chars.length === 0 || blocker.chars.some((char) => !char)) issues.push(createIssue$1("invalid_previous_chars", `${blockerPath}.chars`, `previousChar blocker in zone "${zone.name}" must include chars`, zone.name));
			break;
		case "authorityIntro":
		case "intro":
		case "pageContinuation":
			break;
		default: assertNever$2(blocker);
	}
};
|
|
1677
|
+
// Error carrying the full list of profile validation issues. A single issue
// promotes its message to the error message; multiple issues get a summary.
var DictionaryProfileValidationError = class extends Error {
	issues;
	constructor(issues) {
		const message = issues.length === 1
			? issues[0].message
			: `Dictionary profile validation failed with ${issues.length} issues`;
		super(message);
		this.name = "DictionaryProfileValidationError";
		this.issues = issues;
	}
};
|
|
1685
|
+
// Validates one zone: unique non-empty name, at least one family, a coherent
// page-id window, then delegates to the gate/family/blocker validators.
const validateZone = (zone, zoneIndex, seenZoneNames, issues) => {
	const zonePath = `zones[${zoneIndex}]`;
	const trimmedName = zone.name.trim();
	if (!trimmedName) issues.push(createIssue$1("empty_zone_name", `${zonePath}.name`, `dictionary zone name must be non-empty`));
	else if (seenZoneNames.has(trimmedName)) issues.push(createIssue$1("duplicate_zone_name", `${zonePath}.name`, `dictionary zone names must be unique; duplicated "${trimmedName}"`, trimmedName));
	else seenZoneNames.add(trimmedName);
	if (zone.families.length === 0) issues.push(createIssue$1("empty_zone_families", `${zonePath}.families`, `dictionary zone "${zone.name}" must declare at least one family`, zone.name));
	const { when } = zone;
	if (when?.minPageId !== void 0 && when?.maxPageId !== void 0 && when.minPageId > when.maxPageId) issues.push(createIssue$1("invalid_zone_page_range", `${zonePath}.when`, `dictionary zone "${zone.name}" has minPageId greater than maxPageId`, zone.name));
	const seenActivateAfterKeys = new Set();
	(when?.activateAfter ?? []).forEach((gate, gateIndex) => validateGate(gate, zone, gateIndex, seenActivateAfterKeys, issues));
	zone.families.forEach((family, familyIndex) => validateFamily(family, zone, familyIndex, issues));
	(zone.blockers ?? []).forEach((blocker, blockerIndex) => validateBlocker(blocker, zone, blockerIndex, issues));
};
|
|
1698
|
+
/**
 * Validates a dictionary profile without normalizing it.
 * Returns the collected issues (empty array when the profile is valid);
 * an empty zones list short-circuits further zone validation.
 */
const validateDictionaryProfile = (profile) => {
	const issues = [];
	if (profile.version !== 2) issues.push(createIssue$1("invalid_version", "version", `dictionary profile version must be 2, got ${profile.version}`));
	if (profile.zones.length === 0) {
		issues.push(createIssue$1("missing_zones", "zones", `dictionary profile must contain at least one zone`));
		return issues;
	}
	const seenZoneNames = new Set();
	profile.zones.forEach((zone, zoneIndex) => validateZone(zone, zoneIndex, seenZoneNames, issues));
	return issues;
};
|
|
1712
|
+
/**
 * Normalizes and validates a dictionary profile before runtime matching.
 * Results are cached per profile object; invalid profiles throw
 * DictionaryProfileValidationError with every collected issue.
 */
const normalizeDictionaryProfile = (profile) => {
	const cached = normalizedProfileCache.get(profile);
	if (cached) return cached;
	const issues = validateDictionaryProfile(profile);
	if (issues.length > 0) throw new DictionaryProfileValidationError(issues);
	const normalized = {
		version: 2,
		zones: profile.zones.map(normalizeZone)
	};
	normalizedProfileCache.set(profile, normalized);
	return normalized;
};
|
|
1727
|
+
//#endregion
|
|
1282
1728
|
//#region src/types/rules.ts
|
|
1283
1729
|
/**
|
|
1284
|
-
* Pattern type key names for split rules.
|
|
1285
|
-
*
|
|
1286
|
-
* Use this array to dynamically iterate over pattern types in UIs,
|
|
1287
|
-
* or use the `PatternTypeKey` type for type-safe string unions.
|
|
1730
|
+
* Pattern type key names for split rules.
|
|
1731
|
+
*
|
|
1732
|
+
* Use this array to dynamically iterate over pattern types in UIs,
|
|
1733
|
+
* or use the `PatternTypeKey` type for type-safe string unions.
|
|
1734
|
+
*
|
|
1735
|
+
* @example
|
|
1736
|
+
* // Build a dropdown/select in UI
|
|
1737
|
+
* PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
|
|
1738
|
+
*
|
|
1739
|
+
* @example
|
|
1740
|
+
* // Type-safe pattern key validation
|
|
1741
|
+
* const validateKey = (k: string): k is PatternTypeKey =>
|
|
1742
|
+
* (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
|
|
1743
|
+
*/
|
|
1744
|
+
const PATTERN_TYPE_KEYS = ["lineStartsWith", "lineStartsAfter", "lineEndsWith", "template", "regex", "dictionaryEntry"];
|
|
1752
|
+
//#endregion
|
|
1753
|
+
//#region src/segmentation/debug-meta.ts
|
|
1754
|
+
// Resolves the user-facing `debug` option (true | object | falsy) into a
// concrete config, or null when debugging is disabled.
const resolveDebugConfig = (debug) => {
	if (debug === true) {
		return {
			includeBreakpoint: true,
			includeRule: true,
			metaKey: "_flappa"
		};
	}
	if (!debug || typeof debug !== "object") return null;
	const { include, metaKey } = debug;
	const hasIncludeList = Array.isArray(include);
	return {
		includeBreakpoint: hasIncludeList ? include.includes("breakpoint") : true,
		includeRule: hasIncludeList ? include.includes("rule") : true,
		metaKey: typeof metaKey === "string" && metaKey !== "" ? metaKey : "_flappa"
	};
};
|
|
1769
|
+
// Identifies which pattern key a split rule uses; falls back to "regex".
const getRulePatternType = (rule) => {
	for (const key of PATTERN_TYPE_KEYS) {
		if (key in rule) return key;
	}
	return "regex";
};
|
|
1772
|
+
// True for non-null, non-array object values.
const isPlainObject$1 = (v) => {
	if (!v) return false;
	return typeof v === "object" && !Array.isArray(v);
};
|
|
1773
|
+
// Shallow-merges a debug patch into meta[metaKey] without mutating the input;
// non-object existing values under metaKey are replaced by the patch.
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
	const merged = meta ? { ...meta } : {};
	const previous = merged[metaKey];
	const base = isPlainObject$1(previous) ? previous : {};
	merged[metaKey] = {
		...base,
		...patch
	};
	return merged;
};
|
|
1782
|
+
// Builds the `{ rule: ... }` debug patch; word details are attached only when a
// wordIndex is given and resolves to a defined entry in the rule's pattern list.
const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
	const patternType = getRulePatternType(rule);
	const info = {
		index: ruleIndex,
		patternType
	};
	if (wordIndex !== void 0) {
		info.wordIndex = wordIndex;
		const patterns = rule[patternType];
		if (Array.isArray(patterns) && patterns[wordIndex] !== void 0) info.word = patterns[wordIndex];
	}
	return { rule: info };
};
|
|
1793
|
+
/**
 * Builds the `{ breakpoint: ... }` debug patch for a breakpoint-driven split.
 * An empty-string pattern denotes the page-boundary fallback.
 */
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => {
  const kind = rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern";
  const breakpoint = {
    index: breakpointIndex,
    kind,
    pattern: rule.pattern ?? rule.regex
  };
  if (wordIndex !== void 0) {
    breakpoint.wordIndex = wordIndex;
    if (rule.words) breakpoint.word = rule.words[wordIndex];
  }
  return { breakpoint };
};
|
|
1800
|
+
/**
 * Formats a rule debug payload into a human-readable reason string.
 * @param rule - Rule debug info ({ index, patternType, wordIndex?, word? })
 * @param concise - When true, emit a short single-token form
 */
const formatRuleReason = (rule, concise) => {
  const { index, patternType, wordIndex, word } = rule;
  if (concise) {
    const label = word ? `"${word}"` : patternType;
    return `Rule: ${label}`;
  }
  const idxPart = wordIndex !== void 0 ? ` [idx:${wordIndex}]` : "";
  const matchPart = word ? ` (Matched: "${word}")` : "";
  return `Rule #${index} (${patternType})${idxPart}${matchPart}`;
};
|
|
1811
|
+
/** Formats a breakpoint debug payload into a human-readable reason string. */
const formatBreakpointReason = (breakpoint, concise) => {
  const { index, kind, pattern, wordIndex, word } = breakpoint;
  if (kind === "pageBoundary") {
    return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
  }
  if (concise) {
    const label = word ? `"${word}"` : `"${pattern}"`;
    return `Breakpoint: ${label}`;
  }
  return word
    ? `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`
    : `Breakpoint #${index} (${kind}) - "${pattern}"`;
};
|
|
1818
|
+
/** Formats a safety-split (max content length) debug payload. */
const formatContentLengthReason = (split, concise) => {
  const { maxContentLength, splitReason } = split;
  return concise
    ? `> ${maxContentLength} (${splitReason})`
    : `Safety Split (${splitReason}) > ${maxContentLength}`;
};
|
|
1823
|
+
/**
 * Helper to format the debug info into a human-readable string.
 * Dispatches on which debug section is present: rule, breakpoint, or
 * content-length split.
 * @param meta - The segment metadata object
 * @param options - Formatting options ({ concise?: boolean })
 */
const getDebugReason = (meta, options) => {
  // NOTE(review): reads only the default "_flappa" key — debug info stored
  // under a custom `metaKey` (see resolveDebugConfig) will not be found here.
  // Confirm whether callers are expected to pass the key through.
  const debug = meta?._flappa;
  if (!debug) return "-";
  const concise = options?.concise;
  if (debug.rule) return formatRuleReason(debug.rule, concise);
  if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
  if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
  return "Unknown";
};
|
|
1837
|
+
/**
 * Convenience helper to get the formatted debug reason directly from a segment.
 * @param segment - The segment object
 * @param options - Formatting options
 */
const getSegmentDebugReason = (segment, options) => getDebugReason(segment.meta, options);
|
|
1845
|
+
//#endregion
|
|
1846
|
+
//#region src/dictionary/runtime.ts
/**
 * Arabic phrases that introduce a quotation, hadith, reading, or saying.
 * A candidate entry whose probe text STARTS with one of these is treated as
 * quoted material rather than a new dictionary entry.
 */
const INTRO_PHRASES = [
  "وقال",
  "قال",
  "وفي الحديث",
  "في الحديث",
  "وفي حديث",
  "في حديث",
  "وفي رواية",
  "في رواية",
  "وفي قراءة",
  "في قراءة",
  "وفي قول",
  "في قول",
  "وفي كلام",
  "في كلام",
  "ومنه قول",
  "ومنها قول",
  "وقرأ",
  "قرأ",
  "قراءة",
  "حديث",
  "ويقال",
  "وقيل",
  "قلت",
  "فقال",
  "قال الشاعر",
  "أنشد",
  "وأنشد"
];
/**
 * Arabic phrases that, when they END the text immediately before a
 * candidate, indicate the candidate continues a citation or vocalization
 * note (e.g. "بالفتح" = "with fatha") rather than starting a new entry.
 */
const INTRO_TAIL_PHRASES = [
  "بفتح",
  "بالفتح",
  "بكسر",
  "بالكسر",
  "بضم",
  "بالضم",
  "بالتحريك",
  "حديث",
  "الحديث",
  "في التنزيل",
  "وفي التنزيل",
  "في التنزيل العزيز",
  "وفي التنزيل العزيز",
  "في مقتل",
  "وفي مقتل",
  "في المجاز",
  "وفي المجاز",
  "من المجاز",
  "ومن المجاز",
  "في رواية",
  "وفي رواية",
  "في قراءة",
  "وفي قراءة",
  "في قول",
  "وفي قول",
  "في كلام",
  "وفي كلام",
  "في صفة",
  "وفي صفة",
  "في خطبته",
  "وفي خطبته",
  "ومنه قول",
  "ومنها قول",
  "يقال لرقبة",
  "على جهتين",
  "قوله جل",
  "قوله جل وعز",
  "جل وعز",
  "ومنه حديث",
  "ومنه الحديث",
  "كرم الله",
  "صلى الله عليه",
  "رضي الله عنه",
  "رضي الله عنها",
  "رضي الله عنهما",
  "قال ابو",
  "وقال ابو",
  "عن ابي",
  "قال ابن",
  "وقال ابن",
  "عن ابن"
];
/**
 * Regex variants of the tail check above: an intro/citation construction
 * followed by a bounded number of words at the end of the preceding text.
 * Applied to text already normalized by normalizeIntroContextText.
 */
const INTRO_TAIL_PATTERNS = [
  /(?:^|\s)(?:في|وفي|ومنه|ومنها)\s+(?:حديث|الحديث|رواية|قراءة|قول|كلام|مقتل|صفة|خطبته)(?:\s+\S+){0,8}$/u,
  /(?:^|\s)(?:حديث|الحديث|رواية|قراءة|قول|كلام)(?:\s+\S+){1,8}$/u,
  /(?:^|\s)(?:قوله|قول(?:ه|هم)?|قال(?:\s+قائل)?|وقرأ|قرأ|قراءة)\s+(?:جل(?:\s+وعز)?|[^\s]+)$/u,
  /(?:^|\s)(?:ابو|ابي|ابا|ابن|بن|بنت)(?:\s+\S+){1,4}$/u,
  /(?:^|\s)(?:قال|وقال|انشد|وانشد|روي|وروي|اخبر|واخبر)(?:\s+\S+){0,4}$/u
];
|
|
1936
|
+
/**
 * Words that begin a gloss/qualifier clause after a comma in a lemma
 * (e.g. "أي" = "i.e."). A comma-separated lemma whose tail starts with one
 * of these is rejected (see hasBlockedQualifierTail).
 */
const QUALIFIER_TAIL_PREFIXES = [
  "أي",
  "قال",
  "تقول",
  "يقال",
  "يقول",
  "يريد",
  "يُريد",
  "ويقال",
  "ويقول",
  "وجمعه",
  "وجمعها",
  "والجميع",
  "والجمع"
];
/** Lemma prefixes that indicate structural/front-matter text, not an entry. */
const STRUCTURAL_LEMMA_PREFIXES = [
  "لجزء",
  "جزء",
  "ومما يستدرك عليه",
  "آخر حرف",
  "كتاب حرف"
];
/** Line shapes that look like structural markers: "1 - (...)", "(...)", "(...) ## ...". */
const STRUCTURAL_LINE_PATTERNS = [
  /^\d+\s*-\s*\(.+\)$/u,
  /^\(.+\)$/u,
  /^\(.+\)\s*##\s*/u
];
/** Keywords (chapter/section/letter headings) that confirm a structural line. */
const STRUCTURAL_LINE_KEYWORDS = [
  "باب",
  "فصل",
  "حرف",
  "أبواب",
  "كتاب",
  "المعجمة",
  "المهملة",
  "المثناة"
];
|
|
1973
|
+
/**
 * Words that, when they end the previous page, suggest the current page
 * starts mid-sentence (vocalization notes, speech verbs, short particles).
 * Used by the pageContinuation blocker.
 */
const CONTINUATION_PREV_WORDS = [
  "بفتح",
  "بالفتح",
  "بكسر",
  "بالكسر",
  "بضم",
  "بالضم",
  "بالتحريك",
  "قال",
  "وقال",
  "وقيل",
  "ويقال",
  "يقال",
  "قلت",
  "فقال",
  "قالوا",
  "من",
  "في",
  "على",
  "إذا",
  "نحو",
  "ثم",
  "وجل"
];
/** Matches text opening with "(و)قال <authority>" or "<authority> <word>". */
const AUTHORITY_RE = /^(?:(?:و)?قال\s+(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\b|(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\s+\S+)/u;
/**
 * Names of lexicographic authorities. A candidate whose head word equals one
 * of these is a citation, not a new dictionary entry.
 */
const AUTHORITY_HEAD_WORDS = [
  "الأزهري",
  "الأصمعي",
  "الأشجعي",
  "الأموي",
  "الأمويّ",
  "الجوهري",
  "الرياشي",
  "الزجاج",
  "الزجاجي",
  "الشيباني",
  "الفراء",
  "الكسائي",
  "اللحياني",
  "الليث",
  "المبرد",
  "المنذري",
  "ثعلب",
  "شمر"
];
|
|
2018
|
+
// Punctuation that firmly ends a sentence (incl. Arabic ؟ ؛ ۔ and ellipsis).
const STRONG_SENTENCE_TERMINATORS$1 = /[.!?؟؛۔…]$/u;
// Trailing noise typical of page wraps: whitespace, Arabic-Indic/ASCII digits, quotes, brackets.
const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
// Same as above plus sentence punctuation — used when isolating the last word.
const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
// Global regex: reused across calls, so callers must reset lastIndex before use.
const ARABIC_WORD_REGEX$1 = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
const HEADING_PREFIX = "## ";
// "harfs" token pattern with flexible whitespace narrowed to spaces/tabs (no newlines).
const CODE_LINE_PATTERN = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN})$`, "u");
// Usage-status words: "used"/"unused" variants that tail a root-status line.
const STATUS_TAIL_PATTERN = "(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)";
// Maps zone-gate token names to the Arabic heading words they stand for.
const GATE_TOKEN_MAP = {
  bab: "باب",
  fasl: "فصل",
  kitab: "كتاب"
};
// Characters allowed to follow a gate prefix for a delimited match.
const GATE_DELIMITER_RE = /[\s:،؛()[\]{}\-–—]/u;
/** Exhaustiveness guard for discriminated unions in this module. */
const assertNever$1 = (value) => {
  throw new Error(`Unhandled dictionary runtime variant: ${JSON.stringify(value)}`);
};
// Per-family compiled-regex caches; WeakMaps so family configs can be GC'd.
const lineEntryRegexCache = /* @__PURE__ */ new WeakMap();
const inlineSubentryRegexCache = /* @__PURE__ */ new WeakMap();
const pairedFormsRegexCache = /* @__PURE__ */ new WeakMap();
|
|
2038
|
+
/** Strips trailing whitespace, digits, quotes and brackets left by page wraps. */
const trimTrailingPageWrapNoise$1 = (text) => text.trimEnd().replace(TRAILING_PAGE_WRAP_NOISE$1, "");
/** True when the page content (after wrap-noise removal) ends a sentence. */
const endsWithStrongSentenceTerminator$1 = (pageContent) => {
  return STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
};
/**
 * Returns the last Arabic word ending at (but excluding) `endExclusive`,
 * or "" when none. Only the last 256 characters are scanned.
 */
const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
  const windowStart = Math.max(0, endExclusive - 256);
  const withoutTrailingDelimiters = trimTrailingPageWrapNoise$1(text.slice(windowStart, endExclusive)).replace(TRAILING_WORD_DELIMITERS$1, "");
  let lastMatch = "";
  // ARABIC_WORD_REGEX$1 is a shared /g regex — reset its cursor before scanning.
  ARABIC_WORD_REGEX$1.lastIndex = 0;
  for (const match of withoutTrailingDelimiters.matchAll(ARABIC_WORD_REGEX$1)) lastMatch = match[0];
  return lastMatch;
};
|
|
2050
|
+
/**
 * Returns the last non-whitespace character strictly before `endExclusive`,
 * or "" when the prefix is empty or all whitespace.
 */
const previousNonWhitespaceChar = (text, endExclusive = text.length) => {
  let index = endExclusive - 1;
  while (index >= 0) {
    const candidate = text[index];
    if (candidate && !/\s/u.test(candidate)) return candidate;
    index -= 1;
  }
  return "";
};
|
|
2057
|
+
/** Equality after Arabic normalization (diacritics/hamza folding). */
const normalizedEquals = (left, right) => normalizeArabicForComparison(left) === normalizeArabicForComparison(right);
/** Prefix test after Arabic normalization of both sides. */
const normalizedStartsWith = (text, prefix) => normalizeArabicForComparison(text).startsWith(normalizeArabicForComparison(prefix));
/** Normalizes a lemma for stop-list lookup: folds Arabic and strips edge punctuation/quotes. */
const normalizeStopLemma = (text) => normalizeArabicForComparison(text).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
/** Returns up to `maxChars` characters of text ending at `endExclusive`. */
const getTrailingContext = (text, endExclusive, maxChars = 240) => text.slice(Math.max(0, endExclusive - maxChars), endExclusive);
|
|
2061
|
+
/**
 * True when `text` equals `prefix`, or starts with it followed by a
 * delimiter character (whitespace, punctuation, brackets, dashes).
 */
const isDelimitedPrefixMatch = (text, prefix) => {
  if (!text.startsWith(prefix)) return false;
  if (text.length === prefix.length) return true;
  return GATE_DELIMITER_RE.test(text[prefix.length]);
};
|
|
2067
|
+
/**
 * Builds one context object per page (content, line index, boundary, page).
 * Validates up front that boundaries and optional pre-normalized content
 * align 1:1 with the pages array.
 * @throws Error on any length mismatch or missing page/boundary
 */
const createPageContexts = (pages, pageMap, normalizedPages) => {
  if (normalizedPages && normalizedPages.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} normalized pages, received ${normalizedPages.length}`);
  if (pageMap.boundaries.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} page boundaries, received ${pageMap.boundaries.length}`);
  const contexts = [];
  for (let index = 0; index < pages.length; index++) {
    const page = pages[index];
    const boundary = pageMap.boundaries[index];
    if (!page || !boundary) throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
    // Prefer caller-supplied normalized content; otherwise normalize line endings here.
    const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
    contexts.push({
      boundary,
      content,
      index,
      lines: buildPageLines(content),
      page
    });
  }
  return contexts;
};
/** Folds Arabic and flattens slashes/quotes/brackets to single spaces for intro matching. */
const normalizeIntroContextText = (text) => normalizeArabicForComparison(text).replace(/[\\/]+/gu, " ").replace(/[«»"“”'‘’()[\]{}]+/gu, " ").replace(/\s+/gu, " ").trim();
/** True when `candidate` starts (normalized) with any configured word. */
const startsWithConfiguredWord = (words, candidate) => words.some((word) => normalizedStartsWith(candidate, word));
|
|
2088
|
+
/**
 * Splits page content on "\n" into line records carrying the 1-based line
 * number and the line's character offset within the content.
 */
const buildPageLines = (content) => {
  let offset = 0;
  return content.split("\n").map((text, index) => {
    const record = {
      lineNumber: index + 1,
      start: offset,
      text
    };
    offset += text.length + 1; // +1 for the newline consumed by split
    return record;
  });
};
|
|
2103
|
+
/**
 * Tests a heading's text against a zone gate. "headingText" gates compare a
 * literal (or fuzzy-normalized) prefix; token gates compare against the
 * Arabic word mapped in GATE_TOKEN_MAP.
 */
const headingMatchesGate = (headingText, gate) => {
  if (gate.use === "headingText") {
    const useFuzzy = gate.fuzzy ?? false;
    const source = useFuzzy ? normalizeArabicForComparison(headingText) : headingText.trim();
    const match = useFuzzy ? normalizeArabicForComparison(gate.match) : gate.match.trim();
    // An empty match string never matches.
    return !!match && isDelimitedPrefixMatch(source, match);
  }
  return normalizedStartsWith(headingText, GATE_TOKEN_MAP[gate.token]);
};
/** True when any "## " heading line on the page satisfies any gate. */
const pageMatchesAnyGate = (page, gates) => page.lines.some((line) => {
  const trimmed = line.text.trim();
  if (!trimmed.startsWith(HEADING_PREFIX)) return false;
  const headingText = trimmed.replace(/^##\s+/u, "").trim();
  return gates.some((gate) => headingMatchesGate(headingText, gate));
});
|
|
2118
|
+
/** True when `pageId` lies inside the zone's optional [minPageId, maxPageId] range. */
const pageWithinZoneBounds = (zone, pageId) => {
  const { minPageId, maxPageId } = zone.when ?? {};
  if (minPageId !== void 0 && pageId < minPageId) return false;
  return !(maxPageId !== void 0 && pageId > maxPageId);
};
|
|
2123
|
+
/**
 * Returns the id of the first in-bounds page whose headings satisfy the
 * zone's activateAfter gates, or null when none does.
 */
const findActivationPageId = (zone, pages) => {
  for (const page of pages) {
    if (!pageWithinZoneBounds(zone, page.page.id)) continue;
    if (pageMatchesAnyGate(page, zone.when?.activateAfter ?? [])) return page.page.id;
  }
  return null;
};
/**
 * Precomputes, per zone name, the page id at which the zone activates.
 * Zones without activateAfter gates map to null (always considered active
 * within their bounds — see pageMatchesZone).
 */
const createZoneActivationMap = (profile, pages) => {
  const activation = /* @__PURE__ */ new Map();
  for (const zone of profile.zones) {
    if (!zone.when?.activateAfter?.length) {
      activation.set(zone.name, null);
      continue;
    }
    activation.set(zone.name, findActivationPageId(zone, pages));
  }
  return activation;
};
|
|
2141
|
+
/**
 * True when a page belongs to a zone: within the optional id bounds and, if
 * the zone has activation gates, at or past its precomputed activation page.
 */
const pageMatchesZone = (zone, activationMap, pageId) => {
  const when = zone.when;
  if (when?.minPageId !== void 0 && pageId < when.minPageId) return false;
  if (when?.maxPageId !== void 0 && pageId > when.maxPageId) return false;
  if (!when?.activateAfter?.length) return true;
  const activatedAt = activationMap.get(zone.name);
  if (activatedAt === null || activatedAt === void 0) return false;
  return pageId >= activatedAt;
};
|
|
2148
|
+
/** Returns the LAST zone in profile order that matches the page, or null. */
const resolveActiveZone = (profile, activationMap, pageId) => {
  let winner = null;
  for (const zone of profile.zones) {
    if (pageMatchesZone(zone, activationMap, pageId)) winner = zone;
  }
  return winner;
};
|
|
2153
|
+
/**
 * Builds an entry/marker candidate from a "## " heading line, or null when
 * the family rejects this heading class, single-letter entries, or entries
 * whose next line opens with ":".
 */
const createHeadingCandidate = (pageStartOffset, line, nextLine, family, headingClass) => {
  if (!family.classes.includes(headingClass)) return null;
  const trimmedLine = line.text.trim();
  const headingText = trimmedLine.slice(3).trim();
  if (headingClass === "entry") {
    if (!family.allowSingleLetter && headingText.length <= 1) return null;
    if (!family.allowNextLineColon && nextLine?.text.trimStart().startsWith(":")) return null;
  }
  return {
    absoluteIndex: pageStartOffset + line.start,
    contentStartOffset: 3,
    family: "heading",
    headingClass,
    kind: family.emit,
    lemma: family.emit === "entry" ? headingText : void 0,
    lineNumber: line.lineNumber,
    localIndex: line.start,
    probeText: trimmedLine,
    text: trimmedLine
  };
};
|
|
2171
|
+
/** Optional second word suffix for multi-word lemma patterns. */
const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
/** One (or two) Arabic word(s) wrapped in the given delimiters. */
const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
/** One (or two) Arabic word(s) without wrapping delimiters. */
const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
// A root-status line: a harf code or comma-separated word list followed by a
// "used/unused" status word. These are skipped by line-entry detection.
const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
/**
 * Compiles (and caches per family) the regex matching a lemma at line start
 * followed by a colon. Wrapper style and colon spacing come from the family.
 */
const createLineEntryRegex = (family) => {
  const cached = lineEntryRegexCache.get(family);
  if (cached) return cached;
  // Nested ternary selects the wrapper variant: parentheses/brackets/curly/any/bare.
  const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
  const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
  const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
  lineEntryRegexCache.set(family, regex);
  return regex;
};
/**
 * Emits a single entry candidate for a "lemma:" line, unless the line is a
 * root-status line. The lemma has wrapper brackets stripped.
 */
const collectLineEntryCandidates = (pageStartOffset, line, family) => {
  const trimmed = line.text.trim();
  if (STATUS_LINE_RE.test(trimmed)) return [];
  const match = trimmed.match(createLineEntryRegex(family));
  if (!match?.groups?.lemma) return [];
  return [{
    absoluteIndex: pageStartOffset + line.start,
    family: "lineEntry",
    kind: "entry",
    lemma: match.groups.lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim(),
    lineNumber: line.lineNumber,
    localIndex: line.start,
    probeText: trimmed,
    text: trimmed
  }];
};
|
|
2200
|
+
/**
 * Scans a line for inline subentries: a configured prefix (default "و")
 * glued to an Arabic word and followed by a colon, preceded by start-of-line
 * or a delimiter. Emits one candidate per match.
 *
 * Fixes over the previous version:
 * - the prefix alternation is no longer recomputed on every call when the
 *   matcher regex is already cached for the family;
 * - the lemma prefix-stripping RegExp is compiled once per call instead of
 *   once per match inside the loop.
 */
const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
  // Alternation of escaped prefixes; defaults to "و" when none configured.
  const buildPrefixAlternation = () => family.prefixes.length > 0 ? family.prefixes.map(escapeRegex).join("|") : escapeRegex("و");
  let regex = inlineSubentryRegexCache.get(family);
  if (!regex) {
    regex = new RegExp(`(^|[\\s،؛,:.])(?<lemma>(?:${buildPrefixAlternation()})${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})\\s*:`, "gu");
    inlineSubentryRegexCache.set(family, regex);
  }
  // Hoisted out of the match loop (previously compiled per match).
  const stripPrefixRe = family.stripPrefixesFromLemma ? new RegExp(`^(?:${buildPrefixAlternation()})`, "u") : null;
  const candidates = [];
  for (const match of line.text.matchAll(regex)) {
    if (!match.groups?.lemma || match.index === void 0) continue;
    // Locate the lemma inside the full match to skip the leading delimiter.
    const lemmaIndex = match[0].indexOf(match.groups.lemma);
    if (lemmaIndex < 0) continue;
    const candidateStart = match.index + lemmaIndex;
    const lemma = stripPrefixRe ? match.groups.lemma.replace(stripPrefixRe, "") : match.groups.lemma;
    candidates.push({
      absoluteIndex: pageStartOffset + line.start + candidateStart,
      family: "inlineSubentry",
      kind: "entry",
      lemma,
      lineNumber: line.lineNumber,
      localIndex: line.start + candidateStart,
      probeText: line.text.slice(candidateStart).trimStart(),
      text: line.text.trim()
    });
  }
  return candidates;
};
|
|
2225
|
+
// A full-line harf code (e.g. a root-letter sequence).
const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN}$`, "u");
// Optional ": <status word> ..." suffix trailing a code line.
const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
|
|
2227
|
+
/**
 * Parses text wrapped in (...) or [...] (mismatched pairs allowed).
 * Returns { open, close, inner, paired } or null when not wrapped;
 * `paired` is true only for matching () or [] pairs.
 */
const parseWrappedCode = (text) => {
  const match = text.match(/^(?<open>[[(])(?<inner>.+)(?<close>[\])])$/u);
  const groups = match?.groups;
  if (!groups?.inner || !groups.open || !groups.close) return null;
  const { open, close } = groups;
  const isPaired = (open === "(" && close === ")") || (open === "[" && close === "]");
  return {
    close,
    inner: groups.inner.trim(),
    open,
    paired: isPaired
  };
};
|
|
2237
|
+
/**
 * Emits a marker candidate for a line that is a bare or wrapped harf code
 * (optionally followed by a status suffix), provided the wrapping style
 * matches the family's `wrappers` policy.
 */
const collectCodeLineCandidates = (pageStartOffset, line, family) => {
  const trimmed = line.text.trim();
  // Drop a trailing ": مستعمل/مهمل ..." status suffix before inspecting the code.
  const bare = trimmed.replace(STATUS_SUFFIX_RE, "").trim();
  const wrapped = parseWrappedCode(bare);
  const inner = wrapped?.inner ?? bare;
  if (!CODE_CORE_RE.test(inner)) return [];
  // Wrapping policy: "either" accepts all; "none" requires unwrapped;
  // "paired" requires a matching ()/[] pair; otherwise requires a mismatched wrap.
  if (!(family.wrappers === "either" ? true : family.wrappers === "none" ? wrapped === null : family.wrappers === "paired" ? wrapped?.paired === true : wrapped !== null && !wrapped.paired)) return [];
  return [{
    absoluteIndex: pageStartOffset + line.start,
    family: "codeLine",
    kind: "marker",
    lemma: inner,
    lineNumber: line.lineNumber,
    localIndex: line.start,
    probeText: trimmed,
    text: trimmed
  }];
};
|
|
2255
|
+
/**
 * Emits a candidate for a line of two-or-more word forms separated by space
 * or comma, ending in ":" (optionally requiring a status tail). The lemma is
 * the whole forms group when the family emits entries.
 */
const collectPairedFormsCandidates = (pageStartOffset, line, family) => {
  const cached = pairedFormsRegexCache.get(family);
  // NOTE(review): separator/statusTail strings are rebuilt even on a cache
  // hit; harmless (the cached regex is used) but wasted work.
  const separator = family.separator === "space" ? "\\s+" : "\\s*[،,]\\s*";
  const statusTail = family.requireStatusTail ? "\\s*:\\s*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة).*" : "\\s*:";
  const regex = cached ?? new RegExp(`^(?<forms>${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:${separator}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)${statusTail}`, "u");
  if (!cached) pairedFormsRegexCache.set(family, regex);
  const match = line.text.trim().match(regex);
  if (!match?.groups?.forms) return [];
  return [{
    absoluteIndex: pageStartOffset + line.start,
    family: "pairedForms",
    kind: family.emit,
    lemma: family.emit === "entry" ? match.groups.forms : void 0,
    lineNumber: line.lineNumber,
    localIndex: line.start,
    probeText: line.text.trim(),
    text: line.text.trim()
  }];
};
|
|
2274
|
+
/** A blocker applies to a family unless it declares an appliesTo whitelist excluding it. */
const blockerApplies = (blocker, family) => !blocker.appliesTo || blocker.appliesTo.includes(family);
/** True when the candidate's own text starts with a quotation/citation introducer. */
const isIntroCandidate = (text) => {
  const normalized = normalizeIntroContextText(text);
  return INTRO_PHRASES.some((phrase) => normalized.startsWith(normalizeArabicForComparison(phrase)));
};
/**
 * True when the preceding text ends with an intro phrase (and does not end
 * a sentence). Strict subset of endsWithIntroContext below.
 */
const endsWithIntroPhrase = (text) => {
  const trimmed = text.trimEnd();
  if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
  const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
  return INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)));
};
/**
 * Broader intro check on preceding text: intro phrases, tail phrases
 * (vocalization notes, honorifics, citation forms), or the tail regexes.
 */
const endsWithIntroContext = (text) => {
  const trimmed = text.trimEnd();
  if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
  const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
  if (!normalized) return false;
  if (INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
  if (INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
  return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
};
|
|
2294
|
+
/**
 * True when the candidate text cites a lexicographic authority rather than
 * starting an entry. "aggressive" precision additionally accepts any text
 * merely starting with a well-known authority name.
 */
const isAuthorityCandidate = (text, precision) => {
  // Head word = text up to the first colon, normalized for lookup.
  const head = normalizeStopLemma(text.split(":", 1)[0] ?? text);
  if (head && AUTHORITY_HEAD_WORDS.some((term) => normalizeStopLemma(term) === head)) return true;
  if (AUTHORITY_RE.test(text)) return true;
  if (precision === "aggressive") {
    const normalized = normalizeIntroContextText(text);
    // NOTE(review): this inline list duplicates a subset of AUTHORITY_HEAD_WORDS.
    return [
      "الليث",
      "الأزهري",
      "الأصمعي",
      "الجوهري",
      "الفراء",
      "ثعلب",
      "شمر"
    ].some((term) => normalized.startsWith(normalizeArabicForComparison(term)));
  }
  return false;
};
/**
 * True when a comma-separated lemma's tail (everything after the first
 * comma) opens with a gloss/qualifier word such as "أي".
 */
const hasBlockedQualifierTail = (lemma) => {
  const parts = lemma.split(/[،,]/u).map((part) => part.trim()).filter(Boolean);
  if (parts.length < 2) return false;
  return startsWithConfiguredWord(QUALIFIER_TAIL_PREFIXES, parts.slice(1).join(" "));
};
|
|
2317
|
+
/**
 * Heuristics flagging candidates that are actually structural text
 * (chapter headings, harf-code lines, numbering) leaking through as entries.
 */
const looksLikeStructuralLeak = (candidate) => {
  if (!candidate.lemma) return false;
  const normalizedLemma = normalizeArabicForComparison(candidate.lemma);
  // Entry lemma opening with non-Arabic/non-digit chars, or containing braces / "##".
  if (candidate.kind === "entry" && (/^[^\p{Script=Arabic}\d]+/u.test(candidate.lemma) || candidate.lemma.includes("{") || candidate.lemma.includes("}") || candidate.lemma.includes("##")) ) return true;
  // Entry whose lemma is a bare harf code and whose line is just that code (optionally as/with a heading).
  if (candidate.kind === "entry" && BARE_CODE_LEMMA_RE.test(candidate.lemma) && (candidate.text === candidate.lemma || candidate.text === `${HEADING_PREFIX}${candidate.lemma}` || candidate.text.startsWith(`${HEADING_PREFIX}${candidate.lemma}`) || candidate.text.startsWith(`${candidate.lemma}\n${HEADING_PREFIX}`))) return true;
  // Overlong lemma (>4 words) for anything but the pairedForms family.
  if (candidate.family !== "pairedForms" && candidate.lemma.split(/\s+/u).filter(Boolean).length > 4) return true;
  if (startsWithConfiguredWord(STRUCTURAL_LEMMA_PREFIXES, candidate.lemma)) return true;
  if (normalizedLemma.startsWith(normalizeArabicForComparison("ولل"))) return true;
  const structuralText = candidate.text.startsWith(HEADING_PREFIX) ? candidate.text.slice(3).trim() : candidate.text;
  // Numbered parenthesized heading, e.g. "12 - (باب ...)".
  if (/^[\d\u0660-\u0669]+\s*-\s*\([^)]+\)(?:\s+##.*)?$/u.test(structuralText)) return true;
  const normalizedText = normalizeArabicForComparison(structuralText);
  // A structural line shape only counts as a leak if it carries a structural keyword.
  if (STRUCTURAL_LINE_PATTERNS.some((pattern) => pattern.test(structuralText))) return STRUCTURAL_LINE_KEYWORDS.some((keyword) => normalizedText.includes(normalizeArabicForComparison(keyword)));
  return false;
};
|
|
2331
|
+
/** Increments the tally for `lemma` in `map`; no-ops on empty lemmas. */
const countLemma = (map, lemma) => {
  if (!lemma) return;
  const current = map.get(lemma) ?? 0;
  map.set(lemma, current + 1);
};
/** Zeroed per-kind counters for a stats accumulator. */
const createInitialKindCounts = () => ({
  chapter: 0,
  entry: 0,
  marker: 0
});
|
|
2340
|
+
/** Zeroed per-rejection-reason counters for a stats accumulator. */
const createInitialReasonCounts = () => {
  const reasons = [
    "authorityIntro",
    "intro",
    "pageContinuation",
    "previousChar",
    "previousWord",
    "qualifierTail",
    "stopLemma",
    "structuralLeak"
  ];
  return Object.fromEntries(reasons.map((reason) => [reason, 0]));
};
/** Zeroed accepted/rejected counters for every candidate family. */
const createInitialFamilyCounts = () => {
  const families = ["codeLine", "heading", "inlineSubentry", "lineEntry", "pairedForms"];
  return Object.fromEntries(families.map((name) => [name, { accepted: 0, rejected: 0 }]));
};
|
|
2372
|
+
/** "intro" blocker: candidate starts with an intro phrase, or the preceding text ends in intro context. */
const rejectsViaIntroBlocker = (candidate, blocker, localBeforeCandidate) => {
  if (blocker.use !== "intro") return false;
  return isIntroCandidate(candidate.probeText) || endsWithIntroPhrase(localBeforeCandidate) || endsWithIntroContext(localBeforeCandidate);
};
/** "authorityIntro" blocker: candidate text cites a lexicographic authority. */
const rejectsViaAuthorityBlocker = (candidate, blocker) => blocker.use === "authorityIntro" && isAuthorityCandidate(candidate.probeText, blocker.precision);
/** "stopLemma" blocker: normalized lemma appears in the blocker's stop set. */
const rejectsViaStopLemmaBlocker = (candidate, blocker) => blocker.use === "stopLemma" && !!candidate.lemma && !!normalizeStopLemma(candidate.lemma) && blocker.normalizedWords.has(normalizeStopLemma(candidate.lemma));
/** "previousWord" blocker: the Arabic word just before the candidate is in the blocked set. */
const rejectsViaPreviousWordBlocker = (pageContent, localIndex, blocker) => {
  if (blocker.use !== "previousWord") return false;
  const lastWord = extractLastArabicWord$1(pageContent, localIndex);
  return !!lastWord && blocker.normalizedWords.has(normalizeArabicForComparison(lastWord));
};
/** "previousChar" blocker: the non-whitespace char just before the candidate is in the blocked set. */
const rejectsViaPreviousCharBlocker = (pageContent, localIndex, blocker) => {
  if (blocker.use !== "previousChar") return false;
  const previousChar = previousNonWhitespaceChar(pageContent, localIndex);
  return !!previousChar && blocker.charSet.has(previousChar);
};
/**
 * "pageContinuation" blocker: a candidate at the very top of a page whose
 * previous page ends mid-sentence is treated as a continuation, when the
 * previous page ends with a continuation word / intro context, or the
 * candidate itself reads as an intro or authority citation.
 */
const rejectsViaPageContinuationBlocker = (candidate, blocker, localBeforeCandidate, pageIndex, pages) => {
  if (blocker.use !== "pageContinuation") return false;
  // Only applies when nothing precedes the candidate on this page, and a previous page exists.
  if (!(localBeforeCandidate.trim().length === 0) || pageIndex === 0) return false;
  const previousPage = pages[pageIndex - 1];
  if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
  const previousWord = extractLastArabicWord$1(previousPage.content);
  // NOTE(review): && binds tighter than || here, so the previousWord guard
  // covers only the first disjunct — the intro/authority disjuncts fire even
  // with no previous word. Appears intentional; confirm.
  return !!previousWord && CONTINUATION_PREV_WORDS.some((word) => normalizedEquals(word, previousWord)) || endsWithIntroContext(previousPage.content) || isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, "high");
};
|
|
2396
|
+
/**
 * Runs one blocker against a candidate and returns the rejection reason
 * string, or null when the blocker does not reject it.
 */
const getBlockerRejectionReason = (blocker, candidate, localBeforeCandidate, pageContent, pageIndex, pages) => {
  if (rejectsViaIntroBlocker(candidate, blocker, localBeforeCandidate)) return "intro";
  if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
  if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
  if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker)) return "previousWord";
  if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
  if (rejectsViaPageContinuationBlocker(candidate, blocker, localBeforeCandidate, pageIndex, pages)) return "pageContinuation";
  return null;
};
/**
 * Evaluates all rejection checks for a candidate: unconditional lemma checks
 * (qualifier tail, structural leak) first, then each applicable zone blocker.
 * Returns `{ reason }` or null when the candidate is accepted.
 */
const getCandidateRejection = (candidate, zone, pageContext, pages) => {
  const hasQualifierTail = hasBlockedQualifierTail(candidate.lemma ?? "");
  if (hasQualifierTail || looksLikeStructuralLeak(candidate)) return { reason: hasQualifierTail ? "qualifierTail" : "structuralLeak" };
  // Up to 240 chars of page content preceding the candidate, shared by all blockers.
  const localBeforeCandidate = getTrailingContext(pageContext.content, candidate.localIndex);
  for (const blocker of zone.blockers) {
    if (!blockerApplies(blocker, candidate.family)) continue;
    const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
    if (reason) return { reason };
  }
  return null;
};
/** Boolean convenience wrapper over getCandidateRejection. */
const shouldRejectCandidate = (candidate, zone, pageContext, pages) => {
  return getCandidateRejection(candidate, zone, pageContext, pages) !== null;
};
/**
 * Emits a heading-family candidate for a "## " line, skipping noise
 * headings and headings rejected by createHeadingCandidate.
 */
const collectHeadingCandidates = (pageStartOffset, line, nextLine, family, trimmed) => {
  if (!trimmed.startsWith(HEADING_PREFIX)) return [];
  const headingClass = classifyDictionaryHeading(trimmed);
  if (headingClass === "noise") return [];
  const candidate = createHeadingCandidate(pageStartOffset, line, nextLine, family, headingClass);
  return candidate ? [candidate] : [];
};
|
|
2426
|
+
const collectCandidatesForFamily = (pageStartOffset, line, nextLine, family, trimmed) => {
|
|
2427
|
+
switch (family.use) {
|
|
2428
|
+
case "heading": return collectHeadingCandidates(pageStartOffset, line, nextLine, family, trimmed);
|
|
2429
|
+
case "lineEntry": return collectLineEntryCandidates(pageStartOffset, line, family);
|
|
2430
|
+
case "inlineSubentry": return collectInlineSubentryCandidates(pageStartOffset, line, family);
|
|
2431
|
+
case "codeLine": return collectCodeLineCandidates(pageStartOffset, line, family);
|
|
2432
|
+
case "pairedForms": return collectPairedFormsCandidates(pageStartOffset, line, family);
|
|
2433
|
+
default: return assertNever$1(family);
|
|
2434
|
+
}
|
|
2435
|
+
};
|
|
2436
|
+
const collectCandidatesForLine = (pageStartOffset, line, nextLine, zone) => {
|
|
2437
|
+
const trimmed = line.text.trim();
|
|
2438
|
+
const candidates = [];
|
|
2439
|
+
if (!trimmed) return candidates;
|
|
2440
|
+
for (const family of zone.families) candidates.push(...collectCandidatesForFamily(pageStartOffset, line, nextLine, family, trimmed));
|
|
2441
|
+
return candidates;
|
|
2442
|
+
};
|
|
2443
|
+
const candidateToSplitPoint = (candidate, debugMetaKey) => {
|
|
2444
|
+
const baseMeta = candidate.lemma ? {
|
|
2445
|
+
kind: candidate.kind,
|
|
2446
|
+
lemma: candidate.lemma
|
|
2447
|
+
} : { kind: candidate.kind };
|
|
2448
|
+
const meta = debugMetaKey === void 0 ? baseMeta : mergeDebugIntoMeta(baseMeta, debugMetaKey, { dictionary: {
|
|
2449
|
+
family: candidate.family,
|
|
2450
|
+
...candidate.headingClass ? { headingClass: candidate.headingClass } : {}
|
|
2451
|
+
} });
|
|
2452
|
+
return {
|
|
2453
|
+
contentStartOffset: candidate.contentStartOffset,
|
|
2454
|
+
index: candidate.absoluteIndex,
|
|
2455
|
+
meta
|
|
2456
|
+
};
|
|
2457
|
+
};
|
|
2458
|
+
const pushDiagnosticSample = (samples, sampleLimit, sample) => {
|
|
2459
|
+
if (samples.length < sampleLimit) samples.push(sample);
|
|
2460
|
+
};
|
|
2461
|
+
/**
|
|
2462
|
+
* Collects dictionary-profile split points using the pages-only markdown surface.
|
|
2463
|
+
*/
|
|
2464
|
+
const collectDictionarySplitPoints = (pages, profile, pageMap, normalizedPages, logger, debugMetaKey) => {
|
|
2465
|
+
const normalizedProfile = normalizeDictionaryProfile(profile);
|
|
2466
|
+
const pageContexts = createPageContexts(pages, pageMap, normalizedPages);
|
|
2467
|
+
const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
|
|
2468
|
+
const splitPoints = [];
|
|
2469
|
+
logger?.debug?.("[dictionary] collecting split points", {
|
|
2470
|
+
pageCount: pages.length,
|
|
2471
|
+
zoneCount: normalizedProfile.zones.length
|
|
2472
|
+
});
|
|
2473
|
+
for (const pageContext of pageContexts) {
|
|
2474
|
+
const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
|
|
2475
|
+
if (!zone) continue;
|
|
2476
|
+
for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
|
|
2477
|
+
const line = pageContext.lines[lineIndex];
|
|
2478
|
+
const nextLine = pageContext.lines[lineIndex + 1];
|
|
2479
|
+
const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
|
|
2480
|
+
for (const candidate of candidates) {
|
|
2481
|
+
if (shouldRejectCandidate(candidate, zone, pageContext, pageContexts)) continue;
|
|
2482
|
+
splitPoints.push(candidateToSplitPoint(candidate, debugMetaKey));
|
|
2483
|
+
}
|
|
2484
|
+
}
|
|
2485
|
+
}
|
|
2486
|
+
logger?.debug?.("[dictionary] collected split points", { splitPointCount: splitPoints.length });
|
|
2487
|
+
return splitPoints;
|
|
2488
|
+
};
|
|
2489
|
+
/**
|
|
2490
|
+
* Collects authoring diagnostics for a dictionary profile without creating segments.
|
|
1288
2491
|
*
|
|
1289
|
-
*
|
|
1290
|
-
* // Build a dropdown/select in UI
|
|
1291
|
-
* PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
|
|
1292
|
-
*
|
|
1293
|
-
* @example
|
|
1294
|
-
* // Type-safe pattern key validation
|
|
1295
|
-
* const validateKey = (k: string): k is PatternTypeKey =>
|
|
1296
|
-
* (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
|
|
2492
|
+
* This is useful when tuning blockers and family choices for a new dictionary.
|
|
1297
2493
|
*/
|
|
1298
|
-
const
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
2494
|
+
const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
|
|
2495
|
+
const normalizedProfile = normalizeDictionaryProfile(profile);
|
|
2496
|
+
const pageMap = {
|
|
2497
|
+
boundaries: [],
|
|
2498
|
+
getId: (offset) => {
|
|
2499
|
+
for (const boundary of pageMap.boundaries) if (offset >= boundary.start && offset <= boundary.end) return boundary.id;
|
|
2500
|
+
return pageMap.boundaries.at(-1)?.id ?? 0;
|
|
2501
|
+
},
|
|
2502
|
+
pageBreaks: [],
|
|
2503
|
+
pageIds: pages.map((page) => page.id)
|
|
2504
|
+
};
|
|
2505
|
+
let offset = 0;
|
|
2506
|
+
const pageContexts = createPageContexts(pages, pageMap, pages.map((page, pageIndex) => {
|
|
2507
|
+
const normalized = normalizeLineEndings(page.content);
|
|
2508
|
+
pageMap.boundaries.push({
|
|
2509
|
+
end: offset + normalized.length,
|
|
2510
|
+
id: page.id,
|
|
2511
|
+
start: offset
|
|
2512
|
+
});
|
|
2513
|
+
if (pageIndex < pages.length - 1) {
|
|
2514
|
+
pageMap.pageBreaks.push(offset + normalized.length);
|
|
2515
|
+
offset += normalized.length + 1;
|
|
2516
|
+
} else offset += normalized.length;
|
|
2517
|
+
return normalized;
|
|
2518
|
+
}));
|
|
2519
|
+
const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
|
|
2520
|
+
const sampleLimit = options.sampleLimit ?? 50;
|
|
2521
|
+
const acceptedKinds = createInitialKindCounts();
|
|
2522
|
+
const blockerHits = createInitialReasonCounts();
|
|
2523
|
+
const familyCounts = createInitialFamilyCounts();
|
|
2524
|
+
const zoneCounts = {};
|
|
2525
|
+
const rejectedLemmaCounts = /* @__PURE__ */ new Map();
|
|
2526
|
+
const samples = [];
|
|
2527
|
+
let acceptedCount = 0;
|
|
2528
|
+
let rejectedCount = 0;
|
|
2529
|
+
for (const pageContext of pageContexts) {
|
|
2530
|
+
const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
|
|
2531
|
+
if (!zone) continue;
|
|
2532
|
+
zoneCounts[zone.name] ??= {
|
|
2533
|
+
accepted: 0,
|
|
2534
|
+
rejected: 0
|
|
2535
|
+
};
|
|
2536
|
+
for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
|
|
2537
|
+
const line = pageContext.lines[lineIndex];
|
|
2538
|
+
const nextLine = pageContext.lines[lineIndex + 1];
|
|
2539
|
+
const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
|
|
2540
|
+
for (const candidate of candidates) {
|
|
2541
|
+
const rejection = getCandidateRejection(candidate, zone, pageContext, pageContexts);
|
|
2542
|
+
const sampleBase = {
|
|
2543
|
+
absoluteIndex: candidate.absoluteIndex,
|
|
2544
|
+
family: candidate.family,
|
|
2545
|
+
kind: candidate.kind,
|
|
2546
|
+
lemma: candidate.lemma,
|
|
2547
|
+
line: candidate.lineNumber,
|
|
2548
|
+
pageId: pageContext.page.id,
|
|
2549
|
+
text: candidate.text,
|
|
2550
|
+
zone: zone.name
|
|
2551
|
+
};
|
|
2552
|
+
if (rejection) {
|
|
2553
|
+
rejectedCount += 1;
|
|
2554
|
+
blockerHits[rejection.reason] += 1;
|
|
2555
|
+
familyCounts[candidate.family].rejected += 1;
|
|
2556
|
+
zoneCounts[zone.name].rejected += 1;
|
|
2557
|
+
countLemma(rejectedLemmaCounts, candidate.lemma);
|
|
2558
|
+
pushDiagnosticSample(samples, sampleLimit, {
|
|
2559
|
+
...sampleBase,
|
|
2560
|
+
accepted: false,
|
|
2561
|
+
reason: rejection.reason
|
|
2562
|
+
});
|
|
2563
|
+
continue;
|
|
2564
|
+
}
|
|
2565
|
+
acceptedCount += 1;
|
|
2566
|
+
acceptedKinds[candidate.kind] += 1;
|
|
2567
|
+
familyCounts[candidate.family].accepted += 1;
|
|
2568
|
+
zoneCounts[zone.name].accepted += 1;
|
|
2569
|
+
pushDiagnosticSample(samples, sampleLimit, {
|
|
2570
|
+
...sampleBase,
|
|
2571
|
+
accepted: true
|
|
2572
|
+
});
|
|
2573
|
+
}
|
|
2574
|
+
}
|
|
2575
|
+
}
|
|
2576
|
+
const rejectedLemmas = [...rejectedLemmaCounts.entries()].sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])).map(([lemma, count]) => ({
|
|
2577
|
+
count,
|
|
2578
|
+
lemma
|
|
2579
|
+
}));
|
|
2580
|
+
return {
|
|
2581
|
+
acceptedCount,
|
|
2582
|
+
acceptedKinds,
|
|
2583
|
+
blockerHits,
|
|
2584
|
+
familyCounts,
|
|
2585
|
+
pageCount: pages.length,
|
|
2586
|
+
rejectedCount,
|
|
2587
|
+
rejectedLemmas,
|
|
2588
|
+
samples,
|
|
2589
|
+
zoneCounts
|
|
2590
|
+
};
|
|
2591
|
+
};
|
|
1306
2592
|
//#endregion
|
|
1307
2593
|
//#region src/optimization/optimize-rules.ts
|
|
1308
2594
|
const MERGEABLE_KEYS = new Set([
|
|
@@ -1324,7 +2610,7 @@ const getPatternString = (rule, key) => {
|
|
|
1324
2610
|
};
|
|
1325
2611
|
const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
|
|
1326
2612
|
const getDictionaryEntrySpecificityScore = (rule) => {
|
|
1327
|
-
if (!("dictionaryEntry" in rule)) return 0;
|
|
2613
|
+
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
|
|
1328
2614
|
const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
|
|
1329
2615
|
return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
|
|
1330
2616
|
};
|
|
@@ -1475,115 +2761,6 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
|
|
|
1475
2761
|
}
|
|
1476
2762
|
return result;
|
|
1477
2763
|
};
|
|
1478
|
-
//#endregion
|
|
1479
|
-
//#region src/segmentation/arabic-dictionary-rule.ts
|
|
1480
|
-
const uniqueCanonicalWords = (words) => {
|
|
1481
|
-
const seen = /* @__PURE__ */ new Set();
|
|
1482
|
-
const result = [];
|
|
1483
|
-
for (const word of words) {
|
|
1484
|
-
const normalized = normalizeArabicForComparison(word);
|
|
1485
|
-
if (!normalized || seen.has(normalized)) continue;
|
|
1486
|
-
seen.add(normalized);
|
|
1487
|
-
result.push(word);
|
|
1488
|
-
}
|
|
1489
|
-
return result;
|
|
1490
|
-
};
|
|
1491
|
-
const buildStopAlternation = (stopWords) => {
|
|
1492
|
-
const unique = uniqueCanonicalWords(stopWords);
|
|
1493
|
-
if (unique.length === 0) return "";
|
|
1494
|
-
return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
|
|
1495
|
-
};
|
|
1496
|
-
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
|
|
1497
|
-
if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
|
|
1498
|
-
const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
|
|
1499
|
-
return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
|
|
1500
|
-
};
|
|
1501
|
-
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
|
|
1502
|
-
const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
1503
|
-
const withCapture = `(?<${captureName}>${headwordBody})`;
|
|
1504
|
-
if (!allowParenthesized) return `${withCapture}${colon}`;
|
|
1505
|
-
return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
|
|
1506
|
-
};
|
|
1507
|
-
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
|
|
1508
|
-
if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
|
|
1509
|
-
if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
|
|
1510
|
-
if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
|
|
1511
|
-
};
|
|
1512
|
-
const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
|
|
1513
|
-
validateDictionaryEntryOptions({
|
|
1514
|
-
captureName,
|
|
1515
|
-
maxLetters,
|
|
1516
|
-
minLetters
|
|
1517
|
-
});
|
|
1518
|
-
const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
|
|
1519
|
-
const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
|
|
1520
|
-
const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
|
|
1521
|
-
const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
|
|
1522
|
-
const stopAlternation = buildStopAlternation(stopWords);
|
|
1523
|
-
const lemmaBody = buildHeadwordBody({
|
|
1524
|
-
allowCommaSeparated,
|
|
1525
|
-
colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
|
|
1526
|
-
stopAlternation,
|
|
1527
|
-
stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
|
|
1528
|
-
unit: lemmaUnit
|
|
1529
|
-
});
|
|
1530
|
-
const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
|
|
1531
|
-
const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
|
|
1532
|
-
const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
|
|
1533
|
-
const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
|
|
1534
|
-
allowParenthesized,
|
|
1535
|
-
allowWhitespaceBeforeColon,
|
|
1536
|
-
captureName: prefixedCaptureName,
|
|
1537
|
-
headwordBody: lemmaBody
|
|
1538
|
-
});
|
|
1539
|
-
return {
|
|
1540
|
-
captureNames: [prefixedCaptureName],
|
|
1541
|
-
regex
|
|
1542
|
-
};
|
|
1543
|
-
};
|
|
1544
|
-
/**
|
|
1545
|
-
* Creates a reusable split rule for Arabic dictionary entries.
|
|
1546
|
-
*
|
|
1547
|
-
* The returned rule preserves authoring intent as a serializable
|
|
1548
|
-
* `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
|
|
1549
|
-
* regex string.
|
|
1550
|
-
*
|
|
1551
|
-
* @example
|
|
1552
|
-
* createArabicDictionaryEntryRule({
|
|
1553
|
-
* stopWords: ['وقيل', 'ويقال', 'قال'],
|
|
1554
|
-
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
1555
|
-
* })
|
|
1556
|
-
*
|
|
1557
|
-
* @example
|
|
1558
|
-
* createArabicDictionaryEntryRule({
|
|
1559
|
-
* allowParenthesized: true,
|
|
1560
|
-
* allowWhitespaceBeforeColon: true,
|
|
1561
|
-
* allowCommaSeparated: true,
|
|
1562
|
-
* stopWords: ['الليث', 'العجاج'],
|
|
1563
|
-
* })
|
|
1564
|
-
*/
|
|
1565
|
-
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
|
|
1566
|
-
validateDictionaryEntryOptions({
|
|
1567
|
-
captureName,
|
|
1568
|
-
maxLetters,
|
|
1569
|
-
minLetters
|
|
1570
|
-
});
|
|
1571
|
-
return {
|
|
1572
|
-
dictionaryEntry: {
|
|
1573
|
-
allowCommaSeparated,
|
|
1574
|
-
allowParenthesized,
|
|
1575
|
-
allowWhitespaceBeforeColon,
|
|
1576
|
-
captureName,
|
|
1577
|
-
maxLetters,
|
|
1578
|
-
midLineSubentries,
|
|
1579
|
-
minLetters,
|
|
1580
|
-
stopWords: uniqueCanonicalWords(stopWords)
|
|
1581
|
-
},
|
|
1582
|
-
meta,
|
|
1583
|
-
pageStartPrevWordStoplist,
|
|
1584
|
-
samePagePrevWordStoplist
|
|
1585
|
-
};
|
|
1586
|
-
};
|
|
1587
2764
|
const WINDOW_PREFIX_LENGTHS = [
|
|
1588
2765
|
80,
|
|
1589
2766
|
60,
|
|
@@ -2491,106 +3668,37 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
|
|
|
2491
3668
|
return -1;
|
|
2492
3669
|
};
|
|
2493
3670
|
//#endregion
|
|
2494
|
-
//#region src/segmentation/debug-meta.ts
|
|
2495
|
-
const resolveDebugConfig = (debug) => {
|
|
2496
|
-
if (debug === true) return {
|
|
2497
|
-
includeBreakpoint: true,
|
|
2498
|
-
includeRule: true,
|
|
2499
|
-
metaKey: "_flappa"
|
|
2500
|
-
};
|
|
2501
|
-
if (!debug || typeof debug !== "object") return null;
|
|
2502
|
-
const { metaKey, include } = debug;
|
|
2503
|
-
const includeRule = Array.isArray(include) ? include.includes("rule") : true;
|
|
2504
|
-
return {
|
|
2505
|
-
includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
|
|
2506
|
-
includeRule,
|
|
2507
|
-
metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
|
|
2508
|
-
};
|
|
2509
|
-
};
|
|
2510
|
-
const getRulePatternType = (rule) => {
|
|
2511
|
-
return PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
|
|
2512
|
-
};
|
|
2513
|
-
const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
|
|
2514
|
-
const mergeDebugIntoMeta = (meta, metaKey, patch) => {
|
|
2515
|
-
const out = meta ? { ...meta } : {};
|
|
2516
|
-
const existing = out[metaKey];
|
|
2517
|
-
out[metaKey] = {
|
|
2518
|
-
...isPlainObject(existing) ? existing : {},
|
|
2519
|
-
...patch
|
|
2520
|
-
};
|
|
2521
|
-
return out;
|
|
2522
|
-
};
|
|
2523
|
-
const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
|
|
2524
|
-
const patternType = getRulePatternType(rule);
|
|
2525
|
-
const patterns = rule[patternType];
|
|
2526
|
-
const word = wordIndex !== void 0 && Array.isArray(patterns) && patterns[wordIndex] !== void 0 ? patterns[wordIndex] : void 0;
|
|
2527
|
-
return { rule: {
|
|
2528
|
-
index: ruleIndex,
|
|
2529
|
-
patternType,
|
|
2530
|
-
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2531
|
-
...word !== void 0 ? { word } : {}
|
|
2532
|
-
} };
|
|
2533
|
-
};
|
|
2534
|
-
const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
|
|
2535
|
-
index: breakpointIndex,
|
|
2536
|
-
kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
|
|
2537
|
-
pattern: rule.pattern ?? rule.regex,
|
|
2538
|
-
...wordIndex !== void 0 ? { wordIndex } : {},
|
|
2539
|
-
...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
|
|
2540
|
-
} });
|
|
2541
|
-
/**
|
|
2542
|
-
* Helper to format the debug info into a human-readable string.
|
|
2543
|
-
* @param meta - The segment metadata object
|
|
2544
|
-
* @param options - Formatting options
|
|
2545
|
-
*/
|
|
2546
|
-
const formatRuleReason = (rule, concise) => {
|
|
2547
|
-
const { index, patternType, wordIndex, word } = rule;
|
|
2548
|
-
if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
|
|
2549
|
-
const wordInfo = word ? ` (Matched: "${word}")` : "";
|
|
2550
|
-
return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
|
|
2551
|
-
};
|
|
2552
|
-
const formatBreakpointReason = (breakpoint, concise) => {
|
|
2553
|
-
const { index, kind, pattern, wordIndex, word } = breakpoint;
|
|
2554
|
-
if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
|
|
2555
|
-
if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
|
|
2556
|
-
if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
|
|
2557
|
-
return `Breakpoint #${index} (${kind}) - "${pattern}"`;
|
|
2558
|
-
};
|
|
2559
|
-
const formatContentLengthReason = (split, concise) => {
|
|
2560
|
-
const { maxContentLength, splitReason } = split;
|
|
2561
|
-
if (concise) return `> ${maxContentLength} (${splitReason})`;
|
|
2562
|
-
return `Safety Split (${splitReason}) > ${maxContentLength}`;
|
|
2563
|
-
};
|
|
2564
|
-
/**
|
|
2565
|
-
* Helper to format the debug info into a human-readable string.
|
|
2566
|
-
* @param meta - The segment metadata object
|
|
2567
|
-
* @param options - Formatting options
|
|
2568
|
-
*/
|
|
2569
|
-
const getDebugReason = (meta, options) => {
|
|
2570
|
-
const debug = meta?._flappa;
|
|
2571
|
-
if (!debug) return "-";
|
|
2572
|
-
const concise = options?.concise;
|
|
2573
|
-
if (debug.rule) return formatRuleReason(debug.rule, concise);
|
|
2574
|
-
if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
|
|
2575
|
-
if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
|
|
2576
|
-
return "Unknown";
|
|
2577
|
-
};
|
|
2578
|
-
/**
|
|
2579
|
-
* Convenience helper to get the formatted debug reason directly from a segment.
|
|
2580
|
-
* @param segment - The segment object
|
|
2581
|
-
* @param options - Formatting options
|
|
2582
|
-
*/
|
|
2583
|
-
const getSegmentDebugReason = (segment, options) => {
|
|
2584
|
-
return getDebugReason(segment.meta, options);
|
|
2585
|
-
};
|
|
2586
|
-
//#endregion
|
|
2587
3671
|
//#region src/segmentation/pattern-validator.ts
|
|
2588
3672
|
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
2589
3673
|
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
2590
|
-
const
|
|
3674
|
+
const BARE_TOKEN_REGEX = (() => {
|
|
2591
3675
|
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
|
|
2592
3676
|
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
|
|
3677
|
+
})();
|
|
3678
|
+
const createMalformedTokenIssue = (tokenLiteral, side) => {
|
|
3679
|
+
const token = tokenLiteral.split(":", 1)[0] || void 0;
|
|
3680
|
+
return {
|
|
3681
|
+
message: `Token "${tokenLiteral || "unknown"}" appears to be missing ${side} braces.`,
|
|
3682
|
+
suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
|
|
3683
|
+
token,
|
|
3684
|
+
type: "missing_braces"
|
|
3685
|
+
};
|
|
3686
|
+
};
|
|
3687
|
+
const detectMalformedLeftToken = (pattern) => {
|
|
3688
|
+
for (let index = 0; index < pattern.length - 1; index++) {
|
|
3689
|
+
if (pattern.slice(index, index + 2) !== "{{") continue;
|
|
3690
|
+
const closeIndex = pattern.indexOf("}}", index + 2);
|
|
3691
|
+
if (closeIndex === -1) return createMalformedTokenIssue(pattern.slice(index + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "", "closing");
|
|
3692
|
+
index = closeIndex + 1;
|
|
3693
|
+
}
|
|
3694
|
+
};
|
|
3695
|
+
const detectMalformedRightToken = (pattern) => {
|
|
3696
|
+
for (let index = 0; index < pattern.length - 1; index++) {
|
|
3697
|
+
if (pattern.slice(index, index + 2) !== "}}") continue;
|
|
3698
|
+
if (pattern.lastIndexOf("{{", index) === -1) return createMalformedTokenIssue(pattern.slice(0, index).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "", "opening");
|
|
3699
|
+
}
|
|
2593
3700
|
};
|
|
3701
|
+
const detectMalformedToken = (pattern) => detectMalformedLeftToken(pattern) ?? detectMalformedRightToken(pattern);
|
|
2594
3702
|
/**
|
|
2595
3703
|
* Validates a single pattern for common issues.
|
|
2596
3704
|
*/
|
|
@@ -2608,14 +3716,16 @@ const validatePattern = (pattern, seenPatterns) => {
|
|
|
2608
3716
|
TOKEN_INSIDE_BRACES.lastIndex = 0;
|
|
2609
3717
|
for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
|
|
2610
3718
|
const name = match[1];
|
|
2611
|
-
if (!KNOWN_TOKENS.has(name)) return {
|
|
3719
|
+
if (name && !KNOWN_TOKENS.has(name)) return {
|
|
2612
3720
|
message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
|
|
2613
3721
|
suggestion: "Check spelling or use a known token",
|
|
2614
3722
|
token: name,
|
|
2615
3723
|
type: "unknown_token"
|
|
2616
3724
|
};
|
|
2617
3725
|
}
|
|
2618
|
-
|
|
3726
|
+
const malformed = detectMalformedToken(pattern);
|
|
3727
|
+
if (malformed) return malformed;
|
|
3728
|
+
for (const match of pattern.matchAll(BARE_TOKEN_REGEX)) {
|
|
2619
3729
|
const [full, name] = match;
|
|
2620
3730
|
const idx = match.index;
|
|
2621
3731
|
if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
|
|
@@ -2642,14 +3752,14 @@ const applyRulePatternValidation = (result, key, patterns) => {
|
|
|
2642
3752
|
return true;
|
|
2643
3753
|
};
|
|
2644
3754
|
const validateTemplateRule = (rule, result) => {
|
|
2645
|
-
if (
|
|
3755
|
+
if (!("template" in rule)) return false;
|
|
2646
3756
|
const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
|
|
2647
3757
|
if (!issue) return false;
|
|
2648
3758
|
result.template = issue;
|
|
2649
3759
|
return true;
|
|
2650
3760
|
};
|
|
2651
3761
|
const validateRegexRule = (rule, result) => {
|
|
2652
|
-
if (
|
|
3762
|
+
if (!("regex" in rule)) return false;
|
|
2653
3763
|
if (!rule.regex.trim()) {
|
|
2654
3764
|
result.regex = {
|
|
2655
3765
|
message: "Empty pattern is not allowed",
|
|
@@ -2673,18 +3783,31 @@ const invalidDictionaryEntryIssue = (message) => ({
|
|
|
2673
3783
|
message,
|
|
2674
3784
|
type: "invalid_option"
|
|
2675
3785
|
});
|
|
3786
|
+
const addBooleanDictionaryEntryIssue = (issues, key, value) => {
|
|
3787
|
+
if (value !== void 0 && typeof value !== "boolean") issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
|
|
3788
|
+
};
|
|
3789
|
+
const addCaptureNameIssue = (issues, captureName) => {
|
|
3790
|
+
if (captureName !== void 0 && !/^[A-Za-z_]\w*$/.test(captureName)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
|
|
3791
|
+
};
|
|
3792
|
+
const addMinLettersIssue = (issues, minLetters) => {
|
|
3793
|
+
if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
|
|
3794
|
+
};
|
|
3795
|
+
const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
|
|
3796
|
+
const min = minLetters ?? 2;
|
|
3797
|
+
if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < min)) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${min}`);
|
|
3798
|
+
};
|
|
2676
3799
|
const validateDictionaryEntryRule = (rule, result) => {
|
|
2677
3800
|
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
|
|
2678
3801
|
const issues = {};
|
|
2679
3802
|
const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
|
|
2680
3803
|
if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
|
|
2681
|
-
|
|
2682
|
-
|
|
2683
|
-
|
|
2684
|
-
|
|
2685
|
-
|
|
2686
|
-
|
|
2687
|
-
|
|
3804
|
+
addBooleanDictionaryEntryIssue(issues, "allowCommaSeparated", allowCommaSeparated);
|
|
3805
|
+
addBooleanDictionaryEntryIssue(issues, "allowParenthesized", allowParenthesized);
|
|
3806
|
+
addBooleanDictionaryEntryIssue(issues, "allowWhitespaceBeforeColon", allowWhitespaceBeforeColon);
|
|
3807
|
+
addBooleanDictionaryEntryIssue(issues, "midLineSubentries", midLineSubentries);
|
|
3808
|
+
addCaptureNameIssue(issues, captureName);
|
|
3809
|
+
addMinLettersIssue(issues, minLetters);
|
|
3810
|
+
addMaxLettersIssue(issues, maxLetters, minLetters);
|
|
2688
3811
|
if (Object.keys(issues).length === 0) return false;
|
|
2689
3812
|
result.dictionaryEntry = issues;
|
|
2690
3813
|
return true;
|
|
@@ -2718,9 +3841,9 @@ const formatValidationIssue = (_type, issue, loc) => {
|
|
|
2718
3841
|
*/
|
|
2719
3842
|
const validateRules = (rules) => rules.map((rule) => {
|
|
2720
3843
|
const result = {};
|
|
2721
|
-
const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
|
|
2722
|
-
const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
|
|
2723
|
-
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
|
|
3844
|
+
const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", "lineStartsWith" in rule ? rule.lineStartsWith : void 0);
|
|
3845
|
+
const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0);
|
|
3846
|
+
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", "lineEndsWith" in rule ? rule.lineEndsWith : void 0);
|
|
2724
3847
|
const templateIssues = validateTemplateRule(rule, result);
|
|
2725
3848
|
const regexIssues = validateRegexRule(rule, result);
|
|
2726
3849
|
const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
|
|
@@ -3961,14 +5084,20 @@ const mergeRecord = (existing, incoming) => existing || incoming ? {
|
|
|
3961
5084
|
...existing ?? {},
|
|
3962
5085
|
...incoming ?? {}
|
|
3963
5086
|
} : void 0;
|
|
5087
|
+
const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3964
5088
|
const mergeSplitPoints = (existing, incoming) => {
|
|
3965
5089
|
const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
|
|
3966
5090
|
const fallback = preferred === incoming ? existing : incoming;
|
|
5091
|
+
const meta = mergeRecord(existing.meta, incoming.meta);
|
|
5092
|
+
if (meta && isPlainObject(existing.meta?._flappa) && isPlainObject(incoming.meta?._flappa)) meta._flappa = {
|
|
5093
|
+
...existing.meta._flappa,
|
|
5094
|
+
...incoming.meta._flappa
|
|
5095
|
+
};
|
|
3967
5096
|
return {
|
|
3968
5097
|
...fallback,
|
|
3969
5098
|
...preferred,
|
|
3970
5099
|
contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
|
|
3971
|
-
meta
|
|
5100
|
+
meta,
|
|
3972
5101
|
namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
|
|
3973
5102
|
};
|
|
3974
5103
|
};
|
|
@@ -4094,7 +5223,7 @@ const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
|
|
|
4094
5223
|
* });
|
|
4095
5224
|
*/
|
|
4096
5225
|
const segmentPages = (pages, options) => {
|
|
4097
|
-
const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
|
|
5226
|
+
const { dictionary, rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
|
|
4098
5227
|
if (maxContentLength && maxContentLength < 50) throw new Error(`maxContentLength must be at least 50 characters.`);
|
|
4099
5228
|
const maxPages = options.maxPages ?? Number.MAX_SAFE_INTEGER;
|
|
4100
5229
|
const hasLimits = options.maxPages !== void 0 || maxContentLength !== void 0;
|
|
@@ -4118,13 +5247,17 @@ const segmentPages = (pages, options) => {
|
|
|
4118
5247
|
pageIds: pageMap.pageIds,
|
|
4119
5248
|
totalContentLength: matchContent.length
|
|
4120
5249
|
});
|
|
4121
|
-
const
|
|
5250
|
+
const splitPointsFromRules = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
|
|
5251
|
+
const splitPointsFromDictionary = dictionary ? collectDictionarySplitPoints(preprocessedPages, dictionary, pageMap, normalizedContent, logger, debugMetaKey) : [];
|
|
5252
|
+
const splitPoints = [...splitPointsFromRules, ...splitPointsFromDictionary];
|
|
4122
5253
|
const unique = dedupeSplitPoints(splitPoints);
|
|
4123
5254
|
logger?.debug?.("[segmenter] split points collected", {
|
|
5255
|
+
dictionarySplitPoints: splitPointsFromDictionary.length,
|
|
4124
5256
|
rawSplitPoints: splitPoints.length,
|
|
5257
|
+
ruleSplitPoints: splitPointsFromRules.length,
|
|
4125
5258
|
uniqueSplitPoints: unique.length
|
|
4126
5259
|
});
|
|
4127
|
-
let segments = buildSegments(unique, matchContent, pageMap, rules, pageJoiner);
|
|
5260
|
+
let segments = buildSegments(unique, matchContent, pageMap, rules, pageJoiner, dictionary !== void 0);
|
|
4128
5261
|
logger?.debug?.("[segmenter] structural segments built", { segmentCount: segments.length });
|
|
4129
5262
|
segments = ensureFallbackSegment(segments, preprocessedPages, normalizedContent, pageJoiner);
|
|
4130
5263
|
if (hasLimits) {
|
|
@@ -4151,7 +5284,7 @@ const segmentPages = (pages, options) => {
|
|
|
4151
5284
|
* @param rules - Original rules (for constraint checking on first segment)
|
|
4152
5285
|
* @returns Array of segment objects
|
|
4153
5286
|
*/
|
|
4154
|
-
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
5287
|
+
const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner, hasDictionaryProfile) => {
|
|
4155
5288
|
const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
|
|
4156
5289
|
const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
|
|
4157
5290
|
const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
|
|
@@ -4195,14 +5328,16 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
|
|
|
4195
5328
|
};
|
|
4196
5329
|
const segments = [];
|
|
4197
5330
|
if (!splitPoints.length) {
|
|
4198
|
-
|
|
5331
|
+
const firstId = pageMap.getId(0);
|
|
5332
|
+
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
4199
5333
|
const s = createSegment(0, content.length);
|
|
4200
5334
|
if (s) segments.push(s);
|
|
4201
5335
|
}
|
|
4202
5336
|
return segments;
|
|
4203
5337
|
}
|
|
4204
5338
|
if (splitPoints[0].index > 0) {
|
|
4205
|
-
|
|
5339
|
+
const firstId = pageMap.getId(0);
|
|
5340
|
+
if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
|
|
4206
5341
|
const s = createSegment(0, splitPoints[0].index);
|
|
4207
5342
|
if (s) segments.push(s);
|
|
4208
5343
|
}
|
|
@@ -4603,6 +5738,6 @@ const validateSegments = (pages, options, segments, validationOptions) => {
|
|
|
4603
5738
|
};
|
|
4604
5739
|
};
|
|
4605
5740
|
//#endregion
|
|
4606
|
-
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
|
|
5741
|
+
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
|
|
4607
5742
|
|
|
4608
5743
|
//# sourceMappingURL=index.mjs.map
|