flappa-doormal 2.18.0 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -148,7 +148,7 @@ numbered: "{{raqms}} {{dash}} " };
148
148
  const expandCompositeTokensInTemplate = (template) => {
149
149
  let out = template;
150
150
  for (let i = 0; i < 10; i++) {
151
- const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => COMPOSITE_TOKENS[tokenName] ?? m);
151
+ const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => tokenName in COMPOSITE_TOKENS ? COMPOSITE_TOKENS[tokenName] : m);
152
152
  if (next === out) break;
153
153
  out = next;
154
154
  }
@@ -162,7 +162,8 @@ const expandCompositeTokensInTemplate = (template) => {
162
162
  * @returns Expanded pattern with base tokens replaced
163
163
  * @internal
164
164
  */
165
- const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`);
165
+ const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => tokenName in BASE_TOKENS ? BASE_TOKENS[tokenName] : `{{${tokenName}}}`);
166
+ const EXPANDED_COMPOSITE_TOKENS = Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([key, value]) => [key, expandBaseTokens(value)]));
166
167
  /**
167
168
  * Token definitions mapping human-readable token names to regex patterns.
168
169
  *
@@ -190,7 +191,7 @@ const expandBaseTokens = (template) => template.replace(/\{\{(\w+)\}\}/g, (_, to
190
191
  */
191
192
  const TOKEN_PATTERNS = {
192
193
  ...BASE_TOKENS,
193
- ...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
194
+ ...EXPANDED_COMPOSITE_TOKENS
194
195
  };
195
196
  /**
196
197
  * Regex pattern for matching tokens with optional named capture syntax.
@@ -283,8 +284,8 @@ const expandTokenLiteral = (literal, opts) => {
283
284
  if (!parsed) return literal;
284
285
  const { tokenName, captureName } = parsed;
285
286
  if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
287
+ if (!(tokenName in TOKEN_PATTERNS)) return literal;
286
288
  let tokenPattern = TOKEN_PATTERNS[tokenName];
287
- if (!tokenPattern) return literal;
288
289
  tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
289
290
  if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
290
291
  return tokenPattern;
@@ -490,7 +491,7 @@ const applyTokenMappings = (template, mappings) => {
490
491
  * // → '{{raqms}} {{dash}}'
491
492
  */
492
493
  const stripTokenMappings = (template) => {
493
- return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
494
+ return template.replace(/\{\{([^:}]*)?:[^}]+\}\}/g, (_match, tokenName) => `{{${tokenName ?? ""}}}`);
494
495
  };
495
496
  //#endregion
496
497
  //#region src/utils/textUtils.ts
@@ -1279,6 +1280,451 @@ const analyzeTextForRule = (text) => {
1279
1280
  };
1280
1281
  };
1281
1282
  //#endregion
1283
+ //#region src/dictionary/arabic-dictionary-rule.ts
1284
+ const uniqueCanonicalWords = (words) => {
1285
+ const seen = /* @__PURE__ */ new Set();
1286
+ const result = [];
1287
+ for (const word of words) {
1288
+ const normalized = normalizeArabicForComparison(word);
1289
+ if (!normalized || seen.has(normalized)) continue;
1290
+ seen.add(normalized);
1291
+ result.push(word);
1292
+ }
1293
+ return result;
1294
+ };
1295
+ const buildStopAlternation = (stopWords) => {
1296
+ const unique = uniqueCanonicalWords(stopWords);
1297
+ if (unique.length === 0) return "";
1298
+ return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
1299
+ };
1300
+ const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
1301
+ if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
1302
+ const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
1303
+ return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
1304
+ };
1305
+ const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
1306
+ const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
1307
+ const withCapture = `(?<${captureName}>${headwordBody})`;
1308
+ if (!allowParenthesized) return `${withCapture}${colon}`;
1309
+ return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
1310
+ };
1311
+ const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
1312
+ if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
1313
+ if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
1314
+ if (!/^[A-Za-z_]\w*$/.test(captureName)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
1315
+ };
1316
+ const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
1317
+ validateDictionaryEntryOptions({
1318
+ captureName,
1319
+ maxLetters,
1320
+ minLetters
1321
+ });
1322
+ const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
1323
+ const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
1324
+ const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
1325
+ const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
1326
+ const stopAlternation = buildStopAlternation(stopWords);
1327
+ const lemmaBody = buildHeadwordBody({
1328
+ allowCommaSeparated,
1329
+ colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
1330
+ stopAlternation,
1331
+ stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
1332
+ unit: lemmaUnit
1333
+ });
1334
+ const lineStartBoundary = `(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`;
1335
+ const midLineTrigger = allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`;
1336
+ const prefixedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
1337
+ const regex = `(?:${lineStartBoundary}${midLineSubentries ? `|${midLineTrigger}` : ""})` + buildBalancedMarker({
1338
+ allowParenthesized,
1339
+ allowWhitespaceBeforeColon,
1340
+ captureName: prefixedCaptureName,
1341
+ headwordBody: lemmaBody
1342
+ });
1343
+ return {
1344
+ captureNames: [prefixedCaptureName],
1345
+ regex
1346
+ };
1347
+ };
1348
+ /**
1349
+ * Creates a reusable split rule for Arabic dictionary entries.
1350
+ *
1351
+ * The returned rule preserves authoring intent as a serializable
1352
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
1353
+ * regex string.
1354
+ *
1355
+ * @example
1356
+ * createArabicDictionaryEntryRule({
1357
+ * stopWords: ['وقيل', 'ويقال', 'قال'],
1358
+ * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1359
+ * })
1360
+ *
1361
+ * @example
1362
+ * createArabicDictionaryEntryRule({
1363
+ * allowParenthesized: true,
1364
+ * allowWhitespaceBeforeColon: true,
1365
+ * allowCommaSeparated: true,
1366
+ * stopWords: ['الليث', 'العجاج'],
1367
+ * })
1368
+ */
1369
+ /**
1370
+ * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
1371
+ * whole-book dictionary segmentation. Keep this helper for advanced single-rule
1372
+ * composition inside a broader `SplitRule[]` pipeline.
1373
+ */
1374
+ const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
1375
+ validateDictionaryEntryOptions({
1376
+ captureName,
1377
+ maxLetters,
1378
+ minLetters
1379
+ });
1380
+ return {
1381
+ dictionaryEntry: {
1382
+ allowCommaSeparated,
1383
+ allowParenthesized,
1384
+ allowWhitespaceBeforeColon,
1385
+ captureName,
1386
+ maxLetters,
1387
+ midLineSubentries,
1388
+ minLetters,
1389
+ stopWords: uniqueCanonicalWords(stopWords)
1390
+ },
1391
+ meta,
1392
+ pageStartPrevWordStoplist,
1393
+ samePagePrevWordStoplist
1394
+ };
1395
+ };
1396
+ //#endregion
1397
+ //#region src/dictionary/heading-classifier.ts
1398
+ const HEADING_PREFIX$1 = "## ";
1399
+ const CODE_LINE_PATTERN$1 = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
1400
+ const ARABIC_WORD_PATTERN = ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN;
1401
+ const PLAIN_ENTRY_RE = new RegExp(`^(?<lemma>${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}|[([{]${ARABIC_WORD_PATTERN}(?:\\s+${ARABIC_WORD_PATTERN}){0,1}[)\\]}])\\s*:`, "u");
1402
+ const INLINE_SUBENTRY_RE = new RegExp(`(^|[\\s،؛,:.])(?<lemma>و${ARABIC_WORD_PATTERN})\\s*:`, "gu");
1403
+ const CODE_LINE_RE = new RegExp(`^(?:[[(])?(?<codes>${CODE_LINE_PATTERN$1})(?:[)\\]])?$`, "u");
1404
+ const PAIRED_FORMS_RE = new RegExp(`^(?<forms>${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+)\\s*:`, "u");
1405
+ const ARABIC_BOUNDARY_OR_PUNCTUATION = "(?=$|[\\s:،؛()\\[\\]{}\\-–—]|[^\\p{Script=Arabic}])";
1406
+ const CHAPTER_HEADING_RE = new RegExp(`^(?:[([{]\\s*)?(?:باب|فصل|كتاب|حرف|أبواب)${ARABIC_BOUNDARY_OR_PUNCTUATION}`, "u");
1407
+ const CLUSTER_HEADING_RE = new RegExp(`^(?:\\(?\\s*)?(?:أبواب|أبنية)${ARABIC_BOUNDARY_OR_PUNCTUATION}|^(?=.{1,80}$).+?[،,].+?(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\\s])`, "u");
1408
+ const STATUS_HEADING_RE = new RegExp(`^(?:${CODE_LINE_PATTERN$1}|(?:(?:${ARABIC_WORD_PATTERN}\\s+){1,3}${ARABIC_WORD_PATTERN}|${ARABIC_WORD_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_PATTERN})+))\\s*:?[\\s]*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)(?=$|[.،,:؛\\s])`, "u");
1409
+ const CODE_NOTE_HEADING_RE = new RegExp(`^(?:${ARABIC_WORD_PATTERN}\\s+){1,3}\\(.+\\)$`, "u");
1410
+ const COLON_NOISE_RE = /^.+:\s*.+$/u;
1411
+ const CHAPTER_TERMS = [
1412
+ "باب",
1413
+ "فصل",
1414
+ "كتاب",
1415
+ "حرف",
1416
+ "أبواب"
1417
+ ];
1418
+ const MARKER_PREFIXES = [
1419
+ "بسم الله",
1420
+ "توكلت على الله",
1421
+ "آخر كتاب",
1422
+ "ويتلوه"
1423
+ ];
1424
+ const NOISE_TOKENS = [
1425
+ "قال",
1426
+ "وقيل",
1427
+ "ويقال",
1428
+ "وفي",
1429
+ "يعني",
1430
+ "فإذا"
1431
+ ];
1432
+ const emptyCounts = () => ({
1433
+ chapter: 0,
1434
+ cluster: 0,
1435
+ codeLine: 0,
1436
+ entry: 0,
1437
+ inlineSubentry: 0,
1438
+ lineEntry: 0,
1439
+ marker: 0,
1440
+ noise: 0,
1441
+ pairedForms: 0
1442
+ });
1443
+ const extractWrappedLemma = (lemma) => lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim();
1444
+ const stripLeadingWrappers = (text) => text.replace(/^[[{(]+\s*/u, "").trim();
1445
+ const isDelimitedPrefixMatch$1 = (text, prefix) => {
1446
+ if (text === prefix) return true;
1447
+ if (!text.startsWith(prefix)) return false;
1448
+ const nextChar = text[prefix.length];
1449
+ return nextChar === void 0 || /[\s:،؛()[\]{}\-–—]/u.test(nextChar);
1450
+ };
1451
+ const isCodeHeading = (text) => {
1452
+ if (CODE_LINE_RE.test(text)) return true;
1453
+ const words = text.trim().split(/\s+/u).filter(Boolean);
1454
+ return words.length === 1 && (words[0]?.length ?? 0) === 1;
1455
+ };
1456
+ const looksLikeNoiseHeading = (text) => {
1457
+ const normalized = normalizeArabicForComparison(text);
1458
+ const wordCount = text.trim().split(/\s+/u).filter(Boolean).length;
1459
+ if (/(?:مستعمل|مهمل|مستعملة|مستعملان)(?=$|[.،,:؛\s])/u.test(text)) return false;
1460
+ if (wordCount >= 8 && COLON_NOISE_RE.test(text)) return true;
1461
+ return NOISE_TOKENS.some((token) => normalized.includes(normalizeArabicForComparison(token))) && wordCount >= 4;
1462
+ };
1463
+ /**
1464
+ * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
1465
+ */
1466
+ const classifyDictionaryHeading = (line) => {
1467
+ const text = line.startsWith(HEADING_PREFIX$1) ? line.slice(3).trim() : line.trim();
1468
+ const unwrapped = stripLeadingWrappers(text);
1469
+ if (!text) return "noise";
1470
+ if (CHAPTER_HEADING_RE.test(text) || CHAPTER_TERMS.some((term) => isDelimitedPrefixMatch$1(normalizeArabicForComparison(unwrapped), normalizeArabicForComparison(term)))) return "chapter";
1471
+ if (looksLikeNoiseHeading(text)) return "noise";
1472
+ if (isCodeHeading(text)) return "marker";
1473
+ if (MARKER_PREFIXES.some((token) => normalizeArabicForComparison(unwrapped).startsWith(normalizeArabicForComparison(token)))) return "marker";
1474
+ if (STATUS_HEADING_RE.test(text) || CODE_NOTE_HEADING_RE.test(text)) return "marker";
1475
+ if (CLUSTER_HEADING_RE.test(text)) return "cluster";
1476
+ return "entry";
1477
+ };
1478
+ const createHeadingMatch = (kind, page, rawLine, lineNumber) => ({
1479
+ kind,
1480
+ lemma: kind === "entry" ? rawLine.slice(3).trim() : void 0,
1481
+ line: lineNumber,
1482
+ pageId: page.id,
1483
+ text: rawLine
1484
+ });
1485
+ const createSurfaceMatch = (kind, page, text, lineNumber, lemma) => ({
1486
+ kind,
1487
+ lemma,
1488
+ line: lineNumber,
1489
+ pageId: page.id,
1490
+ text
1491
+ });
1492
+ const scanHeadingLine = (page, rawLine, lineNumber, matches) => {
1493
+ if (!rawLine.startsWith(HEADING_PREFIX$1)) return false;
1494
+ const kind = classifyDictionaryHeading(rawLine);
1495
+ matches.push(createHeadingMatch(kind, page, rawLine, lineNumber));
1496
+ return true;
1497
+ };
1498
+ const scanLineEntry = (page, rawLine, lineNumber, matches) => {
1499
+ const lineEntry = rawLine.match(PLAIN_ENTRY_RE);
1500
+ if (!lineEntry?.groups?.lemma) return;
1501
+ matches.push(createSurfaceMatch("lineEntry", page, rawLine, lineNumber, extractWrappedLemma(lineEntry.groups.lemma)));
1502
+ };
1503
+ const scanPairedForms = (page, rawLine, lineNumber, matches) => {
1504
+ const pairedForms = rawLine.match(PAIRED_FORMS_RE);
1505
+ if (!pairedForms?.groups?.forms) return;
1506
+ matches.push(createSurfaceMatch("pairedForms", page, rawLine, lineNumber, pairedForms.groups.forms));
1507
+ };
1508
+ const scanCodeLine = (page, rawLine, lineNumber, matches) => {
1509
+ const codeLine = rawLine.match(CODE_LINE_RE);
1510
+ if (!codeLine?.groups?.codes) return;
1511
+ matches.push(createSurfaceMatch("codeLine", page, rawLine, lineNumber, codeLine.groups.codes));
1512
+ };
1513
+ const scanInlineSubentries = (page, rawLine, lineNumber, matches) => {
1514
+ for (const match of rawLine.matchAll(INLINE_SUBENTRY_RE)) {
1515
+ if (!match.groups?.lemma) continue;
1516
+ matches.push(createSurfaceMatch("inlineSubentry", page, match.groups.lemma, lineNumber, match.groups.lemma));
1517
+ }
1518
+ };
1519
+ /**
1520
+ * Extracts dictionary surface matches from a markdown page.
1521
+ */
1522
+ const scanDictionaryMarkdownPage = (page) => {
1523
+ const lines = page.content.split(/\n/u);
1524
+ const matches = [];
1525
+ for (let index = 0; index < lines.length; index++) {
1526
+ const rawLine = lines[index]?.trim() ?? "";
1527
+ if (!rawLine) continue;
1528
+ if (scanHeadingLine(page, rawLine, index + 1, matches)) continue;
1529
+ scanLineEntry(page, rawLine, index + 1, matches);
1530
+ scanPairedForms(page, rawLine, index + 1, matches);
1531
+ scanCodeLine(page, rawLine, index + 1, matches);
1532
+ scanInlineSubentries(page, rawLine, index + 1, matches);
1533
+ }
1534
+ return matches;
1535
+ };
1536
+ /**
1537
+ * Aggregates dictionary surface counts across markdown pages.
1538
+ */
1539
+ const analyzeDictionaryMarkdownPages = (pages) => {
1540
+ const counts = emptyCounts();
1541
+ const matches = [];
1542
+ for (const page of pages) {
1543
+ const pageMatches = scanDictionaryMarkdownPage(page);
1544
+ for (const match of pageMatches) {
1545
+ counts[match.kind] += 1;
1546
+ matches.push(match);
1547
+ }
1548
+ }
1549
+ return {
1550
+ counts,
1551
+ matches
1552
+ };
1553
+ };
1554
+ //#endregion
1555
+ //#region src/dictionary/profile.ts
1556
+ const normalizedProfileCache = /* @__PURE__ */ new WeakMap();
1557
+ const normalizeStopLemmaWord = (word) => normalizeArabicForComparison(word).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
1558
+ const uniqueNormalizedSet = (values, normalize) => new Set(values.map(normalize).filter(Boolean));
1559
+ const assertNever$2 = (value) => {
1560
+ throw new Error(`Unhandled dictionary profile variant: ${JSON.stringify(value)}`);
1561
+ };
1562
+ const normalizeFamily = (family) => {
1563
+ switch (family.use) {
1564
+ case "heading": return {
1565
+ ...family,
1566
+ allowNextLineColon: family.allowNextLineColon ?? false,
1567
+ allowSingleLetter: family.allowSingleLetter ?? false
1568
+ };
1569
+ case "lineEntry": return {
1570
+ ...family,
1571
+ allowMultiWord: family.allowMultiWord ?? false,
1572
+ allowWhitespaceBeforeColon: family.allowWhitespaceBeforeColon ?? false,
1573
+ wrappers: family.wrappers ?? "none"
1574
+ };
1575
+ case "inlineSubentry": return {
1576
+ ...family,
1577
+ prefixes: family.prefixes ?? ["و"],
1578
+ stripPrefixesFromLemma: family.stripPrefixesFromLemma ?? true
1579
+ };
1580
+ case "codeLine": return {
1581
+ ...family,
1582
+ wrappers: family.wrappers ?? "either"
1583
+ };
1584
+ case "pairedForms": return {
1585
+ ...family,
1586
+ requireStatusTail: family.requireStatusTail ?? false,
1587
+ separator: family.separator ?? "comma"
1588
+ };
1589
+ default: return assertNever$2(family);
1590
+ }
1591
+ };
1592
+ const normalizeBlocker = (blocker) => {
1593
+ switch (blocker.use) {
1594
+ case "authorityIntro": return {
1595
+ ...blocker,
1596
+ precision: blocker.precision ?? "high"
1597
+ };
1598
+ case "stopLemma": return {
1599
+ ...blocker,
1600
+ normalizedWords: uniqueNormalizedSet(blocker.words, normalizeStopLemmaWord)
1601
+ };
1602
+ case "previousWord": return {
1603
+ ...blocker,
1604
+ normalizedWords: uniqueNormalizedSet(blocker.words, normalizeArabicForComparison)
1605
+ };
1606
+ case "previousChar": return {
1607
+ ...blocker,
1608
+ charSet: new Set(blocker.chars)
1609
+ };
1610
+ case "intro":
1611
+ case "pageContinuation": return blocker;
1612
+ default: return assertNever$2(blocker);
1613
+ }
1614
+ };
1615
+ const normalizeZone = (zone) => ({
1616
+ blockers: (zone.blockers ?? []).map(normalizeBlocker),
1617
+ families: zone.families.map(normalizeFamily),
1618
+ name: zone.name,
1619
+ when: zone.when ? {
1620
+ activateAfter: zone.when.activateAfter,
1621
+ maxPageId: zone.when.maxPageId,
1622
+ minPageId: zone.when.minPageId
1623
+ } : void 0
1624
+ });
1625
+ const createIssue$1 = (code, path, message, zoneName) => ({
1626
+ code,
1627
+ message,
1628
+ path,
1629
+ ...zoneName ? { zoneName } : {}
1630
+ });
1631
+ const validateGate = (gate, zone, gateIndex, seenActivateAfterKeys, issues) => {
1632
+ const gatePath = `zones[].when.activateAfter[${gateIndex}]`.replace("[]", `[${zone.name}]`);
1633
+ if (gate.use === "headingText") {
1634
+ if (!gate.match.trim()) issues.push(createIssue$1("invalid_gate_match", `${gatePath}.match`, `dictionary gate match must be non-empty`, zone.name));
1635
+ if (gate.fuzzy !== void 0 && typeof gate.fuzzy !== "boolean") issues.push(createIssue$1("invalid_gate_fuzzy", `${gatePath}.fuzzy`, `dictionary gate fuzzy must be a boolean when provided`, zone.name));
1636
+ }
1637
+ const dedupeKey = `${gate.use}:${JSON.stringify(gate)}`;
1638
+ if (seenActivateAfterKeys.has(dedupeKey)) issues.push(createIssue$1("duplicate_activate_after_gate", gatePath, `dictionary zone "${zone.name}" has duplicate activateAfter gates`, zone.name));
1639
+ seenActivateAfterKeys.add(dedupeKey);
1640
+ };
1641
+ const validateFamily = (family, zone, familyIndex, issues) => {
1642
+ const familyPath = `zones[].families[${familyIndex}]`.replace("[]", `[${zone.name}]`);
1643
+ switch (family.use) {
1644
+ case "heading":
1645
+ if (family.classes.length === 0) issues.push(createIssue$1("empty_heading_classes", `${familyPath}.classes`, `dictionary heading family in zone "${zone.name}" must include at least one class`, zone.name));
1646
+ if (family.emit === "chapter" && !family.classes.includes("chapter")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "chapter" but never matches chapter headings`, zone.name));
1647
+ if (family.emit === "marker" && !family.classes.includes("marker")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "marker" but never matches marker headings`, zone.name));
1648
+ if (family.emit === "entry" && !family.classes.includes("entry")) issues.push(createIssue$1("inert_heading_family", familyPath, `dictionary heading family in zone "${zone.name}" emits "entry" but never matches entry headings`, zone.name));
1649
+ break;
1650
+ case "lineEntry": break;
1651
+ case "inlineSubentry":
1652
+ if (family.prefixes?.some((prefix) => !prefix.trim())) issues.push(createIssue$1("empty_inline_prefixes", `${familyPath}.prefixes`, `inlineSubentry prefixes must be non-empty strings`, zone.name));
1653
+ break;
1654
+ case "codeLine": break;
1655
+ case "pairedForms": break;
1656
+ default: assertNever$2(family);
1657
+ }
1658
+ };
1659
+ const validateBlocker = (blocker, zone, blockerIndex, issues) => {
1660
+ const blockerPath = `zones[].blockers[${blockerIndex}]`.replace("[]", `[${zone.name}]`);
1661
+ switch (blocker.use) {
1662
+ case "stopLemma":
1663
+ if (blocker.words.length === 0 || blocker.words.some((word) => !word.trim())) issues.push(createIssue$1("invalid_stop_words", `${blockerPath}.words`, `stopLemma blocker in zone "${zone.name}" must include non-empty words`, zone.name));
1664
+ break;
1665
+ case "previousWord":
1666
+ if (blocker.words.length === 0 || blocker.words.some((word) => !word.trim())) issues.push(createIssue$1("invalid_previous_words", `${blockerPath}.words`, `previousWord blocker in zone "${zone.name}" must include non-empty words`, zone.name));
1667
+ break;
1668
+ case "previousChar":
1669
+ if (blocker.chars.length === 0 || blocker.chars.some((char) => !char)) issues.push(createIssue$1("invalid_previous_chars", `${blockerPath}.chars`, `previousChar blocker in zone "${zone.name}" must include chars`, zone.name));
1670
+ break;
1671
+ case "authorityIntro":
1672
+ case "intro":
1673
+ case "pageContinuation": break;
1674
+ default: assertNever$2(blocker);
1675
+ }
1676
+ };
1677
+ var DictionaryProfileValidationError = class extends Error {
1678
+ issues;
1679
+ constructor(issues) {
1680
+ super(issues.length === 1 ? issues[0].message : `Dictionary profile validation failed with ${issues.length} issues`);
1681
+ this.name = "DictionaryProfileValidationError";
1682
+ this.issues = issues;
1683
+ }
1684
+ };
1685
+ const validateZone = (zone, zoneIndex, seenZoneNames, issues) => {
1686
+ const zonePath = `zones[${zoneIndex}]`;
1687
+ const trimmedName = zone.name.trim();
1688
+ if (!trimmedName) issues.push(createIssue$1("empty_zone_name", `${zonePath}.name`, `dictionary zone name must be non-empty`));
1689
+ else if (seenZoneNames.has(trimmedName)) issues.push(createIssue$1("duplicate_zone_name", `${zonePath}.name`, `dictionary zone names must be unique; duplicated "${trimmedName}"`, trimmedName));
1690
+ else seenZoneNames.add(trimmedName);
1691
+ if (zone.families.length === 0) issues.push(createIssue$1("empty_zone_families", `${zonePath}.families`, `dictionary zone "${zone.name}" must declare at least one family`, zone.name));
1692
+ if (zone.when?.minPageId !== void 0 && zone.when?.maxPageId !== void 0 && zone.when.minPageId > zone.when.maxPageId) issues.push(createIssue$1("invalid_zone_page_range", `${zonePath}.when`, `dictionary zone "${zone.name}" has minPageId greater than maxPageId`, zone.name));
1693
+ const seenActivateAfterKeys = /* @__PURE__ */ new Set();
1694
+ for (let gateIndex = 0; gateIndex < (zone.when?.activateAfter?.length ?? 0); gateIndex++) validateGate(zone.when.activateAfter[gateIndex], zone, gateIndex, seenActivateAfterKeys, issues);
1695
+ for (let familyIndex = 0; familyIndex < zone.families.length; familyIndex++) validateFamily(zone.families[familyIndex], zone, familyIndex, issues);
1696
+ for (let blockerIndex = 0; blockerIndex < (zone.blockers?.length ?? 0); blockerIndex++) validateBlocker(zone.blockers[blockerIndex], zone, blockerIndex, issues);
1697
+ };
1698
+ /**
1699
+ * Validates a dictionary profile without normalizing it.
1700
+ */
1701
+ const validateDictionaryProfile = (profile) => {
1702
+ const issues = [];
1703
+ if (profile.version !== 2) issues.push(createIssue$1("invalid_version", "version", `dictionary profile version must be 2, got ${profile.version}`));
1704
+ if (profile.zones.length === 0) {
1705
+ issues.push(createIssue$1("missing_zones", "zones", `dictionary profile must contain at least one zone`));
1706
+ return issues;
1707
+ }
1708
+ const seenZoneNames = /* @__PURE__ */ new Set();
1709
+ for (let zoneIndex = 0; zoneIndex < profile.zones.length; zoneIndex++) validateZone(profile.zones[zoneIndex], zoneIndex, seenZoneNames, issues);
1710
+ return issues;
1711
+ };
1712
+ /**
1713
+ * Normalizes and validates a dictionary profile before runtime matching.
1714
+ */
1715
+ const normalizeDictionaryProfile = (profile) => {
1716
+ const cached = normalizedProfileCache.get(profile);
1717
+ if (cached) return cached;
1718
+ const issues = validateDictionaryProfile(profile);
1719
+ if (issues.length > 0) throw new DictionaryProfileValidationError(issues);
1720
+ const normalized = {
1721
+ version: 2,
1722
+ zones: profile.zones.map(normalizeZone)
1723
+ };
1724
+ normalizedProfileCache.set(profile, normalized);
1725
+ return normalized;
1726
+ };
1727
+ //#endregion
1282
1728
  //#region src/types/rules.ts
1283
1729
  /**
1284
1730
  * Pattern type key names for split rules.
@@ -1300,9 +1746,850 @@ const PATTERN_TYPE_KEYS = [
1300
1746
  "lineStartsAfter",
1301
1747
  "lineEndsWith",
1302
1748
  "template",
1303
- "regex"
1749
+ "regex",
1750
+ "dictionaryEntry"
1304
1751
  ];
1305
1752
  //#endregion
1753
+ //#region src/segmentation/debug-meta.ts
1754
+ const resolveDebugConfig = (debug) => {
1755
+ if (debug === true) return {
1756
+ includeBreakpoint: true,
1757
+ includeRule: true,
1758
+ metaKey: "_flappa"
1759
+ };
1760
+ if (!debug || typeof debug !== "object") return null;
1761
+ const { metaKey, include } = debug;
1762
+ const includeRule = Array.isArray(include) ? include.includes("rule") : true;
1763
+ return {
1764
+ includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
1765
+ includeRule,
1766
+ metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
1767
+ };
1768
+ };
1769
+ const getRulePatternType = (rule) => {
1770
+ return PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
1771
+ };
1772
+ const isPlainObject$1 = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
1773
+ const mergeDebugIntoMeta = (meta, metaKey, patch) => {
1774
+ const out = meta ? { ...meta } : {};
1775
+ const existing = out[metaKey];
1776
+ out[metaKey] = {
1777
+ ...isPlainObject$1(existing) ? existing : {},
1778
+ ...patch
1779
+ };
1780
+ return out;
1781
+ };
1782
+ const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
1783
+ const patternType = getRulePatternType(rule);
1784
+ const patterns = rule[patternType];
1785
+ const word = wordIndex !== void 0 && Array.isArray(patterns) && patterns[wordIndex] !== void 0 ? patterns[wordIndex] : void 0;
1786
+ return { rule: {
1787
+ index: ruleIndex,
1788
+ patternType,
1789
+ ...wordIndex !== void 0 ? { wordIndex } : {},
1790
+ ...word !== void 0 ? { word } : {}
1791
+ } };
1792
+ };
1793
+ const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
1794
+ index: breakpointIndex,
1795
+ kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
1796
+ pattern: rule.pattern ?? rule.regex,
1797
+ ...wordIndex !== void 0 ? { wordIndex } : {},
1798
+ ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
1799
+ } });
1800
+ /**
1801
+ * Helper to format the debug info into a human-readable string.
1802
+ * @param meta - The segment metadata object
1803
+ * @param options - Formatting options
1804
+ */
1805
+ const formatRuleReason = (rule, concise) => {
1806
+ const { index, patternType, wordIndex, word } = rule;
1807
+ if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
1808
+ const wordInfo = word ? ` (Matched: "${word}")` : "";
1809
+ return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
1810
+ };
1811
+ const formatBreakpointReason = (breakpoint, concise) => {
1812
+ const { index, kind, pattern, wordIndex, word } = breakpoint;
1813
+ if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
1814
+ if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
1815
+ if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
1816
+ return `Breakpoint #${index} (${kind}) - "${pattern}"`;
1817
+ };
1818
+ const formatContentLengthReason = (split, concise) => {
1819
+ const { maxContentLength, splitReason } = split;
1820
+ if (concise) return `> ${maxContentLength} (${splitReason})`;
1821
+ return `Safety Split (${splitReason}) > ${maxContentLength}`;
1822
+ };
1823
+ /**
1824
+ * Helper to format the debug info into a human-readable string.
1825
+ * @param meta - The segment metadata object
1826
+ * @param options - Formatting options
1827
+ */
1828
+ const getDebugReason = (meta, options) => {
1829
+ const debug = meta?._flappa;
1830
+ if (!debug) return "-";
1831
+ const concise = options?.concise;
1832
+ if (debug.rule) return formatRuleReason(debug.rule, concise);
1833
+ if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
1834
+ if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
1835
+ return "Unknown";
1836
+ };
1837
+ /**
1838
+ * Convenience helper to get the formatted debug reason directly from a segment.
1839
+ * @param segment - The segment object
1840
+ * @param options - Formatting options
1841
+ */
1842
+ const getSegmentDebugReason = (segment, options) => {
1843
+ return getDebugReason(segment.meta, options);
1844
+ };
1845
+ //#endregion
1846
+ //#region src/dictionary/runtime.ts
1847
// Phrases that (after Arabic normalization) introduce quoted material — "he said",
// "and in the hadith", "and in a reading", etc. Matched by isIntroCandidate at the
// start of candidate text and by endsWithIntroPhrase at the end of preceding context,
// to suppress entry candidates that merely open a citation.
const INTRO_PHRASES = [
	"وقال",
	"قال",
	"وفي الحديث",
	"في الحديث",
	"وفي حديث",
	"في حديث",
	"وفي رواية",
	"في رواية",
	"وفي قراءة",
	"في قراءة",
	"وفي قول",
	"في قول",
	"وفي كلام",
	"في كلام",
	"ومنه قول",
	"ومنها قول",
	"وقرأ",
	"قرأ",
	"قراءة",
	"حديث",
	"ويقال",
	"وقيل",
	"قلت",
	"فقال",
	"قال الشاعر",
	"أنشد",
	"وأنشد"
];
// Tail phrases: if the text immediately BEFORE a candidate ends with one of these
// (vowelling notes, Quran/hadith citations, honorifics, "Abu/Ibn …"), the candidate
// is treated as citation context rather than a new entry (see endsWithIntroContext).
const INTRO_TAIL_PHRASES = [
	"بفتح",
	"بالفتح",
	"بكسر",
	"بالكسر",
	"بضم",
	"بالضم",
	"بالتحريك",
	"حديث",
	"الحديث",
	"في التنزيل",
	"وفي التنزيل",
	"في التنزيل العزيز",
	"وفي التنزيل العزيز",
	"في مقتل",
	"وفي مقتل",
	"في المجاز",
	"وفي المجاز",
	"من المجاز",
	"ومن المجاز",
	"في رواية",
	"وفي رواية",
	"في قراءة",
	"وفي قراءة",
	"في قول",
	"وفي قول",
	"في كلام",
	"وفي كلام",
	"في صفة",
	"وفي صفة",
	"في خطبته",
	"وفي خطبته",
	"ومنه قول",
	"ومنها قول",
	"يقال لرقبة",
	"على جهتين",
	"قوله جل",
	"قوله جل وعز",
	"جل وعز",
	"ومنه حديث",
	"ومنه الحديث",
	"كرم الله",
	"صلى الله عليه",
	"رضي الله عنه",
	"رضي الله عنها",
	"رضي الله عنهما",
	"قال ابو",
	"وقال ابو",
	"عن ابي",
	"قال ابن",
	"وقال ابن",
	"عن ابن"
];
// Regex fallbacks for citation tails the fixed phrase lists cannot cover
// (an intro word followed by up to N trailing words). Applied to normalized text.
const INTRO_TAIL_PATTERNS = [
	/(?:^|\s)(?:في|وفي|ومنه|ومنها)\s+(?:حديث|الحديث|رواية|قراءة|قول|كلام|مقتل|صفة|خطبته)(?:\s+\S+){0,8}$/u,
	/(?:^|\s)(?:حديث|الحديث|رواية|قراءة|قول|كلام)(?:\s+\S+){1,8}$/u,
	/(?:^|\s)(?:قوله|قول(?:ه|هم)?|قال(?:\s+قائل)?|وقرأ|قرأ|قراءة)\s+(?:جل(?:\s+وعز)?|[^\s]+)$/u,
	/(?:^|\s)(?:ابو|ابي|ابا|ابن|بن|بنت)(?:\s+\S+){1,4}$/u,
	/(?:^|\s)(?:قال|وقال|انشد|وانشد|روي|وروي|اخبر|واخبر)(?:\s+\S+){0,4}$/u
];
// If the part of a lemma after a comma starts with one of these ("i.e.", "it is
// said", "its plural is" …), the comma tail is a gloss rather than a second form
// and the candidate is rejected (see hasBlockedQualifierTail).
const QUALIFIER_TAIL_PREFIXES = [
	"أي",
	"قال",
	"تقول",
	"يقال",
	"يقول",
	"يريد",
	"يُريد",
	"ويقال",
	"ويقول",
	"وجمعه",
	"وجمعها",
	"والجميع",
	"والجمع"
];
// Lemmas starting with these are structural artifacts (volume/section markers),
// not dictionary entries (see looksLikeStructuralLeak).
const STRUCTURAL_LEMMA_PREFIXES = [
	"لجزء",
	"جزء",
	"ومما يستدرك عليه",
	"آخر حرف",
	"كتاب حرف"
];
// Line shapes that look structural: "N - (…)", "(…)", "(…) ## …".
const STRUCTURAL_LINE_PATTERNS = [
	/^\d+\s*-\s*\(.+\)$/u,
	/^\(.+\)$/u,
	/^\(.+\)\s*##\s*/u
];
// A structural-looking line is only rejected when it also mentions one of these
// section keywords (chapter/section/letter/book …) — see looksLikeStructuralLeak.
const STRUCTURAL_LINE_KEYWORDS = [
	"باب",
	"فصل",
	"حرف",
	"أبواب",
	"كتاب",
	"المعجمة",
	"المهملة",
	"المثناة"
];
// When the previous page ends with one of these words and no sentence-final
// punctuation, the current page likely continues mid-sentence, so a candidate at
// its very top is suppressed (see rejectsViaPageContinuationBlocker).
const CONTINUATION_PREV_WORDS = [
	"بفتح",
	"بالفتح",
	"بكسر",
	"بالكسر",
	"بضم",
	"بالضم",
	"بالتحريك",
	"قال",
	"وقال",
	"وقيل",
	"ويقال",
	"يقال",
	"قلت",
	"فقال",
	"قالوا",
	"من",
	"في",
	"على",
	"إذا",
	"نحو",
	"ثم",
	"وجل"
];
1997
// Matches "(and) said Abu/Ibn/<lexicographer>…" or "<lexicographer> <word>" at the
// start of a candidate — an authority citation, not an entry.
const AUTHORITY_RE = /^(?:(?:و)?قال\s+(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\b|(?:أبو|ابن|ثعلب|الليث|الأزهري|الجوهري|الفراء)\s+\S+)/u;
// Known lexicographer names; a candidate whose pre-colon head equals one of these
// (after stop-lemma normalization) is an authority citation (see isAuthorityCandidate).
const AUTHORITY_HEAD_WORDS = [
	"الأزهري",
	"الأصمعي",
	"الأشجعي",
	"الأموي",
	"الأمويّ",
	"الجوهري",
	"الرياشي",
	"الزجاج",
	"الزجاجي",
	"الشيباني",
	"الفراء",
	"الكسائي",
	"اللحياني",
	"الليث",
	"المبرد",
	"المنذري",
	"ثعلب",
	"شمر"
];
// Sentence-final punctuation (Latin and Arabic marks, ellipsis) at end of text.
const STRONG_SENTENCE_TERMINATORS$1 = /[.!?؟؛۔…]$/u;
// Trailing page-wrap noise at a page edge: whitespace, Arabic-Indic/ASCII digits,
// quotes and brackets (typically page numbers and decorations).
const TRAILING_PAGE_WRAP_NOISE$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>]+$/u;
// Page-wrap noise plus word punctuation — stripped before extracting the last word.
const TRAILING_WORD_DELIMITERS$1 = /[\s\u0660-\u0669\d«»"“”'‘’()[\]{}<>.,!?؟؛،:]+$/u;
// Global matcher for Arabic words with optional diacritic marks (pattern constant
// defined elsewhere in this module).
const ARABIC_WORD_REGEX$1 = new RegExp(ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, "gu");
const HEADING_PREFIX = "## ";
// "harfs" token pattern with generic \s+ narrowed to same-line whitespace so a
// code line cannot span multiple lines.
const CODE_LINE_PATTERN = getTokenPattern("harfs").replaceAll("\\s+", "[ \\t]+");
// A lemma that is nothing but a root-letter code sequence.
const BARE_CODE_LEMMA_RE = new RegExp(`^(?:${CODE_LINE_PATTERN})$`, "u");
// "(in) use" / "unused" status markers that follow root listings.
const STATUS_TAIL_PATTERN = "(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة)";
// Zone-gate token -> Arabic heading word (chapter/section/book).
const GATE_TOKEN_MAP = {
	bab: "باب",
	fasl: "فصل",
	kitab: "كتاب"
};
// Characters permitted to follow a gate prefix for a delimited prefix match.
const GATE_DELIMITER_RE = /[\s:،؛()[\]{}\-–—]/u;
2032
/**
 * Exhaustiveness guard: thrown when a switch over runtime variants receives a
 * value none of its cases handled.
 * @param value - The unexpected variant (serialized into the error message)
 */
const assertNever$1 = (value) => {
  const serialized = JSON.stringify(value);
  throw new Error(`Unhandled dictionary runtime variant: ${serialized}`);
};
2035
// Per-family compiled-regex caches, keyed by the family config object itself.
// WeakMap keys let cached regexes be collected when a profile's family objects go away.
const lineEntryRegexCache = /* @__PURE__ */ new WeakMap();
const inlineSubentryRegexCache = /* @__PURE__ */ new WeakMap();
const pairedFormsRegexCache = /* @__PURE__ */ new WeakMap();
2038
/** Strips trailing whitespace plus page-wrap noise (digits, quotes, brackets) from `text`. */
const trimTrailingPageWrapNoise$1 = (text) => {
  const withoutTrailingSpace = text.trimEnd();
  return withoutTrailingSpace.replace(TRAILING_PAGE_WRAP_NOISE$1, "");
};
/** True when the page content, after noise trimming, ends in a strong sentence terminator. */
const endsWithStrongSentenceTerminator$1 = (pageContent) => STRONG_SENTENCE_TERMINATORS$1.test(trimTrailingPageWrapNoise$1(pageContent));
2042
/**
 * Returns the last Arabic word that appears before `endExclusive` in `text`,
 * or "" when none is found. Only the final 256 characters are scanned to keep
 * this cheap on long pages; trailing noise and punctuation are stripped first.
 */
const extractLastArabicWord$1 = (text, endExclusive = text.length) => {
	const windowStart = Math.max(0, endExclusive - 256);
	const withoutTrailingDelimiters = trimTrailingPageWrapNoise$1(text.slice(windowStart, endExclusive)).replace(TRAILING_WORD_DELIMITERS$1, "");
	let lastMatch = "";
	// The regex is shared module state with the /g flag; reset lastIndex defensively
	// (matchAll works on a clone, but other callers may use .test/.exec).
	ARABIC_WORD_REGEX$1.lastIndex = 0;
	for (const match of withoutTrailingDelimiters.matchAll(ARABIC_WORD_REGEX$1)) lastMatch = match[0];
	return lastMatch;
};
2050
/**
 * Scans backwards from `endExclusive` and returns the first non-whitespace
 * character found, or "" when the prefix is empty/all whitespace.
 */
const previousNonWhitespaceChar = (text, endExclusive = text.length) => {
  let cursor = endExclusive - 1;
  while (cursor >= 0) {
    const current = text[cursor];
    if (current && !/\s/u.test(current)) return current;
    cursor -= 1;
  }
  return "";
};
2057
// Diacritic-insensitive equality/prefix checks (normalizeArabicForComparison is
// defined elsewhere in this module).
const normalizedEquals = (left, right) => normalizeArabicForComparison(left) === normalizeArabicForComparison(right);
const normalizedStartsWith = (text, prefix) => normalizeArabicForComparison(text).startsWith(normalizeArabicForComparison(prefix));
// Normalizes a stop-lemma for set lookups: normalize the Arabic, then strip
// leading and trailing punctuation/quotes/brackets.
const normalizeStopLemma = (text) => normalizeArabicForComparison(text).replace(/^[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+/gu, "").replace(/[\s:؛،,.!?؟()[\]{}«»"'“”‘’]+$/gu, "").trim();
// The last `maxChars` characters preceding `endExclusive` — the context window
// inspected by blocker checks.
const getTrailingContext = (text, endExclusive, maxChars = 240) => text.slice(Math.max(0, endExclusive - maxChars), endExclusive);
/**
 * True when `text` equals `prefix`, or starts with `prefix` and the character
 * that follows is a delimiter — so a gate word matches a heading but not a
 * longer word that merely begins with the same letters.
 */
const isDelimitedPrefixMatch = (text, prefix) => {
	if (text === prefix) return true;
	if (!text.startsWith(prefix)) return false;
	const nextChar = text[prefix.length];
	return nextChar === void 0 || GATE_DELIMITER_RE.test(nextChar);
};
2067
/**
 * Builds one context record per page: its boundary, normalized content, index,
 * and pre-split lines. Throws when `normalizedPages` or `pageMap.boundaries`
 * do not line up 1:1 with `pages`.
 * @param pages - Raw page objects
 * @param pageMap - Must carry one boundary per page
 * @param normalizedPages - Optional pre-normalized content (falls back to normalizeLineEndings)
 */
const createPageContexts = (pages, pageMap, normalizedPages) => {
	if (normalizedPages && normalizedPages.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} normalized pages, received ${normalizedPages.length}`);
	if (pageMap.boundaries.length !== pages.length) throw new Error(`Dictionary runtime expected ${pages.length} page boundaries, received ${pageMap.boundaries.length}`);
	const contexts = [];
	for (let index = 0; index < pages.length; index++) {
		const page = pages[index];
		const boundary = pageMap.boundaries[index];
		if (!page || !boundary) throw new Error(`Dictionary runtime encountered a missing page or boundary at index ${index}`);
		const content = normalizedPages?.[index] ?? normalizeLineEndings(page.content);
		contexts.push({
			boundary,
			content,
			index,
			lines: buildPageLines(content),
			page
		});
	}
	return contexts;
};
// Normalizes text for intro-phrase comparison: Arabic normalization, then slashes,
// quotes and brackets become spaces, and whitespace collapses to single spaces.
const normalizeIntroContextText = (text) => normalizeArabicForComparison(text).replace(/[\\/]+/gu, " ").replace(/[«»"“”'‘’()[\]{}]+/gu, " ").replace(/\s+/gu, " ").trim();
// True when `candidate` starts (normalized) with any configured word.
const startsWithConfiguredWord = (words, candidate) => words.some((word) => normalizedStartsWith(candidate, word));
2088
/**
 * Splits page content into line records carrying 1-based line numbers and the
 * absolute start offset of each line within the page (each "\n" counts as one
 * character when advancing the offset).
 */
const buildPageLines = (content) => {
  let cursor = 0;
  return content.split("\n").map((text, position) => {
    const record = {
      lineNumber: position + 1,
      start: cursor,
      text
    };
    cursor += text.length + 1;
    return record;
  });
};
2103
/**
 * Tests a heading against a zone gate. "headingText" gates compare against the
 * gate's `match` string (optionally fuzzy via Arabic normalization, requiring a
 * delimited prefix); token gates compare against the mapped Arabic heading word.
 */
const headingMatchesGate = (headingText, gate) => {
	if (gate.use === "headingText") {
		const useFuzzy = gate.fuzzy ?? false;
		const source = useFuzzy ? normalizeArabicForComparison(headingText) : headingText.trim();
		const match = useFuzzy ? normalizeArabicForComparison(gate.match) : gate.match.trim();
		// An empty match string never gates.
		return !!match && isDelimitedPrefixMatch(source, match);
	}
	return normalizedStartsWith(headingText, GATE_TOKEN_MAP[gate.token]);
};
// True when any "## " heading line on the page matches any of the given gates.
const pageMatchesAnyGate = (page, gates) => page.lines.some((line) => {
	const trimmed = line.text.trim();
	if (!trimmed.startsWith(HEADING_PREFIX)) return false;
	const headingText = trimmed.replace(/^##\s+/u, "").trim();
	return gates.some((gate) => headingMatchesGate(headingText, gate));
});
2118
/**
 * True when `pageId` falls inside the zone's optional [minPageId, maxPageId]
 * window; an absent bound never excludes.
 */
const pageWithinZoneBounds = (zone, pageId) => {
  const { minPageId, maxPageId } = zone.when ?? {};
  const belowMin = minPageId !== undefined && pageId < minPageId;
  const aboveMax = maxPageId !== undefined && pageId > maxPageId;
  return !belowMin && !aboveMax;
};
2123
/**
 * Finds the first in-bounds page whose headings satisfy any of the zone's
 * `activateAfter` gates; returns its page id, or null if no page activates.
 */
const findActivationPageId = (zone, pages) => {
	for (const page of pages) {
		if (!pageWithinZoneBounds(zone, page.page.id)) continue;
		if (pageMatchesAnyGate(page, zone.when?.activateAfter ?? [])) return page.page.id;
	}
	return null;
};
/**
 * Maps each zone name to the page id where it activates. Zones without an
 * `activateAfter` gate map to null (they are never gate-blocked — see
 * pageMatchesZone, which only consults this map for gated zones).
 */
const createZoneActivationMap = (profile, pages) => {
	const activation = /* @__PURE__ */ new Map();
	for (const zone of profile.zones) {
		if (!zone.when?.activateAfter?.length) {
			activation.set(zone.name, null);
			continue;
		}
		activation.set(zone.name, findActivationPageId(zone, pages));
	}
	return activation;
};
2141
/**
 * True when `pageId` is within the zone's id bounds and, for gate-activated
 * zones, at or after the zone's recorded activation page.
 */
const pageMatchesZone = (zone, activationMap, pageId) => {
  const when = zone.when;
  if (when?.minPageId !== undefined && pageId < when.minPageId) return false;
  if (when?.maxPageId !== undefined && pageId > when.maxPageId) return false;
  const isGated = (when?.activateAfter?.length ?? 0) > 0;
  if (!isGated) return true;
  const activatedAt = activationMap.get(zone.name);
  // Null/undefined means the gate never fired, so the zone never matches.
  return activatedAt != null && pageId >= activatedAt;
};
2148
/**
 * Returns the LAST zone in profile order that matches `pageId` (later zones
 * override earlier ones), or null when none matches.
 */
const resolveActiveZone = (profile, activationMap, pageId) => profile.zones.reduce((active, zone) => pageMatchesZone(zone, activationMap, pageId) ? zone : active, null);
2153
/**
 * Builds a heading-family candidate from a "## " line, or returns null when the
 * family does not accept this heading class, the heading is a disallowed
 * single-letter entry, or the next line starts with a disallowed colon.
 */
const createHeadingCandidate = (pageStartOffset, line, nextLine, family, headingClass) => {
  if (!family.classes.includes(headingClass)) return null;
  const trimmedLine = line.text.trim();
  // Drop the "## " prefix (3 chars) to get the heading text itself.
  const headingText = trimmedLine.slice(3).trim();
  const isEntry = headingClass === "entry";
  if (isEntry && !family.allowSingleLetter && headingText.length <= 1) return null;
  if (isEntry && !family.allowNextLineColon && nextLine?.text.trimStart().startsWith(":")) return null;
  return {
    absoluteIndex: pageStartOffset + line.start,
    contentStartOffset: 3,
    family: "heading",
    headingClass,
    kind: family.emit,
    lemma: family.emit === "entry" ? headingText : void 0,
    lineNumber: line.lineNumber,
    localIndex: line.start,
    probeText: trimmedLine,
    text: trimmedLine
  };
};
2171
// Optional second-word suffix for multi-word lemma patterns.
const optionalSecondWord = (allowMultiWord) => allowMultiWord ? `(?:\\s+${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})?` : "";
// One (or two) Arabic words wrapped in the given open/close delimiters.
const wrappedWordPattern = (open, close, allowMultiWord) => `${open}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}${close}`;
// One (or two) Arabic words with no wrapping delimiters.
const bareWordPattern = (allowMultiWord) => `${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}${optionalSecondWord(allowMultiWord)}`;
// A status line: a code sequence or comma-separated word list followed by an
// optional colon and a used/unused status marker. Such lines are not entries.
const STATUS_LINE_RE = new RegExp(`^(?:${CODE_LINE_PATTERN}|${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:\\s*[،,]\\s*${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)\\s*:?[\\s]*${STATUS_TAIL_PATTERN}(?=$|[.،,:؛\\s])`, "u");
/**
 * Compiles (and caches per family object) the "<lemma>:" line-entry regex for a
 * family, honoring its wrapper style, multi-word allowance, and colon spacing.
 */
const createLineEntryRegex = (family) => {
	const cached = lineEntryRegexCache.get(family);
	if (cached) return cached;
	// Wrapper selection: parentheses, brackets, curly braces, any of the three, or bare.
	const wrapperPattern = family.wrappers === "parentheses" ? wrappedWordPattern("\\(", "\\)", family.allowMultiWord) : family.wrappers === "brackets" ? wrappedWordPattern("\\[", "\\]", family.allowMultiWord) : family.wrappers === "curly" ? wrappedWordPattern("\\{", "\\}", family.allowMultiWord) : family.wrappers === "any" ? `(?:${wrappedWordPattern("\\(", "\\)", family.allowMultiWord)}|${wrappedWordPattern("\\[", "\\]", family.allowMultiWord)}|${wrappedWordPattern("\\{", "\\}", family.allowMultiWord)})` : bareWordPattern(family.allowMultiWord);
	const colonSpacing = family.allowWhitespaceBeforeColon ? "\\s*:" : ":";
	const regex = new RegExp(`^(?<lemma>${wrapperPattern})${colonSpacing}`, "u");
	lineEntryRegexCache.set(family, regex);
	return regex;
};
/**
 * Emits an entry candidate for a "<lemma>:" line, unless the line is a status
 * line. The lemma is unwrapped from any surrounding brackets before use.
 */
const collectLineEntryCandidates = (pageStartOffset, line, family) => {
	const trimmed = line.text.trim();
	if (STATUS_LINE_RE.test(trimmed)) return [];
	const match = trimmed.match(createLineEntryRegex(family));
	if (!match?.groups?.lemma) return [];
	return [{
		absoluteIndex: pageStartOffset + line.start,
		family: "lineEntry",
		kind: "entry",
		lemma: match.groups.lemma.replace(/^[[{(]+|[\])}]+$/gu, "").trim(),
		lineNumber: line.lineNumber,
		localIndex: line.start,
		probeText: trimmed,
		text: trimmed
	}];
};
2200
/**
 * Finds mid-line "<prefix><word>:" subentries (e.g. conjunction-prefixed forms)
 * and emits one entry candidate per match. The compiled regex is cached per
 * family object; when `stripPrefixesFromLemma` is set, the configured prefix is
 * removed from the emitted lemma.
 */
const collectInlineSubentryCandidates = (pageStartOffset, line, family) => {
	const cached = inlineSubentryRegexCache.get(family);
	// Default prefix is the conjunction "و" when the family configures none.
	const prefixes = family.prefixes.length > 0 ? family.prefixes.map(escapeRegex).join("|") : escapeRegex("و");
	const regex = cached ?? new RegExp(`(^|[\\s،؛,:.])(?<lemma>(?:${prefixes})${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})\\s*:`, "gu");
	if (!cached) inlineSubentryRegexCache.set(family, regex);
	const candidates = [];
	for (const match of line.text.matchAll(regex)) {
		if (!match.groups?.lemma || match.index === void 0) continue;
		// Locate the lemma within the full match (which may include a leading delimiter).
		const lemmaIndex = match[0].indexOf(match.groups.lemma);
		if (lemmaIndex < 0) continue;
		const candidateStart = match.index + lemmaIndex;
		const lemma = family.stripPrefixesFromLemma ? match.groups.lemma.replace(new RegExp(`^(?:${prefixes})`, "u"), "") : match.groups.lemma;
		candidates.push({
			absoluteIndex: pageStartOffset + line.start + candidateStart,
			family: "inlineSubentry",
			kind: "entry",
			lemma,
			lineNumber: line.lineNumber,
			localIndex: line.start + candidateStart,
			probeText: line.text.slice(candidateStart).trimStart(),
			text: line.text.trim()
		});
	}
	return candidates;
};
// A line whose entire content is a root-letter code sequence.
const CODE_CORE_RE = new RegExp(`^${CODE_LINE_PATTERN}$`, "u");
// Optional trailing ": <status> …" suffix stripped before code-line matching.
const STATUS_SUFFIX_RE = new RegExp(`(?:\\s*:?[\\s]*${STATUS_TAIL_PATTERN}.*)?$`, "u");
2227
/**
 * Parses text wrapped in a leading "(" or "[" and a trailing ")" or "]".
 * Returns { open, close, inner (trimmed), paired } — `paired` is true only for
 * a matched "(…)" or "[…]" pair — or null when the text is not wrapped.
 */
const parseWrappedCode = (text) => {
  const match = /^(?<open>[[(])(?<inner>.+)(?<close>[\])])$/u.exec(text);
  const groups = match?.groups;
  if (!groups?.inner || !groups.open || !groups.close) return null;
  const { open, close } = groups;
  const isMatchedPair = (open === "(" && close === ")") || (open === "[" && close === "]");
  return {
    close,
    inner: groups.inner.trim(),
    open,
    paired: isMatchedPair
  };
};
2237
/**
 * Emits a marker candidate for a line whose core (after stripping any status
 * suffix and optional wrapping brackets) is a root-letter code sequence, subject
 * to the family's wrapper policy: "either" accepts both, "none" requires bare,
 * "paired" requires a matched (…)/[…] pair, otherwise a mismatched pair.
 */
const collectCodeLineCandidates = (pageStartOffset, line, family) => {
	const trimmed = line.text.trim();
	const bare = trimmed.replace(STATUS_SUFFIX_RE, "").trim();
	const wrapped = parseWrappedCode(bare);
	const inner = wrapped?.inner ?? bare;
	if (!CODE_CORE_RE.test(inner)) return [];
	if (!(family.wrappers === "either" ? true : family.wrappers === "none" ? wrapped === null : family.wrappers === "paired" ? wrapped?.paired === true : wrapped !== null && !wrapped.paired)) return [];
	return [{
		absoluteIndex: pageStartOffset + line.start,
		family: "codeLine",
		kind: "marker",
		lemma: inner,
		lineNumber: line.lineNumber,
		localIndex: line.start,
		probeText: trimmed,
		text: trimmed
	}];
};
/**
 * Emits a candidate for a line of two-or-more forms joined by the family's
 * separator (space or comma) and followed by a colon — optionally requiring a
 * used/unused status tail. The compiled regex is cached per family object.
 */
const collectPairedFormsCandidates = (pageStartOffset, line, family) => {
	const cached = pairedFormsRegexCache.get(family);
	const separator = family.separator === "space" ? "\\s+" : "\\s*[،,]\\s*";
	const statusTail = family.requireStatusTail ? "\\s*:\\s*(?:مستعمل|مستعملة|مستعملان|مهمل|مهملة).*" : "\\s*:";
	const regex = cached ?? new RegExp(`^(?<forms>${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN}(?:${separator}${ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN})+)${statusTail}`, "u");
	if (!cached) pairedFormsRegexCache.set(family, regex);
	const match = line.text.trim().match(regex);
	if (!match?.groups?.forms) return [];
	return [{
		absoluteIndex: pageStartOffset + line.start,
		family: "pairedForms",
		kind: family.emit,
		// Only entry-emitting families carry the joined forms as a lemma.
		lemma: family.emit === "entry" ? match.groups.forms : void 0,
		lineNumber: line.lineNumber,
		localIndex: line.start,
		probeText: line.text.trim(),
		text: line.text.trim()
	}];
};
2274
// A blocker with no appliesTo list applies to every family.
const blockerApplies = (blocker, family) => !blocker.appliesTo || blocker.appliesTo.includes(family);
// True when the candidate's probe text begins with a known intro phrase.
const isIntroCandidate = (text) => {
	const normalized = normalizeIntroContextText(text);
	return INTRO_PHRASES.some((phrase) => normalized.startsWith(normalizeArabicForComparison(phrase)));
};
/**
 * True when `text` (ignoring trailing punctuation) ends with an intro phrase,
 * unless it ends in a strong sentence terminator — a terminated sentence cannot
 * be introducing what follows.
 */
const endsWithIntroPhrase = (text) => {
	const trimmed = text.trimEnd();
	if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
	const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
	return INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)));
};
/**
 * Broader variant of endsWithIntroPhrase: also checks the tail-phrase list and
 * the tail regex patterns, so citation context several words back still counts.
 */
const endsWithIntroContext = (text) => {
	const trimmed = text.trimEnd();
	if (STRONG_SENTENCE_TERMINATORS$1.test(trimmed)) return false;
	const normalized = normalizeIntroContextText(trimmed).trimEnd().replace(/[:؛،,.!?؟]+$/u, "").trimEnd();
	if (!normalized) return false;
	if (INTRO_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
	if (INTRO_TAIL_PHRASES.some((phrase) => normalized.endsWith(normalizeArabicForComparison(phrase)))) return true;
	return INTRO_TAIL_PATTERNS.some((pattern) => pattern.test(normalized));
};
2294
/**
 * True when the candidate text is an authority citation rather than an entry:
 * its pre-colon head is a known lexicographer name, it matches AUTHORITY_RE, or
 * (in "aggressive" precision) it merely starts with a short list of famous names.
 */
const isAuthorityCandidate = (text, precision) => {
	const head = normalizeStopLemma(text.split(":", 1)[0] ?? text);
	if (head && AUTHORITY_HEAD_WORDS.some((term) => normalizeStopLemma(term) === head)) return true;
	if (AUTHORITY_RE.test(text)) return true;
	if (precision === "aggressive") {
		const normalized = normalizeIntroContextText(text);
		return [
			"الليث",
			"الأزهري",
			"الأصمعي",
			"الجوهري",
			"الفراء",
			"ثعلب",
			"شمر"
		].some((term) => normalized.startsWith(normalizeArabicForComparison(term)));
	}
	return false;
};
/**
 * True when a comma-separated lemma's tail (everything after the first form)
 * starts with a qualifier word ("i.e.", "it is said" …) — a gloss, not a form.
 */
const hasBlockedQualifierTail = (lemma) => {
	const parts = lemma.split(/[،,]/u).map((part) => part.trim()).filter(Boolean);
	if (parts.length < 2) return false;
	return startsWithConfiguredWord(QUALIFIER_TAIL_PREFIXES, parts.slice(1).join(" "));
};
/**
 * Heuristics for structural lines leaking through as entries: non-Arabic lead
 * characters or template braces in the lemma, a bare code sequence standing as
 * its own heading, over-long multi-word lemmas, structural prefixes, "N - (…)"
 * lines, and bracketed lines that mention section keywords.
 */
const looksLikeStructuralLeak = (candidate) => {
	if (!candidate.lemma) return false;
	const normalizedLemma = normalizeArabicForComparison(candidate.lemma);
	if (candidate.kind === "entry" && (/^[^\p{Script=Arabic}\d]+/u.test(candidate.lemma) || candidate.lemma.includes("{") || candidate.lemma.includes("}") || candidate.lemma.includes("##"))) return true;
	if (candidate.kind === "entry" && BARE_CODE_LEMMA_RE.test(candidate.lemma) && (candidate.text === candidate.lemma || candidate.text === `${HEADING_PREFIX}${candidate.lemma}` || candidate.text.startsWith(`${HEADING_PREFIX}${candidate.lemma}`) || candidate.text.startsWith(`${candidate.lemma}\n${HEADING_PREFIX}`))) return true;
	// pairedForms legitimately joins several forms; everything else caps at 4 words.
	if (candidate.family !== "pairedForms" && candidate.lemma.split(/\s+/u).filter(Boolean).length > 4) return true;
	if (startsWithConfiguredWord(STRUCTURAL_LEMMA_PREFIXES, candidate.lemma)) return true;
	if (normalizedLemma.startsWith(normalizeArabicForComparison("ولل"))) return true;
	const structuralText = candidate.text.startsWith(HEADING_PREFIX) ? candidate.text.slice(3).trim() : candidate.text;
	if (/^[\d\u0660-\u0669]+\s*-\s*\([^)]+\)(?:\s+##.*)?$/u.test(structuralText)) return true;
	const normalizedText = normalizeArabicForComparison(structuralText);
	// Structural line shapes only reject when a section keyword is also present.
	if (STRUCTURAL_LINE_PATTERNS.some((pattern) => pattern.test(structuralText))) return STRUCTURAL_LINE_KEYWORDS.some((keyword) => normalizedText.includes(normalizeArabicForComparison(keyword)));
	return false;
};
2331
/** Increments the occurrence count for `lemma` in `map`; no-op for empty/undefined lemmas. */
const countLemma = (map, lemma) => {
  if (!lemma) return;
  const previous = map.get(lemma) ?? 0;
  map.set(lemma, previous + 1);
};
/** Fresh zeroed counters for each split-point kind. */
const createInitialKindCounts = () => ({
  chapter: 0,
  entry: 0,
  marker: 0
});
2340
/** Fresh zeroed counters for each blocker-rejection reason. */
const createInitialReasonCounts = () => Object.fromEntries([
  "authorityIntro",
  "intro",
  "pageContinuation",
  "previousChar",
  "previousWord",
  "qualifierTail",
  "stopLemma",
  "structuralLeak"
].map((reason) => [reason, 0]));
/** Fresh accepted/rejected tallies for every candidate family. */
const createInitialFamilyCounts = () => Object.fromEntries([
  "codeLine",
  "heading",
  "inlineSubentry",
  "lineEntry",
  "pairedForms"
].map((familyName) => [familyName, {
  accepted: 0,
  rejected: 0
}]));
2372
// Intro blocker: reject when the candidate itself is an intro, or the text just
// before it ends with an intro phrase or broader intro context.
const rejectsViaIntroBlocker = (candidate, blocker, localBeforeCandidate) => {
	if (blocker.use !== "intro") return false;
	return isIntroCandidate(candidate.probeText) || endsWithIntroPhrase(localBeforeCandidate) || endsWithIntroContext(localBeforeCandidate);
};
// Authority blocker: reject citations of known lexicographers.
const rejectsViaAuthorityBlocker = (candidate, blocker) => blocker.use === "authorityIntro" && isAuthorityCandidate(candidate.probeText, blocker.precision);
// Stop-lemma blocker: reject lemmas found in the blocker's normalized word set.
const rejectsViaStopLemmaBlocker = (candidate, blocker) => blocker.use === "stopLemma" && !!candidate.lemma && !!normalizeStopLemma(candidate.lemma) && blocker.normalizedWords.has(normalizeStopLemma(candidate.lemma));
// Previous-word blocker: reject when the last Arabic word before the candidate
// is in the blocker's word set.
const rejectsViaPreviousWordBlocker = (pageContent, localIndex, blocker) => {
	if (blocker.use !== "previousWord") return false;
	const lastWord = extractLastArabicWord$1(pageContent, localIndex);
	return !!lastWord && blocker.normalizedWords.has(normalizeArabicForComparison(lastWord));
};
// Previous-char blocker: reject when the nearest non-whitespace character before
// the candidate is in the blocker's char set.
const rejectsViaPreviousCharBlocker = (pageContent, localIndex, blocker) => {
	if (blocker.use !== "previousChar") return false;
	const previousChar = previousNonWhitespaceChar(pageContent, localIndex);
	return !!previousChar && blocker.charSet.has(previousChar);
};
/**
 * Page-continuation blocker: only applies to a candidate at the very top of a
 * non-first page whose previous page did NOT end a sentence. Rejects when the
 * previous page ends with a continuation word, OR (regardless of that word —
 * note the && binds tighter than the || chain) ends in intro context, OR the
 * candidate itself is an intro/authority opening.
 */
const rejectsViaPageContinuationBlocker = (candidate, blocker, localBeforeCandidate, pageIndex, pages) => {
	if (blocker.use !== "pageContinuation") return false;
	if (!(localBeforeCandidate.trim().length === 0) || pageIndex === 0) return false;
	const previousPage = pages[pageIndex - 1];
	if (!previousPage || endsWithStrongSentenceTerminator$1(previousPage.content)) return false;
	const previousWord = extractLastArabicWord$1(previousPage.content);
	return !!previousWord && CONTINUATION_PREV_WORDS.some((word) => normalizedEquals(word, previousWord)) || endsWithIntroContext(previousPage.content) || isIntroCandidate(candidate.probeText) || isAuthorityCandidate(candidate.probeText, "high");
};
/**
 * Evaluates a single blocker against a candidate and returns the rejection
 * reason label, or null when the blocker does not fire. Checks run in a fixed
 * priority order, so the first matching reason wins.
 */
const getBlockerRejectionReason = (blocker, candidate, localBeforeCandidate, pageContent, pageIndex, pages) => {
	if (rejectsViaIntroBlocker(candidate, blocker, localBeforeCandidate)) return "intro";
	if (rejectsViaAuthorityBlocker(candidate, blocker)) return "authorityIntro";
	if (rejectsViaStopLemmaBlocker(candidate, blocker)) return "stopLemma";
	if (rejectsViaPreviousWordBlocker(pageContent, candidate.localIndex, blocker)) return "previousWord";
	if (rejectsViaPreviousCharBlocker(pageContent, candidate.localIndex, blocker)) return "previousChar";
	if (rejectsViaPageContinuationBlocker(candidate, blocker, localBeforeCandidate, pageIndex, pages)) return "pageContinuation";
	return null;
};
/**
 * Full rejection check for a candidate: built-in qualifier-tail and structural
 * screens run first, then each of the zone's applicable blockers. Returns
 * { reason } for the first rejection, or null when the candidate is accepted.
 */
const getCandidateRejection = (candidate, zone, pageContext, pages) => {
	const hasQualifierTail = hasBlockedQualifierTail(candidate.lemma ?? "");
	if (hasQualifierTail || looksLikeStructuralLeak(candidate)) return { reason: hasQualifierTail ? "qualifierTail" : "structuralLeak" };
	const localBeforeCandidate = getTrailingContext(pageContext.content, candidate.localIndex);
	for (const blocker of zone.blockers) {
		if (!blockerApplies(blocker, candidate.family)) continue;
		const reason = getBlockerRejectionReason(blocker, candidate, localBeforeCandidate, pageContext.content, pageContext.index, pages);
		if (reason) return { reason };
	}
	return null;
};
// Boolean wrapper used by the split-point collector (reason is discarded).
const shouldRejectCandidate = (candidate, zone, pageContext, pages) => {
	return getCandidateRejection(candidate, zone, pageContext, pages) !== null;
};
2419
/**
 * Heading-family collection: classify a "## " line and build a candidate from
 * it; "noise" headings and family-rejected classes produce nothing.
 */
const collectHeadingCandidates = (pageStartOffset, line, nextLine, family, trimmed) => {
	if (!trimmed.startsWith(HEADING_PREFIX)) return [];
	const headingClass = classifyDictionaryHeading(trimmed);
	if (headingClass === "noise") return [];
	const candidate = createHeadingCandidate(pageStartOffset, line, nextLine, family, headingClass);
	return candidate ? [candidate] : [];
};
// Dispatches a line to the collector for the family's `use` variant; unknown
// variants reach the exhaustiveness guard.
const collectCandidatesForFamily = (pageStartOffset, line, nextLine, family, trimmed) => {
	switch (family.use) {
		case "heading": return collectHeadingCandidates(pageStartOffset, line, nextLine, family, trimmed);
		case "lineEntry": return collectLineEntryCandidates(pageStartOffset, line, family);
		case "inlineSubentry": return collectInlineSubentryCandidates(pageStartOffset, line, family);
		case "codeLine": return collectCodeLineCandidates(pageStartOffset, line, family);
		case "pairedForms": return collectPairedFormsCandidates(pageStartOffset, line, family);
		default: return assertNever$1(family);
	}
};
// Runs every family in the zone over one line; blank lines yield nothing.
const collectCandidatesForLine = (pageStartOffset, line, nextLine, zone) => {
	const trimmed = line.text.trim();
	const candidates = [];
	if (!trimmed) return candidates;
	for (const family of zone.families) candidates.push(...collectCandidatesForFamily(pageStartOffset, line, nextLine, family, trimmed));
	return candidates;
};
2443
/**
 * Converts an accepted candidate into a split point. Meta always carries the
 * kind (plus the lemma when present); when `debugMetaKey` is given, dictionary
 * debug info (family, optional headingClass) is merged in under that key.
 */
const candidateToSplitPoint = (candidate, debugMetaKey) => {
	const baseMeta = candidate.lemma ? {
		kind: candidate.kind,
		lemma: candidate.lemma
	} : { kind: candidate.kind };
	const meta = debugMetaKey === void 0 ? baseMeta : mergeDebugIntoMeta(baseMeta, debugMetaKey, { dictionary: {
		family: candidate.family,
		...candidate.headingClass ? { headingClass: candidate.headingClass } : {}
	} });
	return {
		contentStartOffset: candidate.contentStartOffset,
		index: candidate.absoluteIndex,
		meta
	};
};
2458
/** Appends `sample` to the buffer unless it already holds `sampleLimit` entries. */
const pushDiagnosticSample = (samples, sampleLimit, sample) => {
  const hasCapacity = samples.length < sampleLimit;
  if (hasCapacity) samples.push(sample);
};
2461
/**
 * Collects dictionary-profile split points using the pages-only markdown surface.
 * For each page: resolve the active zone, collect candidates per line via the
 * zone's families, drop any candidate a blocker rejects, and convert the rest
 * into split points (optionally tagged with debug metadata under `debugMetaKey`).
 */
const collectDictionarySplitPoints = (pages, profile, pageMap, normalizedPages, logger, debugMetaKey) => {
	const normalizedProfile = normalizeDictionaryProfile(profile);
	const pageContexts = createPageContexts(pages, pageMap, normalizedPages);
	const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
	const splitPoints = [];
	logger?.debug?.("[dictionary] collecting split points", {
		pageCount: pages.length,
		zoneCount: normalizedProfile.zones.length
	});
	for (const pageContext of pageContexts) {
		const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
		// Pages outside every zone produce no split points.
		if (!zone) continue;
		for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
			const line = pageContext.lines[lineIndex];
			const nextLine = pageContext.lines[lineIndex + 1];
			const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
			for (const candidate of candidates) {
				if (shouldRejectCandidate(candidate, zone, pageContext, pageContexts)) continue;
				splitPoints.push(candidateToSplitPoint(candidate, debugMetaKey));
			}
		}
	}
	logger?.debug?.("[dictionary] collected split points", { splitPointCount: splitPoints.length });
	return splitPoints;
};
2489
/**
 * Collects authoring diagnostics for a dictionary profile without creating segments.
 *
 * This is useful when tuning blockers and family choices for a new dictionary.
 * Runs the same candidate-collection + rejection pipeline as
 * collectDictionarySplitPoints, but tallies accept/reject counts, per-family and
 * per-zone totals, rejected lemma frequencies, and up to `sampleLimit` samples.
 * @param pages - Raw pages ({ id, content })
 * @param profile - The dictionary profile to evaluate
 * @param options - `sampleLimit` caps collected samples (default 50)
 */
const diagnoseDictionaryProfile = (pages, profile, options = {}) => {
	const normalizedProfile = normalizeDictionaryProfile(profile);
	// Synthetic page map; boundaries/pageBreaks are filled in by the pass below.
	const pageMap = {
		boundaries: [],
		getId: (offset) => {
			for (const boundary of pageMap.boundaries) if (offset >= boundary.start && offset <= boundary.end) return boundary.id;
			return pageMap.boundaries.at(-1)?.id ?? 0;
		},
		pageBreaks: [],
		pageIds: pages.map((page) => page.id)
	};
	let offset = 0;
	// NOTE: the map() callback mutates pageMap as a side effect while producing the
	// normalized texts; boundaries are complete before createPageContexts validates them.
	const pageContexts = createPageContexts(pages, pageMap, pages.map((page, pageIndex) => {
		const normalized = normalizeLineEndings(page.content);
		pageMap.boundaries.push({
			end: offset + normalized.length,
			id: page.id,
			start: offset
		});
		if (pageIndex < pages.length - 1) {
			// +1 accounts for the "\n" joining consecutive pages.
			pageMap.pageBreaks.push(offset + normalized.length);
			offset += normalized.length + 1;
		} else offset += normalized.length;
		return normalized;
	}));
	const activationMap = createZoneActivationMap(normalizedProfile, pageContexts);
	const sampleLimit = options.sampleLimit ?? 50;
	const acceptedKinds = createInitialKindCounts();
	const blockerHits = createInitialReasonCounts();
	const familyCounts = createInitialFamilyCounts();
	const zoneCounts = {};
	const rejectedLemmaCounts = /* @__PURE__ */ new Map();
	const samples = [];
	let acceptedCount = 0;
	let rejectedCount = 0;
	for (const pageContext of pageContexts) {
		const zone = resolveActiveZone(normalizedProfile, activationMap, pageContext.page.id);
		if (!zone) continue;
		zoneCounts[zone.name] ??= {
			accepted: 0,
			rejected: 0
		};
		for (let lineIndex = 0; lineIndex < pageContext.lines.length; lineIndex++) {
			const line = pageContext.lines[lineIndex];
			const nextLine = pageContext.lines[lineIndex + 1];
			const candidates = collectCandidatesForLine(pageContext.boundary.start, line, nextLine, zone);
			for (const candidate of candidates) {
				const rejection = getCandidateRejection(candidate, zone, pageContext, pageContexts);
				const sampleBase = {
					absoluteIndex: candidate.absoluteIndex,
					family: candidate.family,
					kind: candidate.kind,
					lemma: candidate.lemma,
					line: candidate.lineNumber,
					pageId: pageContext.page.id,
					text: candidate.text,
					zone: zone.name
				};
				if (rejection) {
					rejectedCount += 1;
					blockerHits[rejection.reason] += 1;
					familyCounts[candidate.family].rejected += 1;
					zoneCounts[zone.name].rejected += 1;
					countLemma(rejectedLemmaCounts, candidate.lemma);
					pushDiagnosticSample(samples, sampleLimit, {
						...sampleBase,
						accepted: false,
						reason: rejection.reason
					});
					continue;
				}
				acceptedCount += 1;
				acceptedKinds[candidate.kind] += 1;
				familyCounts[candidate.family].accepted += 1;
				zoneCounts[zone.name].accepted += 1;
				pushDiagnosticSample(samples, sampleLimit, {
					...sampleBase,
					accepted: true
				});
			}
		}
	}
	// Most-frequent rejected lemmas first; ties break lexicographically.
	const rejectedLemmas = [...rejectedLemmaCounts.entries()].sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])).map(([lemma, count]) => ({
		count,
		lemma
	}));
	return {
		acceptedCount,
		acceptedKinds,
		blockerHits,
		familyCounts,
		pageCount: pages.length,
		rejectedCount,
		rejectedLemmas,
		samples,
		zoneCounts
	};
};
2592
+ //#endregion
1306
2593
  //#region src/optimization/optimize-rules.ts
1307
2594
  const MERGEABLE_KEYS = new Set([
1308
2595
  "lineStartsWith",
@@ -1319,11 +2606,17 @@ const getPatternArray = (rule, key) => {
1319
2606
  };
1320
2607
  const getPatternString = (rule, key) => {
1321
2608
  const value = rule[key];
1322
- return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : "";
2609
+ return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
1323
2610
  };
1324
2611
  const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
2612
+ const getDictionaryEntrySpecificityScore = (rule) => {
2613
+ if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return 0;
2614
+ const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
2615
+ return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
2616
+ };
1325
2617
  const getSpecificityScore = (rule) => {
1326
2618
  const key = getPatternKey(rule);
2619
+ if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
1327
2620
  return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
1328
2621
  };
1329
2622
  const createMergeKey = (rule) => {
@@ -1468,89 +2761,6 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
1468
2761
  }
1469
2762
  return result;
1470
2763
  };
1471
- //#endregion
1472
- //#region src/segmentation/arabic-dictionary-rule.ts
1473
- const uniqueNormalizedWords = (words) => {
1474
- const seen = /* @__PURE__ */ new Set();
1475
- const result = [];
1476
- for (const word of words) {
1477
- const normalized = normalizeArabicForComparison(word);
1478
- if (!normalized || seen.has(normalized)) continue;
1479
- seen.add(normalized);
1480
- result.push(normalized);
1481
- }
1482
- return result;
1483
- };
1484
- const buildStopAlternation = (stopWords) => {
1485
- const unique = uniqueNormalizedWords(stopWords);
1486
- if (unique.length === 0) return "";
1487
- return unique.map((word) => makeDiacriticInsensitive(word)).join("|");
1488
- };
1489
- const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
1490
- if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
1491
- const guardedUnit = `(?!(?:${stopwordBody})${allowCommaSeparated ? `(?:\\s*[،,]\\s*|${colonPattern})` : colonPattern})${unit}`;
1492
- return allowCommaSeparated ? `${guardedUnit}(?:\\s*[،,]\\s*${guardedUnit})*` : guardedUnit;
1493
- };
1494
- const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
1495
- const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
1496
- const withCapture = captureName ? `(?<${captureName}>${headwordBody})` : `(?:${headwordBody})`;
1497
- if (!allowParenthesized) return `${withCapture}${colon}`;
1498
- return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
1499
- };
1500
- /**
1501
- * Creates a reusable split rule for Arabic dictionary entries.
1502
- *
1503
- * The generated rule:
1504
- * - keeps the lemma marker in `segment.content`
1505
- * - stores the lemma in `segment.meta[captureName]`
1506
- * - matches root entries at true line/page starts
1507
- * - matches mid-line subentries conservatively when they begin with `و`
1508
- * - can optionally support parenthesized headwords like `(عنبر) :`
1509
- * - can optionally support comma-separated headword lists like `سبد، دبس:`
1510
- *
1511
- * @example
1512
- * createArabicDictionaryEntryRule({
1513
- * stopWords: ['وقيل', 'ويقال', 'قال'],
1514
- * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1515
- * })
1516
- *
1517
- * @example
1518
- * createArabicDictionaryEntryRule({
1519
- * allowParenthesized: true,
1520
- * allowWhitespaceBeforeColon: true,
1521
- * allowCommaSeparated: true,
1522
- * stopWords: ['الليث', 'العجاج'],
1523
- * })
1524
- */
1525
- const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
1526
- if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
1527
- if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
1528
- if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
1529
- const zeroWidthPrefix = "[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*";
1530
- const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
1531
- const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
1532
- const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
1533
- const stopAlternation = buildStopAlternation(stopWords);
1534
- const lemmaBody = buildHeadwordBody({
1535
- allowCommaSeparated,
1536
- colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
1537
- stopAlternation,
1538
- stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
1539
- unit: lemmaUnit
1540
- });
1541
- return {
1542
- meta,
1543
- pageStartPrevWordStoplist,
1544
- regex: `(?:${`(?:(?<=^)|(?<=\\n))${zeroWidthPrefix}`}|${allowParenthesized ? `(?<=\\s)(?=(?:\\(\\s*)?${wawWithMarks}(?:${alWithMarks})?)` : `(?<=\\s)(?=${wawWithMarks}(?:${alWithMarks})?)`})` + buildBalancedMarker({
1545
- allowParenthesized,
1546
- allowWhitespaceBeforeColon,
1547
- captureName,
1548
- headwordBody: lemmaBody
1549
- }),
1550
- samePagePrevWordStoplist,
1551
- split: "at"
1552
- };
1553
- };
1554
2764
  const WINDOW_PREFIX_LENGTHS = [
1555
2765
  80,
1556
2766
  60,
@@ -2458,106 +3668,37 @@ const findSafeBreakPosition = (content, targetPosition, lookbackChars = 100) =>
2458
3668
  return -1;
2459
3669
  };
2460
3670
  //#endregion
2461
- //#region src/segmentation/debug-meta.ts
2462
- const resolveDebugConfig = (debug) => {
2463
- if (debug === true) return {
2464
- includeBreakpoint: true,
2465
- includeRule: true,
2466
- metaKey: "_flappa"
2467
- };
2468
- if (!debug || typeof debug !== "object") return null;
2469
- const { metaKey, include } = debug;
2470
- const includeRule = Array.isArray(include) ? include.includes("rule") : true;
2471
- return {
2472
- includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
2473
- includeRule,
2474
- metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
2475
- };
2476
- };
2477
- const getRulePatternType = (rule) => {
2478
- return PATTERN_TYPE_KEYS.find((key) => key in rule) ?? "regex";
2479
- };
2480
- const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
2481
- const mergeDebugIntoMeta = (meta, metaKey, patch) => {
2482
- const out = meta ? { ...meta } : {};
2483
- const existing = out[metaKey];
2484
- out[metaKey] = {
2485
- ...isPlainObject(existing) ? existing : {},
2486
- ...patch
2487
- };
2488
- return out;
2489
- };
2490
- const buildRuleDebugPatch = (ruleIndex, rule, wordIndex) => {
2491
- const patternType = getRulePatternType(rule);
2492
- const patterns = rule[patternType];
2493
- const word = wordIndex !== void 0 && Array.isArray(patterns) && patterns[wordIndex] !== void 0 ? patterns[wordIndex] : void 0;
2494
- return { rule: {
2495
- index: ruleIndex,
2496
- patternType,
2497
- ...wordIndex !== void 0 ? { wordIndex } : {},
2498
- ...word !== void 0 ? { word } : {}
2499
- } };
2500
- };
2501
- const buildBreakpointDebugPatch = (breakpointIndex, rule, wordIndex) => ({ breakpoint: {
2502
- index: breakpointIndex,
2503
- kind: rule.pattern === "" ? "pageBoundary" : rule.regex ? "regex" : "pattern",
2504
- pattern: rule.pattern ?? rule.regex,
2505
- ...wordIndex !== void 0 ? { wordIndex } : {},
2506
- ...wordIndex !== void 0 && rule.words ? { word: rule.words[wordIndex] } : {}
2507
- } });
2508
- /**
2509
- * Helper to format the debug info into a human-readable string.
2510
- * @param meta - The segment metadata object
2511
- * @param options - Formatting options
2512
- */
2513
- const formatRuleReason = (rule, concise) => {
2514
- const { index, patternType, wordIndex, word } = rule;
2515
- if (concise) return `Rule: ${word ? `"${word}"` : patternType}`;
2516
- const wordInfo = word ? ` (Matched: "${word}")` : "";
2517
- return `Rule #${index} (${patternType})${wordIndex !== void 0 ? ` [idx:${wordIndex}]` : ""}${wordInfo}`;
2518
- };
2519
- const formatBreakpointReason = (breakpoint, concise) => {
2520
- const { index, kind, pattern, wordIndex, word } = breakpoint;
2521
- if (kind === "pageBoundary") return concise ? "Breakpoint: <page-boundary>" : "Page Boundary (Fallback)";
2522
- if (concise) return `Breakpoint: ${word ? `"${word}"` : `"${pattern}"`}`;
2523
- if (word) return `Breakpoint #${index} (Words) [idx:${wordIndex}] - "${word}"`;
2524
- return `Breakpoint #${index} (${kind}) - "${pattern}"`;
2525
- };
2526
- const formatContentLengthReason = (split, concise) => {
2527
- const { maxContentLength, splitReason } = split;
2528
- if (concise) return `> ${maxContentLength} (${splitReason})`;
2529
- return `Safety Split (${splitReason}) > ${maxContentLength}`;
2530
- };
2531
- /**
2532
- * Helper to format the debug info into a human-readable string.
2533
- * @param meta - The segment metadata object
2534
- * @param options - Formatting options
2535
- */
2536
- const getDebugReason = (meta, options) => {
2537
- const debug = meta?._flappa;
2538
- if (!debug) return "-";
2539
- const concise = options?.concise;
2540
- if (debug.rule) return formatRuleReason(debug.rule, concise);
2541
- if (debug.breakpoint) return formatBreakpointReason(debug.breakpoint, concise);
2542
- if (debug.contentLengthSplit) return formatContentLengthReason(debug.contentLengthSplit, concise);
2543
- return "Unknown";
2544
- };
2545
- /**
2546
- * Convenience helper to get the formatted debug reason directly from a segment.
2547
- * @param segment - The segment object
2548
- * @param options - Formatting options
2549
- */
2550
- const getSegmentDebugReason = (segment, options) => {
2551
- return getDebugReason(segment.meta, options);
2552
- };
2553
- //#endregion
2554
3671
  //#region src/segmentation/pattern-validator.ts
2555
3672
  const KNOWN_TOKENS = new Set(getAvailableTokens());
2556
3673
  const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
2557
- const buildBareTokenRegex = () => {
3674
+ const BARE_TOKEN_REGEX = (() => {
2558
3675
  const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
2559
3676
  return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
3677
+ })();
3678
+ const createMalformedTokenIssue = (tokenLiteral, side) => {
3679
+ const token = tokenLiteral.split(":", 1)[0] || void 0;
3680
+ return {
3681
+ message: `Token "${tokenLiteral || "unknown"}" appears to be missing ${side} braces.`,
3682
+ suggestion: tokenLiteral ? `{{${tokenLiteral}}}` : void 0,
3683
+ token,
3684
+ type: "missing_braces"
3685
+ };
3686
+ };
3687
+ const detectMalformedLeftToken = (pattern) => {
3688
+ for (let index = 0; index < pattern.length - 1; index++) {
3689
+ if (pattern.slice(index, index + 2) !== "{{") continue;
3690
+ const closeIndex = pattern.indexOf("}}", index + 2);
3691
+ if (closeIndex === -1) return createMalformedTokenIssue(pattern.slice(index + 2).match(/^\w+(?::\w+)?/u)?.[0] ?? "", "closing");
3692
+ index = closeIndex + 1;
3693
+ }
2560
3694
  };
3695
+ const detectMalformedRightToken = (pattern) => {
3696
+ for (let index = 0; index < pattern.length - 1; index++) {
3697
+ if (pattern.slice(index, index + 2) !== "}}") continue;
3698
+ if (pattern.lastIndexOf("{{", index) === -1) return createMalformedTokenIssue(pattern.slice(0, index).match(/(\w+(?::\w+)?)$/u)?.[1] ?? "", "opening");
3699
+ }
3700
+ };
3701
+ const detectMalformedToken = (pattern) => detectMalformedLeftToken(pattern) ?? detectMalformedRightToken(pattern);
2561
3702
  /**
2562
3703
  * Validates a single pattern for common issues.
2563
3704
  */
@@ -2575,14 +3716,16 @@ const validatePattern = (pattern, seenPatterns) => {
2575
3716
  TOKEN_INSIDE_BRACES.lastIndex = 0;
2576
3717
  for (const match of pattern.matchAll(TOKEN_INSIDE_BRACES)) {
2577
3718
  const name = match[1];
2578
- if (!KNOWN_TOKENS.has(name)) return {
3719
+ if (name && !KNOWN_TOKENS.has(name)) return {
2579
3720
  message: `Unknown token: {{${name}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
2580
3721
  suggestion: "Check spelling or use a known token",
2581
3722
  token: name,
2582
3723
  type: "unknown_token"
2583
3724
  };
2584
3725
  }
2585
- for (const match of pattern.matchAll(buildBareTokenRegex())) {
3726
+ const malformed = detectMalformedToken(pattern);
3727
+ if (malformed) return malformed;
3728
+ for (const match of pattern.matchAll(BARE_TOKEN_REGEX)) {
2586
3729
  const [full, name] = match;
2587
3730
  const idx = match.index;
2588
3731
  if (pattern.slice(Math.max(0, idx - 2), idx) !== "{{" || pattern.slice(idx + full.length, idx + full.length + 2) !== "}}") return {
@@ -2609,14 +3752,14 @@ const applyRulePatternValidation = (result, key, patterns) => {
2609
3752
  return true;
2610
3753
  };
2611
3754
  const validateTemplateRule = (rule, result) => {
2612
- if (rule.template === void 0) return false;
3755
+ if (!("template" in rule)) return false;
2613
3756
  const issue = validatePattern(rule.template, /* @__PURE__ */ new Set());
2614
3757
  if (!issue) return false;
2615
3758
  result.template = issue;
2616
3759
  return true;
2617
3760
  };
2618
3761
  const validateRegexRule = (rule, result) => {
2619
- if (rule.regex === void 0) return false;
3762
+ if (!("regex" in rule)) return false;
2620
3763
  if (!rule.regex.trim()) {
2621
3764
  result.regex = {
2622
3765
  message: "Empty pattern is not allowed",
@@ -2636,6 +3779,39 @@ const validateRegexRule = (rule, result) => {
2636
3779
  return true;
2637
3780
  }
2638
3781
  };
3782
+ const invalidDictionaryEntryIssue = (message) => ({
3783
+ message,
3784
+ type: "invalid_option"
3785
+ });
3786
+ const addBooleanDictionaryEntryIssue = (issues, key, value) => {
3787
+ if (value !== void 0 && typeof value !== "boolean") issues[key] = invalidDictionaryEntryIssue(`${key} must be a boolean`);
3788
+ };
3789
+ const addCaptureNameIssue = (issues, captureName) => {
3790
+ if (captureName !== void 0 && !/^[A-Za-z_]\w*$/.test(captureName)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
3791
+ };
3792
+ const addMinLettersIssue = (issues, minLetters) => {
3793
+ if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
3794
+ };
3795
+ const addMaxLettersIssue = (issues, maxLetters, minLetters) => {
3796
+ const min = minLetters ?? 2;
3797
+ if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < min)) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${min}`);
3798
+ };
3799
+ const validateDictionaryEntryRule = (rule, result) => {
3800
+ if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
3801
+ const issues = {};
3802
+ const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
3803
+ if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
3804
+ addBooleanDictionaryEntryIssue(issues, "allowCommaSeparated", allowCommaSeparated);
3805
+ addBooleanDictionaryEntryIssue(issues, "allowParenthesized", allowParenthesized);
3806
+ addBooleanDictionaryEntryIssue(issues, "allowWhitespaceBeforeColon", allowWhitespaceBeforeColon);
3807
+ addBooleanDictionaryEntryIssue(issues, "midLineSubentries", midLineSubentries);
3808
+ addCaptureNameIssue(issues, captureName);
3809
+ addMinLettersIssue(issues, minLetters);
3810
+ addMaxLettersIssue(issues, maxLetters, minLetters);
3811
+ if (Object.keys(issues).length === 0) return false;
3812
+ result.dictionaryEntry = issues;
3813
+ return true;
3814
+ };
2639
3815
  const formatValidationIssue = (_type, issue, loc) => {
2640
3816
  if (!issue) return null;
2641
3817
  if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
@@ -2665,12 +3841,13 @@ const formatValidationIssue = (_type, issue, loc) => {
2665
3841
  */
2666
3842
  const validateRules = (rules) => rules.map((rule) => {
2667
3843
  const result = {};
2668
- const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", rule.lineStartsWith);
2669
- const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", rule.lineStartsAfter);
2670
- const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
3844
+ const startsWithIssues = applyRulePatternValidation(result, "lineStartsWith", "lineStartsWith" in rule ? rule.lineStartsWith : void 0);
3845
+ const startsAfterIssues = applyRulePatternValidation(result, "lineStartsAfter", "lineStartsAfter" in rule ? rule.lineStartsAfter : void 0);
3846
+ const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", "lineEndsWith" in rule ? rule.lineEndsWith : void 0);
2671
3847
  const templateIssues = validateTemplateRule(rule, result);
2672
3848
  const regexIssues = validateRegexRule(rule, result);
2673
- return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues ? result : void 0;
3849
+ const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
3850
+ return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
2674
3851
  });
2675
3852
  /**
2676
3853
  * Formats a validation result array into a list of human-readable error messages.
@@ -2687,8 +3864,12 @@ const validateRules = (rules) => rules.map((rule) => {
2687
3864
  */
2688
3865
  const formatValidationReport = (results) => results.flatMap((result, i) => {
2689
3866
  if (!result) return [];
2690
- return Object.entries(result).flatMap(([type, issues]) => (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${i + 1}, ${type}`)).filter((msg) => msg !== null));
3867
+ return Object.entries(result).flatMap(([type, issues]) => formatValidationIssues(type, issues, i + 1));
2691
3868
  });
3869
+ const formatValidationIssues = (type, issues, ruleNumber) => {
3870
+ if (type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues)) return Object.entries(issues).map(([field, issue]) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}.${field}`)).filter((msg) => msg !== null);
3871
+ return (Array.isArray(issues) ? issues : [issues]).map((issue) => formatValidationIssue(type, issue, `Rule ${ruleNumber}, ${type}`)).filter((msg) => msg !== null);
3872
+ };
2692
3873
  //#endregion
2693
3874
  //#region src/segmentation/breakpoint-processor.ts
2694
3875
  const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
@@ -3336,6 +4517,7 @@ const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
3336
4517
  if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
3337
4518
  if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
3338
4519
  if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
4520
+ if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
3339
4521
  return null;
3340
4522
  };
3341
4523
  /**
@@ -3358,7 +4540,7 @@ const buildRuleRegex = (rule, capturePrefix) => {
3358
4540
  let finalRegex = ruleRegexSource?.regex;
3359
4541
  let allCaptureNames = ruleRegexSource?.captureNames ?? [];
3360
4542
  if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
3361
- if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or lineEndsWith");
4543
+ if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
3362
4544
  if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
3363
4545
  return {
3364
4546
  captureNames: allCaptureNames,
@@ -3902,14 +5084,20 @@ const mergeRecord = (existing, incoming) => existing || incoming ? {
3902
5084
  ...existing ?? {},
3903
5085
  ...incoming ?? {}
3904
5086
  } : void 0;
5087
+ const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
3905
5088
  const mergeSplitPoints = (existing, incoming) => {
3906
5089
  const preferred = prefersIncomingSplitPoint(existing, incoming) ? incoming : existing;
3907
5090
  const fallback = preferred === incoming ? existing : incoming;
5091
+ const meta = mergeRecord(existing.meta, incoming.meta);
5092
+ if (meta && isPlainObject(existing.meta?._flappa) && isPlainObject(incoming.meta?._flappa)) meta._flappa = {
5093
+ ...existing.meta._flappa,
5094
+ ...incoming.meta._flappa
5095
+ };
3908
5096
  return {
3909
5097
  ...fallback,
3910
5098
  ...preferred,
3911
5099
  contentStartOffset: preferred.contentStartOffset ?? fallback.contentStartOffset,
3912
- meta: mergeRecord(existing.meta, incoming.meta),
5100
+ meta,
3913
5101
  namedCaptures: mergeRecord(existing.namedCaptures, incoming.namedCaptures)
3914
5102
  };
3915
5103
  };
@@ -4035,7 +5223,7 @@ const convertPageBreaks = (content, startOffset, pageBreaks, pageJoiner) => {
4035
5223
  * });
4036
5224
  */
4037
5225
  const segmentPages = (pages, options) => {
4038
- const { rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
5226
+ const { dictionary, rules = [], breakpoints = [], prefer = "longer", pageJoiner = "space", logger, maxContentLength, preprocess } = options;
4039
5227
  if (maxContentLength && maxContentLength < 50) throw new Error(`maxContentLength must be at least 50 characters.`);
4040
5228
  const maxPages = options.maxPages ?? Number.MAX_SAFE_INTEGER;
4041
5229
  const hasLimits = options.maxPages !== void 0 || maxContentLength !== void 0;
@@ -4059,13 +5247,17 @@ const segmentPages = (pages, options) => {
4059
5247
  pageIds: pageMap.pageIds,
4060
5248
  totalContentLength: matchContent.length
4061
5249
  });
4062
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
5250
+ const splitPointsFromRules = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
5251
+ const splitPointsFromDictionary = dictionary ? collectDictionarySplitPoints(preprocessedPages, dictionary, pageMap, normalizedContent, logger, debugMetaKey) : [];
5252
+ const splitPoints = [...splitPointsFromRules, ...splitPointsFromDictionary];
4063
5253
  const unique = dedupeSplitPoints(splitPoints);
4064
5254
  logger?.debug?.("[segmenter] split points collected", {
5255
+ dictionarySplitPoints: splitPointsFromDictionary.length,
4065
5256
  rawSplitPoints: splitPoints.length,
5257
+ ruleSplitPoints: splitPointsFromRules.length,
4066
5258
  uniqueSplitPoints: unique.length
4067
5259
  });
4068
- let segments = buildSegments(unique, matchContent, pageMap, rules, pageJoiner);
5260
+ let segments = buildSegments(unique, matchContent, pageMap, rules, pageJoiner, dictionary !== void 0);
4069
5261
  logger?.debug?.("[segmenter] structural segments built", { segmentCount: segments.length });
4070
5262
  segments = ensureFallbackSegment(segments, preprocessedPages, normalizedContent, pageJoiner);
4071
5263
  if (hasLimits) {
@@ -4092,7 +5284,7 @@ const segmentPages = (pages, options) => {
4092
5284
  * @param rules - Original rules (for constraint checking on first segment)
4093
5285
  * @returns Array of segment objects
4094
5286
  */
4095
- const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
5287
+ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner, hasDictionaryProfile) => {
4096
5288
  const getActualStart = (start, contentStartOffset) => start + (contentStartOffset ?? 0);
4097
5289
  const trimSegmentText = (sliced, capturedContent, contentStartOffset) => capturedContent?.trim() ?? (contentStartOffset ? sliced.trim() : sliced.replace(/[\s\n]+$/, ""));
4098
5290
  const getAdjustedStart = (actualStart, sliced, contentStartOffset) => actualStart + (contentStartOffset ? sliced.length - sliced.trimStart().length : 0);
@@ -4136,14 +5328,16 @@ const buildSegments = (splitPoints, content, pageMap, rules, pageJoiner) => {
4136
5328
  };
4137
5329
  const segments = [];
4138
5330
  if (!splitPoints.length) {
4139
- if (anyRuleAllowsId(rules, pageMap.getId(0))) {
5331
+ const firstId = pageMap.getId(0);
5332
+ if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
4140
5333
  const s = createSegment(0, content.length);
4141
5334
  if (s) segments.push(s);
4142
5335
  }
4143
5336
  return segments;
4144
5337
  }
4145
5338
  if (splitPoints[0].index > 0) {
4146
- if (anyRuleAllowsId(rules, pageMap.getId(0))) {
5339
+ const firstId = pageMap.getId(0);
5340
+ if (hasDictionaryProfile || anyRuleAllowsId(rules, firstId)) {
4147
5341
  const s = createSegment(0, splitPoints[0].index);
4148
5342
  if (s) segments.push(s);
4149
5343
  }
@@ -4544,6 +5738,6 @@ const validateSegments = (pages, options, segments, validationOptions) => {
4544
5738
  };
4545
5739
  };
4546
5740
  //#endregion
4547
- export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
5741
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, DictionaryProfileValidationError, PATTERN_TYPE_KEYS, TOKEN_PATTERNS, Token, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
4548
5742
 
4549
5743
  //# sourceMappingURL=index.mjs.map