flappa-doormal 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -662,12 +662,24 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  break;
  }
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
- if (computeRemainingSpan(currentFromIdx, toIdx, pageIds) <= maxPages && !remainingHasExclusions) {
+ const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+ if (remainingSpan <= maxPages && !remainingHasExclusions) {
  const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
  if (finalSeg) result.push(finalSeg);
  break;
  }
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+ logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
+ currentFromIdx,
+ currentFromPageId: pageIds[currentFromIdx],
+ remainingContentStart: remainingContent.slice(0, 50),
+ remainingContentLength: remainingContent.length,
+ remainingSpan,
+ toIdx,
+ toPageId: pageIds[toIdx],
+ windowEndIdx,
+ windowEndPageId: pageIds[windowEndIdx]
+ });
  const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
  const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
  let breakPosition = -1;
@@ -680,16 +692,35 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  });
  if (breakPosition <= 0) breakPosition = windowEndPosition;
  const pieceContent = remainingContent.slice(0, breakPosition).trim();
+ logger?.debug?.("[breakpoints] selectedBreak", {
+ breakPosition,
+ pieceContentEnd: pieceContent.slice(-50),
+ pieceContentLength: pieceContent.length,
+ windowEndPosition
+ });
  const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
  if (pieceContent) {
  const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
  if (pieceSeg) result.push(pieceSeg);
  }
  remainingContent = remainingContent.slice(breakPosition).trim();
- if (!remainingContent) break;
+ logger?.debug?.("[breakpoints] afterSlice", {
+ actualEndIdx,
+ remainingContentLength: remainingContent.length,
+ remainingContentStart: remainingContent.slice(0, 60)
+ });
+ if (!remainingContent) {
+ logger?.debug?.("[breakpoints] done: no remaining content");
+ break;
+ }
  currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+ logger?.debug?.("[breakpoints] nextIteration", {
+ currentFromIdx,
+ currentFromPageId: pageIds[currentFromIdx]
+ });
  isFirstPiece = false;
  }
+ logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
  return result;
  };
  /**
@@ -708,6 +739,14 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  maxPages,
  segmentCount: segments.length
  });
+ logger?.debug?.("[breakpoints] inputSegments", {
+ segmentCount: segments.length,
+ segments: segments.map((s) => ({
+ contentLength: s.content.length,
+ from: s.from,
+ to: s.to
+ }))
+ });
  for (const segment of segments) {
  const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
  const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
@@ -830,39 +869,6 @@ const filterByConstraints = (matches, rule, getId) => {
  });
  };
  /**
- * Filters matches based on occurrence setting (first, last, or all).
- *
- * Applies occurrence-based selection to a list of matches:
- * - `'all'` or `undefined`: Return all matches (default)
- * - `'first'`: Return only the first match
- * - `'last'`: Return only the last match
- *
- * @param matches - Array of match results to filter
- * @param occurrence - Which occurrence(s) to keep
- * @returns Filtered array based on occurrence setting
- *
- * @example
- * const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
- *
- * filterByOccurrence(matches, 'first')
- * // → [{ start: 0 }]
- *
- * filterByOccurrence(matches, 'last')
- * // → [{ start: 20 }]
- *
- * filterByOccurrence(matches, 'all')
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }]
- *
- * filterByOccurrence(matches, undefined)
- * // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
- */
- const filterByOccurrence = (matches, occurrence) => {
- if (!matches.length) return [];
- if (occurrence === "first") return [matches[0]];
- if (occurrence === "last") return [matches[matches.length - 1]];
- return matches;
- };
- /**
  * Checks if any rule in the list allows the given page ID.
  *
  * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
@@ -963,19 +969,13 @@ const anyRuleAllowsId = (rules, pageId) => {
  * // → '{{harf}}' (unchanged - no brackets outside tokens)
  */
  const escapeTemplateBrackets = (pattern) => {
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (match, token, bracket) => {
+ return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
  if (token) return token;
  return `\\${bracket}`;
  });
  };
- /**
- * Base token definitions mapping human-readable token names to regex patterns.
- *
- * These tokens contain raw regex patterns and do not reference other tokens.
- * For composite tokens that build on these, see `COMPOSITE_TOKENS`.
- *
- * @internal
- */
+ const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
+ const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
  const BASE_TOKENS = {
  bab: "باب",
  basmalah: ["بسم الله", "﷽"].join("|"),
@@ -983,7 +983,7 @@ const BASE_TOKENS = {
  dash: "[-–—ـ]",
  fasl: ["مسألة", "فصل"].join("|"),
  harf: "[أ-ي]",
- harfs: "[أ-ي](?:[أ-ي\\s]*[أ-ي])?",
+ harfs: "[أ-ي](?:\\s+[أ-ي])*",
  kitab: "كتاب",
  naql: [
  "حدثني",
@@ -996,6 +996,7 @@ const BASE_TOKENS = {
  ].join("|"),
  raqm: "[\\u0660-\\u0669]",
  raqms: "[\\u0660-\\u0669]+",
+ rumuz: RUMUZ_BLOCK,
  tarqim: "[.!?؟؛]"
  };
  /**
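// A quick check of what the new `rumuz` token is meant to cover, assuming the package's
// main entry is this dist module and that `getTokenPattern("rumuz")` resolves to
// RUMUZ_BLOCK above: one or more whitespace-separated abbreviation sigla such as
// "خ", "م", "خت", or "عس". The ٤ lookarounds require a lookbehind-capable RegExp engine.
import { getTokenPattern } from "flappa-doormal";

const rumuz = new RegExp(`^(?:${getTokenPattern("rumuz")})$`, "u");
console.log(rumuz.test("خ م د")); // expected: true (three single-letter sigla)
console.log(rumuz.test("كتاب")); // expected: false (a plain word, not a sigla sequence)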
@@ -1127,7 +1128,7 @@ const containsTokens = (query) => {
  * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
  * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
  */
- const expandTokensWithCaptures = (query, fuzzyTransform) => {
+ const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
  const captureNames = [];
  const captureNameCounts = /* @__PURE__ */ new Map();
  /**
@@ -1169,16 +1170,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
  const [, tokenName, captureName] = tokenMatch;
  if (!tokenName && captureName) {
  const uniqueName = getUniqueCaptureName(captureName);
- captureNames.push(uniqueName);
- return `(?<${uniqueName}>.+)`;
+ const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+ captureNames.push(prefixedName);
+ return `(?<${prefixedName}>.+)`;
  }
  let tokenPattern = TOKEN_PATTERNS[tokenName];
  if (!tokenPattern) return segment.value;
  if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
  if (captureName) {
  const uniqueName = getUniqueCaptureName(captureName);
- captureNames.push(uniqueName);
- return `(?<${uniqueName}>${tokenPattern})`;
+ const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+ captureNames.push(prefixedName);
+ return `(?<${prefixedName}>${tokenPattern})`;
  }
  return tokenPattern;
  });
@@ -1319,39 +1322,42 @@ const compileRuleRegex = (pattern) => {
  *
  * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
  */
- const processPattern = (pattern, fuzzy) => {
- const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
+ const processPattern = (pattern, fuzzy, capturePrefix) => {
+ const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
  return {
  captureNames,
  pattern: expanded
  };
  };
- const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
- const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
  const union = processed.map((p) => p.pattern).join("|");
+ const captureNames = processed.flatMap((p) => p.captureNames);
+ const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
+ if (capturePrefix) captureNames.push(`${capturePrefix}content`);
  return {
- captureNames: processed.flatMap((p) => p.captureNames),
- regex: `^(?:${union})(.*)`
+ captureNames,
+ regex: `^(?:${union})${contentCapture}`
  };
  };
- const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
- const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
  const union = processed.map((p) => p.pattern).join("|");
  return {
  captureNames: processed.flatMap((p) => p.captureNames),
  regex: `^(?:${union})`
  };
  };
- const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
- const processed = patterns.map((p) => processPattern(p, fuzzy));
+ const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+ const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
  const union = processed.map((p) => p.pattern).join("|");
  return {
  captureNames: processed.flatMap((p) => p.captureNames),
  regex: `(?:${union})$`
  };
  };
- const buildTemplateRegexSource = (template) => {
- const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+ const buildTemplateRegexSource = (template, capturePrefix) => {
+ const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
  return {
  captureNames,
  regex: pattern
@@ -1363,12 +1369,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
  *
  * Behavior mirrors the previous implementation in `segmenter.ts`.
  */
- const buildRuleRegex = (rule) => {
+ const buildRuleRegex = (rule, capturePrefix) => {
  const s = { ...rule };
  const fuzzy = rule.fuzzy ?? false;
  let allCaptureNames = [];
  if (s.lineStartsAfter?.length) {
- const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+ const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
  allCaptureNames = captureNames;
  return {
  captureNames: allCaptureNames,
@@ -1378,17 +1384,17 @@ const buildRuleRegex = (rule) => {
  };
  }
  if (s.lineStartsWith?.length) {
- const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+ const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
  s.regex = regex;
  allCaptureNames = captureNames;
  }
  if (s.lineEndsWith?.length) {
- const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+ const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
  s.regex = regex;
  allCaptureNames = captureNames;
  }
  if (s.template) {
- const { regex, captureNames } = buildTemplateRegexSource(s.template);
+ const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
  s.regex = regex;
  allCaptureNames = [...allCaptureNames, ...captureNames];
  }
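// Why the new `capturePrefix` parameter exists, shown as a self-contained sketch with
// made-up patterns: when several rule regexes are OR-ed into one combined regex (see the
// segmenter changes further down), duplicate named groups such as `title` would collide,
// so each rule's groups get an `r<index>_` prefix and a wrapper group identifies which
// rule matched; the prefix is stripped again when captures are read back.
const rule0 = "(?<r0_>باب\\s+(?<r0_title>.+))";
const rule1 = "(?<r1_>كتاب\\s+(?<r1_title>.+))";
const combined = new RegExp(`${rule0}|${rule1}`, "gmu");

const m = combined.exec("كتاب الصلاة");
console.log(m?.groups?.r1_ !== undefined); // true, so the rule at index 1 matched
console.log(m?.groups?.r1_title); // "الصلاة" (strip the `r1_` prefix to recover `title`)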
@@ -1404,16 +1410,231 @@ const buildRuleRegex = (rule) => {
  };

  //#endregion
- //#region src/segmentation/textUtils.ts
+ //#region src/segmentation/fast-fuzzy-prefix.ts
+ /**
+ * Fast-path fuzzy prefix matching for common Arabic line-start markers.
+ *
+ * This exists to avoid running expensive fuzzy-expanded regex alternations over
+ * a giant concatenated string. Instead, we match only at known line-start
+ * offsets and perform a small deterministic comparison:
+ * - Skip Arabic diacritics in the CONTENT
+ * - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+ *
+ * This module is intentionally conservative: it only supports "literal"
+ * token patterns (plain text alternation via `|`), not general regex.
+ */
+ const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+ const equivKey = (ch) => {
+ switch (ch) {
+ case "آ":
+ case "أ":
+ case "إ": return "ا";
+ case "ه": return "ة";
+ case "ي": return "ى";
+ default: return ch;
+ }
+ };
  /**
- * Strip all HTML tags from content, keeping only text.
+ * Match a fuzzy literal prefix at a given offset.
+ *
+ * - Skips diacritics in the content
+ * - Applies equivalence groups on both content and literal
  *
- * @param html - HTML content
- * @returns Plain text content
+ * @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
  */
- const stripHtmlTags = (html) => {
- return html.replace(/<[^>]*>/g, "");
+ const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+ let i = offset;
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ for (let j = 0; j < literal.length; j++) {
+ const litCh = literal[j];
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ if (i >= content.length) return null;
+ const cCh = content[i];
+ if (equivKey(cCh) !== equivKey(litCh)) return null;
+ i++;
+ }
+ while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+ return i;
+ };
+ const isLiteralOnly = (s) => {
+ return !/[\\[\]{}()^$.*+?]/.test(s);
+ };
+ const compileLiteralAlternation = (pattern) => {
+ if (!pattern) return null;
+ if (!isLiteralOnly(pattern)) return null;
+ const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+ if (!alternatives.length) return null;
+ return { alternatives };
+ };
+ /**
+ * Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+ * Returns null if not eligible.
+ */
+ const compileFastFuzzyTokenRule = (tokenTemplate) => {
+ const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+ if (!m) return null;
+ const token = m[1];
+ const tokenPattern = getTokenPattern(token);
+ if (!tokenPattern) return null;
+ const compiled = compileLiteralAlternation(tokenPattern);
+ if (!compiled) return null;
+ return {
+ alternatives: compiled.alternatives,
+ token
+ };
+ };
+ /**
+ * Try matching any alternative for a compiled token at a line-start offset.
+ * Returns endOffset (exclusive) on match, else null.
+ */
+ const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+ for (const alt of compiled.alternatives) {
+ const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+ if (end !== null) return end;
+ }
+ return null;
+ };
+
+ //#endregion
+ //#region src/segmentation/segmenter-rule-utils.ts
+ const partitionRulesForMatching = (rules) => {
+ const combinableRules = [];
+ const standaloneRules = [];
+ const fastFuzzyRules = [];
+ rules.forEach((rule, index) => {
+ if (rule.fuzzy && "lineStartsWith" in rule) {
+ const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+ if (compiled) {
+ fastFuzzyRules.push({
+ compiled,
+ kind: "startsWith",
+ rule,
+ ruleIndex: index
+ });
+ return;
+ }
+ }
+ if (rule.fuzzy && "lineStartsAfter" in rule) {
+ const compiled = rule.lineStartsAfter.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsAfter[0]) : null;
+ if (compiled) {
+ fastFuzzyRules.push({
+ compiled,
+ kind: "startsAfter",
+ rule,
+ ruleIndex: index
+ });
+ return;
+ }
+ }
+ let isCombinable = true;
+ if ("regex" in rule && rule.regex) {
+ const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+ const hasBackreferences = /\\[1-9]/.test(rule.regex);
+ const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+ if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+ }
+ if (isCombinable) combinableRules.push({
+ index,
+ prefix: `r${index}_`,
+ rule
+ });
+ else standaloneRules.push(rule);
+ });
+ return {
+ combinableRules,
+ fastFuzzyRules,
+ standaloneRules
+ };
+ };
+ const createPageStartGuardChecker = (matchContent, pageMap) => {
+ const pageStartToBoundaryIndex = /* @__PURE__ */ new Map();
+ for (let i = 0; i < pageMap.boundaries.length; i++) pageStartToBoundaryIndex.set(pageMap.boundaries[i].start, i);
+ const compiledPageStartPrev = /* @__PURE__ */ new Map();
+ const getPageStartPrevRegex = (rule, ruleIndex) => {
+ if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
+ const pattern = rule.pageStartGuard;
+ if (!pattern) {
+ compiledPageStartPrev.set(ruleIndex, null);
+ return null;
+ }
+ const expanded = processPattern(pattern, false).pattern;
+ const re = new RegExp(`(?:${expanded})$`, "u");
+ compiledPageStartPrev.set(ruleIndex, re);
+ return re;
+ };
+ const getPrevPageLastNonWsChar = (boundaryIndex) => {
+ if (boundaryIndex <= 0) return "";
+ const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+ for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
+ const ch = matchContent[i];
+ if (!ch) continue;
+ if (/\s/u.test(ch)) continue;
+ return ch;
+ }
+ return "";
+ };
+ return (rule, ruleIndex, matchStart) => {
+ const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
+ if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
+ const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+ if (!prevReq) return true;
+ const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+ if (!lastChar) return false;
+ return prevReq.test(lastChar);
+ };
+ };
+ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
+ const splitPointsByRule = /* @__PURE__ */ new Map();
+ if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
+ let boundaryIdx = 0;
+ let currentBoundary = pageMap.boundaries[boundaryIdx];
+ const advanceBoundaryTo = (offset) => {
+ while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+ boundaryIdx++;
+ currentBoundary = pageMap.boundaries[boundaryIdx];
+ }
+ };
+ const recordSplitPoint = (ruleIndex, sp) => {
+ const arr = splitPointsByRule.get(ruleIndex);
+ if (!arr) {
+ splitPointsByRule.set(ruleIndex, [sp]);
+ return;
+ }
+ arr.push(sp);
+ };
+ const isPageStart = (offset) => offset === currentBoundary?.start;
+ for (let lineStart = 0; lineStart <= matchContent.length;) {
+ advanceBoundaryTo(lineStart);
+ const pageId = currentBoundary?.id ?? 0;
+ if (lineStart >= matchContent.length) break;
+ for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+ if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+ if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+ const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+ if (end === null) continue;
+ const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+ if (kind === "startsWith") recordSplitPoint(ruleIndex, {
+ index: splitIndex,
+ meta: rule.meta
+ });
+ else {
+ const markerLength = end - lineStart;
+ recordSplitPoint(ruleIndex, {
+ contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+ index: splitIndex,
+ meta: rule.meta
+ });
+ }
+ }
+ const nextNl = matchContent.indexOf("\n", lineStart);
+ if (nextNl === -1) break;
+ lineStart = nextNl + 1;
+ }
+ return splitPointsByRule;
  };
+
+ //#endregion
+ //#region src/segmentation/textUtils.ts
  /**
  * Normalizes line endings to Unix-style (`\n`).
  *
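// A reading aid for the rule routing above, with illustrative rule objects. A fuzzy rule
// takes the fast literal path only when it has exactly one pattern and that pattern is a
// single {{token}} whose expansion is a plain `|` alternation of literals; a vocalised
// line start such as "كِتَابُ ..." still matches the literal "كتاب" there, because
// diacritics (U+064B-U+0652) are skipped and ا/أ/إ/آ, ة/ه, and ى/ي are folded together.
const fastPathRule = { fuzzy: true, lineStartsWith: ["{{kitab}}"], meta: { marker: "kitab" } };
// Two patterns (or any non-literal token) keep a rule on the regex path instead.
const regexPathRule = { fuzzy: true, lineStartsWith: ["{{kitab}}", "{{bab}} {{raqms}}"] };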
@@ -1423,7 +1644,9 @@ const stripHtmlTags = (html) => {
  * @param content - Raw content with potentially mixed line endings
  * @returns Content with all line endings normalized to `\n`
  */
- const normalizeLineEndings = (content) => content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+ const normalizeLineEndings = (content) => {
+ return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+ };

  //#endregion
  //#region src/segmentation/segmenter.ts
@@ -1544,9 +1767,63 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
  return [initialSeg];
  };
  const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
- const collectSplitPointsFromRule = (rule) => {
+ const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
+ const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+ const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
+ if (combinableRules.length > 0) {
+ const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
+ const built = buildRuleRegex(rule, prefix);
+ return {
+ prefix,
+ source: `(?<${prefix}>${built.regex.source})`,
+ ...built
+ };
+ });
+ const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+ const combinedRegex = new RegExp(combinedSource, "gm");
+ combinedRegex.lastIndex = 0;
+ let m = combinedRegex.exec(matchContent);
+ while (m !== null) {
+ const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+ if (matchedRuleIndex !== -1) {
+ const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
+ const ruleInfo = ruleRegexes[matchedRuleIndex];
+ const namedCaptures = {};
+ if (m.groups) {
+ for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
+ const cleanName = prefixedName.slice(prefix.length);
+ namedCaptures[cleanName] = m.groups[prefixedName];
+ }
+ }
+ let capturedContent;
+ let contentStartOffset;
+ if (ruleInfo.usesLineStartsAfter) {
+ capturedContent = m.groups?.[`${prefix}content`];
+ if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
+ }
+ const start = m.index;
+ const end = m.index + m[0].length;
+ const pageId = pageMap.getId(start);
+ if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+ if (!passesPageStartGuard(rule, originalIndex, start)) continue;
+ const sp = {
+ capturedContent: void 0,
+ contentStartOffset,
+ index: (rule.split ?? "at") === "at" ? start : end,
+ meta: rule.meta,
+ namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+ };
+ if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+ splitPointsByRule.get(originalIndex).push(sp);
+ }
+ }
+ if (m[0].length === 0) combinedRegex.lastIndex++;
+ m = combinedRegex.exec(matchContent);
+ }
+ }
+ const collectSplitPointsFromRule = (rule, ruleIndex) => {
  const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
- return filterByOccurrence(filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId), rule.occurrence).map((m) => {
+ const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
  const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
  const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
  return {
@@ -1557,8 +1834,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
  namedCaptures: m.namedCaptures
  };
  });
+ if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+ splitPointsByRule.get(ruleIndex).push(...points);
  };
- return rules.flatMap(collectSplitPointsFromRule);
+ standaloneRules.forEach((rule) => {
+ collectSplitPointsFromRule(rule, rules.indexOf(rule));
+ });
+ const finalSplitPoints = [];
+ rules.forEach((rule, index) => {
+ const points = splitPointsByRule.get(index);
+ if (!points || points.length === 0) return;
+ let filtered = points;
+ if (rule.occurrence === "first") filtered = [points[0]];
+ else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
+ finalSplitPoints.push(...filtered);
+ });
+ return finalSplitPoints;
  };
  /**
  * Executes a regex against content and extracts match results with capture information.
@@ -1686,12 +1977,11 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  * });
  */
  const segmentPages = (pages, options) => {
- const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
- if (!pages.length) return [];
+ const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
  let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
  segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
- if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) {
+ if (maxPages >= 0 && breakpoints.length) {
  const patternProcessor = (p) => processPattern(p, false).pattern;
  return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
  }
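// A minimal end-to-end sketch of segmentPages with the reworked defaults above
// (maxPages now defaults to 0 and breakpoints to []): with no breakpoints supplied, the
// rule-based segments are returned as-is, and passing `logger: console` is enough to
// surface the new [breakpoints] debug output whenever breakpoint post-processing does
// run. Page and rule shapes follow this diff; the import path assumes the package's
// main entry is this dist module.
import { segmentPages } from "flappa-doormal";

const pages = [
  { content: "كتاب الطهارة\nنص الصفحة الأولى", id: 1 },
  { content: "باب المياه\nنص الصفحة الثانية", id: 2 }
];
const segments = segmentPages(pages, {
  logger: console,
  rules: [{ fuzzy: true, lineStartsWith: ["{{kitab}}", "{{bab}}"], split: "at" }]
});
console.log(segments.length);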
@@ -1766,7 +2056,225 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
  };

  //#endregion
- //#region src/pattern-detection.ts
+ //#region src/analysis.ts
+ const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+ const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "");
+ const computeSpecificity = (pattern) => {
+ const tokenCount = countTokenMarkers(pattern);
+ return {
+ literalLen: stripWhitespacePlaceholders(pattern).length,
+ tokenCount
+ };
+ };
+ const DEFAULT_OPTIONS = {
+ includeFirstWordFallback: true,
+ lineFilter: void 0,
+ maxExamples: 1,
+ minCount: 3,
+ minLineLength: 6,
+ normalizeArabicDiacritics: true,
+ prefixChars: 60,
+ prefixMatchers: [/^#+/u],
+ sortBy: "specificity",
+ topK: 40
+ };
+ const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+ const TOKEN_PRIORITY_ORDER$1 = [
+ "basmalah",
+ "kitab",
+ "bab",
+ "fasl",
+ "naql",
+ "rumuz",
+ "numbered",
+ "raqms",
+ "raqm",
+ "dash",
+ "bullet",
+ "tarqim"
+ ];
+ const buildTokenPriority = () => {
+ const allTokens = new Set(getAvailableTokens());
+ return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
+ };
+ const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
+ const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+ const compileTokenRegexes = (tokenNames) => {
+ const compiled = [];
+ for (const token of tokenNames) {
+ const pat = TOKEN_PATTERNS[token];
+ if (!pat) continue;
+ try {
+ compiled.push({
+ re: new RegExp(pat, "uy"),
+ token
+ });
+ } catch {}
+ }
+ return compiled;
+ };
+ const appendWs = (out) => out && !out.endsWith("\\s*") ? `${out}\\s*` : out;
+ const consumeLeadingPrefixes = (s, pos, out, prefixMatchers) => {
+ let matchedAny = false;
+ let currentPos = pos;
+ let currentOut = out;
+ for (const re of prefixMatchers) {
+ if (currentPos >= s.length) break;
+ const m = re.exec(s.slice(currentPos));
+ if (!m || m.index !== 0 || !m[0]) continue;
+ currentOut += escapeRegexLiteral(m[0]);
+ currentPos += m[0].length;
+ matchedAny = true;
+ const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
+ if (wsAfter) {
+ currentPos += wsAfter[0].length;
+ currentOut = appendWs(currentOut);
+ }
+ }
+ return {
+ matchedAny,
+ out: currentOut,
+ pos: currentPos
+ };
+ };
+ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+ let best = null;
+ for (const { token, re } of compiled) {
+ re.lastIndex = pos;
+ const m = re.exec(s);
+ if (!m || m.index !== pos) continue;
+ if (!best || m[0].length > best.text.length) best = {
+ text: m[0],
+ token
+ };
+ }
+ if (best?.token === "rumuz") {
+ const end = pos + best.text.length;
+ const next = end < s.length ? s[end] : "";
+ if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+ }
+ return best;
+ };
+ const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers) => {
+ const trimmed = collapseWhitespace(line);
+ if (!trimmed) return null;
+ const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
+ let pos = 0;
+ let out = "";
+ let matchedAny = false;
+ let matchedToken = false;
+ const compiled = compileTokenRegexes(tokenNames);
+ const isArabicLetter = (ch) => /[\u0600-\u06FF]/u.test(ch);
+ const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+ {
+ const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers);
+ pos = consumed.pos;
+ out = consumed.out;
+ matchedAny = consumed.matchedAny;
+ }
+ for (let steps = 0; steps < 6 && pos < s.length; steps++) {
+ const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
+ if (wsMatch) {
+ pos += wsMatch[0].length;
+ out = appendWs(out);
+ continue;
+ }
+ const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+ if (best) {
+ if (out && !out.endsWith("\\s*")) {}
+ out += `{{${best.token}}}`;
+ matchedAny = true;
+ matchedToken = true;
+ pos += best.text.length;
+ continue;
+ }
+ if (matchedAny) {
+ const ch = s[pos];
+ if (ch && isCommonDelimiter(ch)) {
+ out += escapeRegexLiteral(ch);
+ pos += 1;
+ continue;
+ }
+ }
+ if (matchedAny) {
+ if (includeFirstWordFallback && !matchedToken) {
+ const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+ if (!firstWord$1) break;
+ out += escapeRegexLiteral(firstWord$1);
+ }
+ break;
+ }
+ if (!includeFirstWordFallback) return null;
+ const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+ if (!firstWord) return null;
+ out += escapeRegexLiteral(firstWord);
+ return out;
+ }
+ if (!matchedAny) return null;
+ while (out.endsWith("\\s*")) out = out.slice(0, -3);
+ return out;
+ };
+ /**
+ * Analyze pages and return the most common line-start patterns (top K).
+ *
+ * This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
+ * template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
+ */
+ const analyzeCommonLineStarts = (pages, options = {}) => {
+ const o = {
+ ...DEFAULT_OPTIONS,
+ ...options,
+ lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
+ prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers
+ };
+ const tokenPriority = buildTokenPriority();
+ const counts = /* @__PURE__ */ new Map();
+ for (const page of pages) {
+ const lines = normalizeLineEndings(page.content ?? "").split("\n");
+ for (const line of lines) {
+ const trimmed = collapseWhitespace(line);
+ if (trimmed.length < o.minLineLength) continue;
+ if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
+ const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers);
+ if (!sig) continue;
+ const existing = counts.get(sig);
+ if (!existing) counts.set(sig, {
+ count: 1,
+ examples: [{
+ line: trimmed,
+ pageId: page.id
+ }]
+ });
+ else {
+ existing.count++;
+ if (existing.examples.length < o.maxExamples) existing.examples.push({
+ line: trimmed,
+ pageId: page.id
+ });
+ }
+ }
+ }
+ const compareSpecificityThenCount = (a, b) => {
+ const sa = computeSpecificity(a.pattern);
+ const sb = computeSpecificity(b.pattern);
+ if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
+ if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
+ if (b.count !== a.count) return b.count - a.count;
+ return a.pattern.localeCompare(b.pattern);
+ };
+ const compareCountThenSpecificity = (a, b) => {
+ if (b.count !== a.count) return b.count - a.count;
+ return compareSpecificityThenCount(a, b);
+ };
+ return [...counts.entries()].map(([pattern, v]) => ({
+ count: v.count,
+ examples: v.examples,
+ pattern
+ })).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+ };
+
+ //#endregion
+ //#region src/detection.ts
  /**
  * Pattern detection utilities for recognizing template tokens in Arabic text.
  * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
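// A hedged usage sketch for the new analyzeCommonLineStarts export: it walks page content
// line by line, abstracts recurring line starts into token templates built from the
// library tokens (e.g. {{bab}}, {{raqms}}, {{rumuz}}), and returns them with counts and
// sample lines. The option values below are illustrative; per DEFAULT_OPTIONS above, the
// defaults are minCount: 3, topK: 40, maxExamples: 1, and sortBy: "specificity".
import { analyzeCommonLineStarts } from "flappa-doormal";

const pages = [
  { content: "باب ١ في الطهارة\nنص\nباب ٢ في الصلاة", id: 1 },
  { content: "باب ٣ في الزكاة\nنص آخر", id: 2 }
];
for (const { count, examples, pattern } of analyzeCommonLineStarts(pages, { minCount: 2, sortBy: "count" })) {
  console.log(count, pattern, examples[0]?.line);
}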
@@ -1785,6 +2293,7 @@ const TOKEN_PRIORITY_ORDER = [
  "bab",
  "fasl",
  "naql",
+ "rumuz",
  "numbered",
  "raqms",
  "raqm",
@@ -1921,5 +2430,5 @@ const analyzeTextForRule = (text) => {
  };

  //#endregion
- export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
  //# sourceMappingURL=index.mjs.map