flappa-doormal 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1227,7 +1227,7 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
1227
1227
  * @param prefer - 'longer' for last match, 'shorter' for first match
1228
1228
  * @returns Processed segments with oversized ones broken up
1229
1229
  */
1230
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer) => {
1230
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger) => {
1231
1231
  const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
1232
1232
  const startingPageId = pageIds$1[currentFromIdx];
1233
1233
  if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
@@ -1259,72 +1259,168 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
1259
1259
  const patternProcessor = (p) => processPattern(p, false).pattern;
1260
1260
  const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
1261
1261
  const result = [];
1262
+ logger?.info?.("Starting breakpoint processing", {
1263
+ maxPages,
1264
+ segmentCount: segments.length
1265
+ });
1262
1266
  for (const segment of segments) {
1263
1267
  const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
1264
1268
  const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
1269
+ logger?.debug?.("Processing segment", {
1270
+ contentLength: segment.content.length,
1271
+ contentPreview: segment.content.slice(0, 100),
1272
+ from: segment.from,
1273
+ fromIdx,
1274
+ to: segment.to,
1275
+ toIdx
1276
+ });
1265
1277
  const segmentSpan = (segment.to ?? segment.from) - segment.from;
1266
1278
  const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
1267
1279
  if (segmentSpan <= maxPages && !hasExclusions) {
1280
+ logger?.trace?.("Segment within limit, keeping as-is");
1268
1281
  result.push(segment);
1269
1282
  continue;
1270
1283
  }
1284
+ logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
1271
1285
  let remainingContent = segment.content;
1272
1286
  let currentFromIdx = fromIdx;
1273
1287
  let isFirstPiece = true;
1288
+ let iterationCount = 0;
1289
+ const maxIterations = 1e4;
1274
1290
  while (currentFromIdx <= toIdx) {
1291
+ iterationCount++;
1292
+ if (iterationCount > maxIterations) {
1293
+ logger?.error?.("INFINITE LOOP DETECTED! Breaking out", { iterationCount: maxIterations });
1294
+ logger?.error?.("Loop state", {
1295
+ currentFromIdx,
1296
+ remainingContentLength: remainingContent.length,
1297
+ toIdx
1298
+ });
1299
+ break;
1300
+ }
1275
1301
  const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
1302
+ logger?.trace?.("Loop iteration", {
1303
+ currentFromIdx,
1304
+ currentPageId: pageIds[currentFromIdx],
1305
+ iterationCount,
1306
+ remainingContentLength: remainingContent.length,
1307
+ remainingContentPreview: remainingContent.slice(0, 80),
1308
+ remainingSpan,
1309
+ toIdx,
1310
+ toPageId: pageIds[toIdx]
1311
+ });
1276
1312
  const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
1277
1313
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
1314
+ logger?.debug?.("Remaining span within limit, outputting final segment");
1278
1315
  const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1279
1316
  if (finalSeg) result.push(finalSeg);
1280
1317
  break;
1281
1318
  }
1282
- const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
1319
+ const currentPageId = pageIds[currentFromIdx];
1320
+ const maxWindowPageId = currentPageId + maxPages;
1283
1321
  let windowEndIdx = currentFromIdx;
1284
1322
  for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
1285
1323
  else break;
1324
+ logger?.trace?.("Window calculation", {
1325
+ currentPageId,
1326
+ maxWindowPageId,
1327
+ windowEndIdx,
1328
+ windowEndPageId: pageIds[windowEndIdx]
1329
+ });
1286
1330
  const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
1287
1331
  let breakPosition = -1;
1288
- if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
1289
- if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, {
1290
- cumulativeOffsets,
1291
- expandedBreakpoints,
1292
- normalizedPages,
1293
- pageIds,
1294
- prefer
1295
- });
1332
+ if (windowHasExclusions) {
1333
+ logger?.trace?.("Window has exclusions, finding exclusion break position");
1334
+ breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
1335
+ logger?.trace?.("Exclusion break position", { breakPosition });
1336
+ }
1296
1337
  if (breakPosition <= 0) {
1338
+ const breakpointCtx = {
1339
+ cumulativeOffsets,
1340
+ expandedBreakpoints,
1341
+ normalizedPages,
1342
+ pageIds,
1343
+ prefer
1344
+ };
1345
+ logger?.trace?.("Finding break position using patterns...");
1346
+ breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
1347
+ logger?.trace?.("Pattern break position", { breakPosition });
1348
+ }
1349
+ if (breakPosition <= 0) {
1350
+ logger?.debug?.("No pattern matched, falling back to page boundary");
1297
1351
  if (windowEndIdx === currentFromIdx) {
1352
+ logger?.trace?.("Single page window, outputting page and advancing");
1298
1353
  const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
1299
1354
  const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
1300
1355
  if (pageSeg) result.push(pageSeg);
1301
1356
  remainingContent = remainingContent.slice(pageContent.length).trim();
1302
1357
  currentFromIdx++;
1303
1358
  isFirstPiece = false;
1359
+ logger?.trace?.("After single page", {
1360
+ currentFromIdx,
1361
+ remainingContentLength: remainingContent.length
1362
+ });
1304
1363
  continue;
1305
1364
  }
1306
1365
  breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
1366
+ logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
1307
1367
  }
1308
1368
  const pieceContent = remainingContent.slice(0, breakPosition).trim();
1369
+ logger?.trace?.("Piece extracted", {
1370
+ breakPosition,
1371
+ pieceContentLength: pieceContent.length,
1372
+ pieceContentPreview: pieceContent.slice(0, 80)
1373
+ });
1309
1374
  const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
1310
1375
  const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
1376
+ logger?.trace?.("Actual page indices", {
1377
+ actualEndIdx,
1378
+ actualStartIdx,
1379
+ pieceHasContent: !!pieceContent
1380
+ });
1311
1381
  if (pieceContent) {
1312
1382
  const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
1313
- if (pieceSeg) result.push(pieceSeg);
1383
+ if (pieceSeg) {
1384
+ result.push(pieceSeg);
1385
+ logger?.debug?.("Created segment", {
1386
+ contentLength: pieceSeg.content.length,
1387
+ from: pieceSeg.from,
1388
+ to: pieceSeg.to
1389
+ });
1390
+ }
1314
1391
  }
1392
+ const prevRemainingLength = remainingContent.length;
1315
1393
  remainingContent = remainingContent.slice(breakPosition).trim();
1394
+ logger?.trace?.("After slicing remainingContent", {
1395
+ newLength: remainingContent.length,
1396
+ prevLength: prevRemainingLength,
1397
+ slicedAmount: breakPosition
1398
+ });
1399
+ if (!remainingContent) {
1400
+ logger?.debug?.("No remaining content, breaking out of loop");
1401
+ break;
1402
+ }
1316
1403
  let nextFromIdx = actualEndIdx;
1317
- if (remainingContent && actualEndIdx + 1 <= toIdx) {
1404
+ if (actualEndIdx + 1 <= toIdx) {
1318
1405
  const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
1319
1406
  if (nextPageData) {
1320
1407
  const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
1321
- if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
1408
+ if (nextPrefix && remainingContent.startsWith(nextPrefix)) {
1409
+ nextFromIdx = actualEndIdx + 1;
1410
+ logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
1411
+ }
1322
1412
  }
1323
1413
  }
1414
+ logger?.trace?.("End of iteration", {
1415
+ nextFromIdx,
1416
+ prevCurrentFromIdx: currentFromIdx,
1417
+ willAdvance: nextFromIdx !== currentFromIdx
1418
+ });
1324
1419
  currentFromIdx = nextFromIdx;
1325
1420
  isFirstPiece = false;
1326
1421
  }
1327
1422
  }
1423
+ logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
1328
1424
  return result;
1329
1425
  };
1330
1426
  /**
@@ -1370,7 +1466,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
1370
1466
  * });
1371
1467
  */
1372
1468
  const segmentPages = (pages, options) => {
1373
- const { rules = [], maxPages, breakpoints, prefer = "longer" } = options;
1469
+ const { rules = [], maxPages, breakpoints, prefer = "longer", logger } = options;
1374
1470
  if (!pages.length) return [];
1375
1471
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
1376
1472
  const splitPoints = [];
@@ -1408,7 +1504,7 @@ const segmentPages = (pages, options) => {
1408
1504
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
1409
1505
  if (initialSeg.content) segments = [initialSeg];
1410
1506
  }
1411
- if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer);
1507
+ if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, logger);
1412
1508
  return segments;
1413
1509
  };
1414
1510
  /**
@@ -1480,5 +1576,160 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
1480
1576
  };
1481
1577
 
1482
1578
  //#endregion
1483
- export { TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
1579
+ //#region src/pattern-detection.ts
1580
+ /**
1581
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
1582
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
1583
+ *
1584
+ * @module pattern-detection
1585
+ */
1586
+ /**
1587
+ * Token detection order - more specific patterns first to avoid partial matches.
1588
+ * Example: 'raqms' before 'raqm' so "٣٤" matches 'raqms' not just the first digit.
1589
+ *
1590
+ * Tokens not in this list are appended in alphabetical order from TOKEN_PATTERNS.
1591
+ */
1592
+ const TOKEN_PRIORITY_ORDER = [
1593
+ "basmalah",
1594
+ "kitab",
1595
+ "bab",
1596
+ "fasl",
1597
+ "naql",
1598
+ "numbered",
1599
+ "raqms",
1600
+ "raqm",
1601
+ "tarqim",
1602
+ "bullet",
1603
+ "dash",
1604
+ "harf"
1605
+ ];
1606
+ /**
1607
+ * Gets the token detection priority order.
1608
+ * Returns tokens in priority order, with any TOKEN_PATTERNS not in the priority list appended.
1609
+ */
1610
+ const getTokenPriority = () => {
1611
+ const allTokens = getAvailableTokens();
1612
+ const prioritized = TOKEN_PRIORITY_ORDER.filter((t) => allTokens.includes(t));
1613
+ const remaining = allTokens.filter((t) => !TOKEN_PRIORITY_ORDER.includes(t)).sort();
1614
+ return [...prioritized, ...remaining];
1615
+ };
1616
+ /**
1617
+ * Analyzes text and returns all detected token patterns with their positions.
1618
+ * Patterns are detected in priority order to avoid partial matches.
1619
+ *
1620
+ * @param text - The text to analyze for token patterns
1621
+ * @returns Array of detected patterns sorted by position
1622
+ *
1623
+ * @example
1624
+ * detectTokenPatterns("٣٤ - حدثنا")
1625
+ * // Returns: [
1626
+ * // { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
1627
+ * // { token: 'dash', match: '-', index: 3, endIndex: 4 },
1628
+ * // { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
1629
+ * // ]
1630
+ */
1631
+ const detectTokenPatterns = (text) => {
1632
+ if (!text) return [];
1633
+ const results = [];
1634
+ const coveredRanges = [];
1635
+ const isPositionCovered = (start, end) => {
1636
+ return coveredRanges.some(([s, e]) => start >= s && start < e || end > s && end <= e || start <= s && end >= e);
1637
+ };
1638
+ for (const tokenName of getTokenPriority()) {
1639
+ const pattern = TOKEN_PATTERNS[tokenName];
1640
+ if (!pattern) continue;
1641
+ try {
1642
+ const regex = new RegExp(`(${pattern})`, "gu");
1643
+ let match;
1644
+ while ((match = regex.exec(text)) !== null) {
1645
+ const startIndex = match.index;
1646
+ const endIndex = startIndex + match[0].length;
1647
+ if (isPositionCovered(startIndex, endIndex)) continue;
1648
+ results.push({
1649
+ endIndex,
1650
+ index: startIndex,
1651
+ match: match[0],
1652
+ token: tokenName
1653
+ });
1654
+ coveredRanges.push([startIndex, endIndex]);
1655
+ }
1656
+ } catch {}
1657
+ }
1658
+ return results.sort((a, b) => a.index - b.index);
1659
+ };
1660
+ /**
1661
+ * Generates a template pattern from text using detected tokens.
1662
+ * Replaces matched portions with {{token}} syntax.
1663
+ *
1664
+ * @param text - Original text
1665
+ * @param detected - Array of detected patterns from detectTokenPatterns
1666
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
1667
+ *
1668
+ * @example
1669
+ * const detected = detectTokenPatterns("٣٤ - ");
1670
+ * generateTemplateFromText("٣٤ - ", detected);
1671
+ * // Returns: "{{raqms}} {{dash}} "
1672
+ */
1673
+ const generateTemplateFromText = (text, detected) => {
1674
+ if (!text || detected.length === 0) return text;
1675
+ let template = text;
1676
+ const sortedByIndexDesc = [...detected].sort((a, b) => b.index - a.index);
1677
+ for (const d of sortedByIndexDesc) template = `${template.slice(0, d.index)}{{${d.token}}}${template.slice(d.endIndex)}`;
1678
+ return template;
1679
+ };
1680
+ /**
1681
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
1682
+ *
1683
+ * @param detected - Array of detected patterns
1684
+ * @returns Suggested pattern type and whether to use fuzzy matching
1685
+ */
1686
+ const suggestPatternConfig = (detected) => {
1687
+ const hasStructuralToken = detected.some((d) => [
1688
+ "basmalah",
1689
+ "kitab",
1690
+ "bab",
1691
+ "fasl"
1692
+ ].includes(d.token));
1693
+ const hasNumberedPattern = detected.some((d) => [
1694
+ "raqms",
1695
+ "raqm",
1696
+ "numbered"
1697
+ ].includes(d.token));
1698
+ if (hasStructuralToken) return {
1699
+ fuzzy: true,
1700
+ metaType: detected.find((d) => [
1701
+ "kitab",
1702
+ "bab",
1703
+ "fasl"
1704
+ ].includes(d.token))?.token || "chapter",
1705
+ patternType: "lineStartsWith"
1706
+ };
1707
+ if (hasNumberedPattern) return {
1708
+ fuzzy: false,
1709
+ metaType: "hadith",
1710
+ patternType: "lineStartsAfter"
1711
+ };
1712
+ return {
1713
+ fuzzy: false,
1714
+ patternType: "lineStartsAfter"
1715
+ };
1716
+ };
1717
+ /**
1718
+ * Analyzes text and generates a complete suggested rule configuration.
1719
+ *
1720
+ * @param text - Highlighted text from the page
1721
+ * @returns Suggested rule configuration or null if no patterns detected
1722
+ */
1723
+ const analyzeTextForRule = (text) => {
1724
+ const detected = detectTokenPatterns(text);
1725
+ if (detected.length === 0) return null;
1726
+ return {
1727
+ detected,
1728
+ template: generateTemplateFromText(text, detected),
1729
+ ...suggestPatternConfig(detected)
1730
+ };
1731
+ };
1732
+
1733
+ //#endregion
1734
+ export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
1484
1735
  //# sourceMappingURL=index.mjs.map