flappa-doormal 2.1.0 → 2.2.0
This diff shows the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- package/AGENTS.md +32 -0
- package/README.md +94 -56
- package/dist/index.d.mts +95 -8
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +49 -116
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/dist/index.mjs
CHANGED
```diff
@@ -673,6 +673,35 @@ const normalizeLineEndings = (content) => content.replace(/\r\n?/g, "\n");
  * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
  */
 /**
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
+ * but preserves content inside `{{...}}` token delimiters.
+ *
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
+ *
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
+ *
+ * @example
+ * escapeTemplateBrackets('({{harf}}): ')
+ * // → '\\({{harf}}\\): '
+ *
+ * @example
+ * escapeTemplateBrackets('[{{raqm}}] ')
+ * // → '\\[{{raqm}}\\] '
+ *
+ * @example
+ * escapeTemplateBrackets('{{harf}}')
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
+ */
+const escapeTemplateBrackets = (pattern) => {
+  return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (match, token, bracket) => {
+    if (token) return token;
+    return `\\${bracket}`;
+  });
+};
+/**
  * Base token definitions mapping human-readable token names to regex patterns.
  *
  * These tokens contain raw regex patterns and do not reference other tokens.
```
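The new helper also joins the public export list (see the final hunk), so its behavior can be exercised directly. A minimal usage sketch mirroring the `@example` blocks above, assuming the package name doubles as the import specifier:

```js
import { escapeTemplateBrackets } from 'flappa-doormal';

// Brackets outside {{...}} delimiters become literal regex characters.
escapeTemplateBrackets('({{harf}}): '); // → '\\({{harf}}\\): '
escapeTemplateBrackets('[{{raqm}}] ');  // → '\\[{{raqm}}\\] '

// Content inside token delimiters is preserved, so a token that expands
// to a character class such as [أ-ي] still works after substitution.
escapeTemplateBrackets('{{harf}}');     // → '{{harf}}' (unchanged)
```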
```diff
@@ -1000,7 +1029,7 @@ const hasCapturingGroup = (pattern) => {
  * // → { pattern: 'حَ?دَّ?ثَ?نَ?ا|...', captureNames: [] }
  */
 const processPattern = (pattern, fuzzy) => {
-  const { pattern: expanded, captureNames } = expandTokensWithCaptures(pattern, fuzzy ? makeDiacriticInsensitive : void 0);
+  const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
   return {
     captureNames,
     pattern: expanded
```
```diff
@@ -1055,16 +1084,16 @@ const buildRuleRegex = (rule) => {
     const processed = s.lineStartsWith.map((p) => processPattern(p, fuzzy));
     const patterns = processed.map((p) => p.pattern).join("|");
     allCaptureNames = processed.flatMap((p) => p.captureNames);
-    s.
+    s.regex = `^(?:${patterns})`;
   }
   if (s.lineEndsWith?.length) {
     const processed = s.lineEndsWith.map((p) => processPattern(p, fuzzy));
     const patterns = processed.map((p) => p.pattern).join("|");
     allCaptureNames = processed.flatMap((p) => p.captureNames);
-    s.
+    s.regex = `(?:${patterns})$`;
   }
   if (s.template) {
-    const { pattern, captureNames } = expandTokensWithCaptures(s.template);
+    const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(s.template));
     s.regex = pattern;
     allCaptureNames = [...allCaptureNames, ...captureNames];
   }
```
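These call sites are the behavioral core of the release: every `lineStartsWith`, `lineEndsWith`, and `template` pattern is now routed through `escapeTemplateBrackets` before token expansion. The same escape-then-expand order can be reproduced with the exported helpers; a sketch, assuming `{{harf}}` expands to `[أ-ي]` as the JSDoc above states:

```js
import { escapeTemplateBrackets, expandTokensWithCaptures } from 'flappa-doormal';

// Escape first, expand second: the user-written parentheses are escaped,
// while the character class produced by the token stays functional.
const { pattern } = expandTokensWithCaptures(escapeTemplateBrackets('({{harf}}): '));
// pattern ≈ '\\([أ-ي]\\): '
```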
```diff
@@ -1227,7 +1256,7 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  * @param prefer - 'longer' for last match, 'shorter' for first match
  * @returns Processed segments with oversized ones broken up
  */
-const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer
+const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer) => {
   const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds$1, expandedBreakpoints$1, cumulativeOffsets$1) => {
     const startingPageId = pageIds$1[currentFromIdx];
     if (expandedBreakpoints$1.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets$1[currentFromIdx + 1] - cumulativeOffsets$1[currentFromIdx];
```
```diff
@@ -1259,168 +1288,72 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
   const patternProcessor = (p) => processPattern(p, false).pattern;
   const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
   const result = [];
-  logger?.info?.("Starting breakpoint processing", {
-    maxPages,
-    segmentCount: segments.length
-  });
   for (const segment of segments) {
     const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
     const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
-    logger?.debug?.("Processing segment", {
-      contentLength: segment.content.length,
-      contentPreview: segment.content.slice(0, 100),
-      from: segment.from,
-      fromIdx,
-      to: segment.to,
-      toIdx
-    });
     const segmentSpan = (segment.to ?? segment.from) - segment.from;
     const hasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
     if (segmentSpan <= maxPages && !hasExclusions) {
-      logger?.trace?.("Segment within limit, keeping as-is");
       result.push(segment);
       continue;
     }
-    logger?.debug?.("Segment exceeds limit or has exclusions, breaking it up");
     let remainingContent = segment.content;
     let currentFromIdx = fromIdx;
     let isFirstPiece = true;
-    let iterationCount = 0;
-    const maxIterations = 1e4;
     while (currentFromIdx <= toIdx) {
-      iterationCount++;
-      if (iterationCount > maxIterations) {
-        logger?.error?.("INFINITE LOOP DETECTED! Breaking out", { iterationCount: maxIterations });
-        logger?.error?.("Loop state", {
-          currentFromIdx,
-          remainingContentLength: remainingContent.length,
-          toIdx
-        });
-        break;
-      }
       const remainingSpan = pageIds[toIdx] - pageIds[currentFromIdx];
-      logger?.trace?.("Loop iteration", {
-        currentFromIdx,
-        currentPageId: pageIds[currentFromIdx],
-        iterationCount,
-        remainingContentLength: remainingContent.length,
-        remainingContentPreview: remainingContent.slice(0, 80),
-        remainingSpan,
-        toIdx,
-        toPageId: pageIds[toIdx]
-      });
       const remainingHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, toIdx));
       if (remainingSpan <= maxPages && !remainingHasExclusions) {
-        logger?.debug?.("Remaining span within limit, outputting final segment");
         const finalSeg = createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, isFirstPiece ? segment.meta : void 0);
         if (finalSeg) result.push(finalSeg);
         break;
       }
-      const
-      const maxWindowPageId = currentPageId + maxPages;
+      const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
       let windowEndIdx = currentFromIdx;
       for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
       else break;
-      logger?.trace?.("Window calculation", {
-        currentPageId,
-        maxWindowPageId,
-        windowEndIdx,
-        windowEndPageId: pageIds[windowEndIdx]
-      });
       const windowHasExclusions = expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, currentFromIdx, windowEndIdx));
       let breakPosition = -1;
-      if (windowHasExclusions)
-
-
-
-
-
-
-
-        expandedBreakpoints,
-        normalizedPages,
-        pageIds,
-        prefer
-      };
-      logger?.trace?.("Finding break position using patterns...");
-      breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, breakpointCtx);
-      logger?.trace?.("Pattern break position", { breakPosition });
-      }
+      if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+      if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, {
+        cumulativeOffsets,
+        expandedBreakpoints,
+        normalizedPages,
+        pageIds,
+        prefer
+      });
       if (breakPosition <= 0) {
-        logger?.debug?.("No pattern matched, falling back to page boundary");
         if (windowEndIdx === currentFromIdx) {
-          logger?.trace?.("Single page window, outputting page and advancing");
           const pageContent = cumulativeOffsets[currentFromIdx + 1] !== void 0 ? remainingContent.slice(0, cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx]) : remainingContent;
           const pageSeg = createSegment(pageContent.trim(), pageIds[currentFromIdx], void 0, isFirstPiece ? segment.meta : void 0);
           if (pageSeg) result.push(pageSeg);
           remainingContent = remainingContent.slice(pageContent.length).trim();
           currentFromIdx++;
           isFirstPiece = false;
-          logger?.trace?.("After single page", {
-            currentFromIdx,
-            remainingContentLength: remainingContent.length
-          });
           continue;
         }
         breakPosition = cumulativeOffsets[windowEndIdx + 1] - cumulativeOffsets[currentFromIdx];
-        logger?.trace?.("Multi-page window, using full window break position", { breakPosition });
       }
       const pieceContent = remainingContent.slice(0, breakPosition).trim();
-      logger?.trace?.("Piece extracted", {
-        breakPosition,
-        pieceContentLength: pieceContent.length,
-        pieceContentPreview: pieceContent.slice(0, 80)
-      });
       const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
       const actualEndIdx = pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx;
-      logger?.trace?.("Actual page indices", {
-        actualEndIdx,
-        actualStartIdx,
-        pieceHasContent: !!pieceContent
-      });
       if (pieceContent) {
         const pieceSeg = createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, isFirstPiece ? segment.meta : void 0);
-        if (pieceSeg)
-          result.push(pieceSeg);
-        logger?.debug?.("Created segment", {
-          contentLength: pieceSeg.content.length,
-          from: pieceSeg.from,
-          to: pieceSeg.to
-        });
-        }
+        if (pieceSeg) result.push(pieceSeg);
       }
-      const prevRemainingLength = remainingContent.length;
       remainingContent = remainingContent.slice(breakPosition).trim();
-      logger?.trace?.("After slicing remainingContent", {
-        newLength: remainingContent.length,
-        prevLength: prevRemainingLength,
-        slicedAmount: breakPosition
-      });
-      if (!remainingContent) {
-        logger?.debug?.("No remaining content, breaking out of loop");
-        break;
-      }
       let nextFromIdx = actualEndIdx;
-      if (actualEndIdx + 1 <= toIdx) {
+      if (remainingContent && actualEndIdx + 1 <= toIdx) {
        const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
        if (nextPageData) {
          const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
-          if (nextPrefix && remainingContent.startsWith(nextPrefix))
-            nextFromIdx = actualEndIdx + 1;
-          logger?.trace?.("Content starts with next page prefix", { advancingTo: nextFromIdx });
-          }
+          if (nextPrefix && remainingContent.startsWith(nextPrefix)) nextFromIdx = actualEndIdx + 1;
        }
       }
-      logger?.trace?.("End of iteration", {
-        nextFromIdx,
-        prevCurrentFromIdx: currentFromIdx,
-        willAdvance: nextFromIdx !== currentFromIdx
-      });
       currentFromIdx = nextFromIdx;
       isFirstPiece = false;
     }
   }
-  logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
   return result;
 };
 /**
```
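Stripped of its logger instrumentation, the loop's break-position search reduces to a three-step cascade. A schematic restatement (both helpers named here are module-internal, not part of the public API):

```js
// Inside the while loop of applyBreakpoints (schematic only):
let breakPosition = -1;
// 1. If the window touches an excluded page, break just before it.
if (windowHasExclusions) breakPosition = findExclusionBreakPosition(/* ... */);
// 2. Otherwise (or if that yielded nothing), search for a breakpoint pattern.
if (breakPosition <= 0) breakPosition = findBreakPosition(/* ... */);
// 3. Still nothing: fall back to a page boundary, one page at a time for
//    single-page windows, else the full window's extent.
```

Note also that the explicit `if (!remainingContent) break;` guard and the 10,000-iteration loop guard are both gone; the emptiness test is folded into the next-page-prefix advance (`if (remainingContent && ...)`).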
```diff
@@ -1466,7 +1399,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  * });
  */
 const segmentPages = (pages, options) => {
-  const { rules = [], maxPages, breakpoints, prefer = "longer"
+  const { rules = [], maxPages, breakpoints, prefer = "longer" } = options;
   if (!pages.length) return [];
   const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
   const splitPoints = [];
```
```diff
@@ -1504,7 +1437,7 @@ const segmentPages = (pages, options) => {
     if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
     if (initialSeg.content) segments = [initialSeg];
   }
-  if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer
+  if (maxPages !== void 0 && maxPages >= 0 && breakpoints?.length) return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer);
   return segments;
 };
 /**
```
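As the completed destructuring shows, `segmentPages` reads its four options (`rules`, `maxPages`, `breakpoints`, `prefer`) directly. A combined usage sketch; the rule shape comes from the JSDoc example in the first hunk, while the page objects and the string form of `breakpoints` are assumptions drawn from the surrounding code rather than a documented contract:

```js
import { segmentPages } from 'flappa-doormal';

// Hypothetical pages: each entry carries an id and its text content.
const pages = [
  { id: 1, content: '...' },
  { id: 2, content: '...' },
  { id: 3, content: '...' },
];

const segments = segmentPages(pages, {
  rules: [{ lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }],
  maxPages: 1,               // re-break any segment spanning more than one page
  breakpoints: ['{{dash}}'], // candidate break patterns for oversized segments
  prefer: 'longer',          // 'longer' takes the last match in the window
});
```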
```diff
@@ -1731,5 +1664,5 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
+export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.mjs.map
```