flappa-doormal 2.3.0 → 2.4.0
- package/AGENTS.md +54 -0
- package/README.md +257 -5
- package/dist/index.d.mts +100 -23
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +591 -82
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -5
package/dist/index.mjs
CHANGED
@@ -662,12 +662,24 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 break;
 }
 const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
-
+const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+if (remainingSpan <= maxPages && !remainingHasExclusions) {
 const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
 if (finalSeg) result.push(finalSeg);
 break;
 }
 const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
+logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx],
+remainingContentStart: remainingContent.slice(0, 50),
+remainingContentLength: remainingContent.length,
+remainingSpan,
+toIdx,
+toPageId: pageIds[toIdx],
+windowEndIdx,
+windowEndPageId: pageIds[windowEndIdx]
+});
 const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
 const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
 let breakPosition = -1;
@@ -680,16 +692,35 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
 });
 if (breakPosition <= 0) breakPosition = windowEndPosition;
 const pieceContent = remainingContent.slice(0, breakPosition).trim();
+logger?.debug?.("[breakpoints] selectedBreak", {
+breakPosition,
+pieceContentEnd: pieceContent.slice(-50),
+pieceContentLength: pieceContent.length,
+windowEndPosition
+});
 const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
 if (pieceContent) {
 const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
 if (pieceSeg) result.push(pieceSeg);
 }
 remainingContent = remainingContent.slice(breakPosition).trim();
-
+logger?.debug?.("[breakpoints] afterSlice", {
+actualEndIdx,
+remainingContentLength: remainingContent.length,
+remainingContentStart: remainingContent.slice(0, 60)
+});
+if (!remainingContent) {
+logger?.debug?.("[breakpoints] done: no remaining content");
+break;
+}
 currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
+logger?.debug?.("[breakpoints] nextIteration", {
+currentFromIdx,
+currentFromPageId: pageIds[currentFromIdx]
+});
 isFirstPiece = false;
 }
+logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
 return result;
 };
 /**
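
The tracing added throughout `processOversizedSegment` uses double optional chaining (`logger?.debug?.(...)`), so both the logger and its `debug` method are optional. A minimal sketch of a compatible logger; the shape is inferred from these call sites, not from a documented interface:

    // Hypothetical console-backed logger; any object exposing debug(message, data)
    // satisfies the call sites above, since each one guards with `logger?.debug?.()`.
    const logger = {
        debug: (message, data) => console.debug(message, data),
    };
    // Omitting `logger` (or its `debug` method) disables all of the new tracing.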
@@ -708,6 +739,14 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
 maxPages,
 segmentCount: segments.length
 });
+logger?.debug?.("[breakpoints] inputSegments", {
+segmentCount: segments.length,
+segments: segments.map((s) => ({
+contentLength: s.content.length,
+from: s.from,
+to: s.to
+}))
+});
 for (const segment of segments) {
 const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
 const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
@@ -830,39 +869,6 @@ const filterByConstraints = (matches, rule, getId) => {
 });
 };
 /**
-* Filters matches based on occurrence setting (first, last, or all).
-*
-* Applies occurrence-based selection to a list of matches:
-* - `'all'` or `undefined`: Return all matches (default)
-* - `'first'`: Return only the first match
-* - `'last'`: Return only the last match
-*
-* @param matches - Array of match results to filter
-* @param occurrence - Which occurrence(s) to keep
-* @returns Filtered array based on occurrence setting
-*
-* @example
-* const matches = [{ start: 0 }, { start: 10 }, { start: 20 }];
-*
-* filterByOccurrence(matches, 'first')
-* // → [{ start: 0 }]
-*
-* filterByOccurrence(matches, 'last')
-* // → [{ start: 20 }]
-*
-* filterByOccurrence(matches, 'all')
-* // → [{ start: 0 }, { start: 10 }, { start: 20 }]
-*
-* filterByOccurrence(matches, undefined)
-* // → [{ start: 0 }, { start: 10 }, { start: 20 }] (default: all)
-*/
-const filterByOccurrence = (matches, occurrence) => {
-if (!matches.length) return [];
-if (occurrence === "first") return [matches[0]];
-if (occurrence === "last") return [matches[matches.length - 1]];
-return matches;
-};
-/**
 * Checks if any rule in the list allows the given page ID.
 *
 * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
@@ -963,19 +969,13 @@ const anyRuleAllowsId = (rules, pageId) => {
 * // → '{{harf}}' (unchanged - no brackets outside tokens)
 */
 const escapeTemplateBrackets = (pattern) => {
-return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (
+return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
 if (token) return token;
 return `\\${bracket}`;
 });
 };
-
-
-*
-* These tokens contain raw regex patterns and do not reference other tokens.
-* For composite tokens that build on these, see `COMPOSITE_TOKENS`.
-*
-* @internal
-*/
+const RUMUZ_ATOM = `(?:خت|خغ|بخ|عخ|مق|مت|عس|سي|كن|مد|قد|خد|فد|دل|كد|غد|صد|تم|فق|دق|[خرزيمنصدفلتقع]|(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669]))`;
+const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
 const BASE_TOKENS = {
 bab: "باب",
 basmalah: ["بسم الله", "﷽"].join("|"),
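
The new `RUMUZ_ATOM` alternation ends with a guarded `٤`: ARABIC-INDIC DIGIT FOUR (U+0664, inside the `\u0660-\u0669` range) doubles as a narrator siglum, so lookarounds reject it when it sits inside a longer Arabic-Indic numeral. A quick check of that lookaround in isolation, assuming an engine with lookbehind support:

    // A lone ٤ is accepted as a siglum; a ٤ embedded in a numeral is not.
    const four = /(?<![\u0660-\u0669])٤(?![\u0660-\u0669])/u;
    console.log(four.test("٤"));   // true  — standalone
    console.log(four.test("٣٤٥")); // false — digit inside ٣٤٥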
@@ -983,7 +983,7 @@ const BASE_TOKENS = {
 dash: "[-–—ـ]",
 fasl: ["مسألة", "فصل"].join("|"),
 harf: "[أ-ي]",
-harfs: "[أ-ي](
+harfs: "[أ-ي](?:\\s+[أ-ي])*",
 kitab: "كتاب",
 naql: [
 "حدثني",
@@ -996,6 +996,7 @@ const BASE_TOKENS = {
 ].join("|"),
 raqm: "[\\u0660-\\u0669]",
 raqms: "[\\u0660-\\u0669]+",
+rumuz: RUMUZ_BLOCK,
 tarqim: "[.!?؟؛]"
 };
 /**
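
With `rumuz: RUMUZ_BLOCK` registered in `BASE_TOKENS`, templates can reference `{{rumuz}}` like any other token; `RUMUZ_BLOCK` matches one siglum or a whitespace-separated chain of them. A hypothetical rule using it — the `lineStartsWith`, `fuzzy`, and `split` fields appear elsewhere in this diff, but the concrete values here are illustrative:

    // Split wherever a line opens with a chain of sigla such as "خ م د".
    const rule = { lineStartsWith: ["{{rumuz}}"], fuzzy: true, split: "at" };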
@@ -1127,7 +1128,7 @@ const containsTokens = (query) => {
 * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
 * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
 */
-const expandTokensWithCaptures = (query, fuzzyTransform) => {
+const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
 const captureNames = [];
 const captureNameCounts = /* @__PURE__ */ new Map();
 /**
@@ -1169,16 +1170,18 @@ const expandTokensWithCaptures = (query, fuzzyTransform) => {
 const [, tokenName, captureName] = tokenMatch;
 if (!tokenName && captureName) {
 const uniqueName = getUniqueCaptureName(captureName);
-
-
+const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+captureNames.push(prefixedName);
+return `(?<${prefixedName}>.+)`;
 }
 let tokenPattern = TOKEN_PATTERNS[tokenName];
 if (!tokenPattern) return segment.value;
 if (fuzzyTransform) tokenPattern = tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/.test(part) ? fuzzyTransform(part) : part).join("|");
 if (captureName) {
 const uniqueName = getUniqueCaptureName(captureName);
-
-
+const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
+captureNames.push(prefixedName);
+return `(?<${prefixedName}>${tokenPattern})`;
 }
 return tokenPattern;
 });
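
The new `capturePrefix` parameter namespaces every named group a rule produces; the prefix is stripped again (`prefixedName.slice(prefix.length)`, later in this diff) when captures are reported, so template authors still see the names they wrote. A small standalone sketch of that round trip:

    // Prefix on the way in, strip on the way out — callers never see "r2_".
    const prefix = "r2_";
    const prefixedName = `${prefix}num`;
    const re = new RegExp(`(?<${prefixedName}>[\\u0660-\\u0669]+)`, "u");
    const groups = re.exec("باب ١٢٣").groups;
    console.log({ [prefixedName.slice(prefix.length)]: groups[prefixedName] }); // { num: "١٢٣" }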
@@ -1319,39 +1322,42 @@ const compileRuleRegex = (pattern) => {
 *
 * Brackets `()[]` outside `{{tokens}}` are auto-escaped.
 */
-const processPattern = (pattern, fuzzy) => {
-const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0);
+const processPattern = (pattern, fuzzy, capturePrefix) => {
+const { pattern: expanded, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(pattern), fuzzy ? makeDiacriticInsensitive : void 0, capturePrefix);
 return {
 captureNames,
 pattern: expanded
 };
 };
-const buildLineStartsAfterRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsAfterRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
+const captureNames = processed.flatMap((p) => p.captureNames);
+const contentCapture = capturePrefix ? `(?<${capturePrefix}content>.*)` : "(.*)";
+if (capturePrefix) captureNames.push(`${capturePrefix}content`);
 return {
-captureNames
-regex: `^(?:${union})
+captureNames,
+regex: `^(?:${union})${contentCapture}`
 };
 };
-const buildLineStartsWithRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineStartsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
 return {
 captureNames: processed.flatMap((p) => p.captureNames),
 regex: `^(?:${union})`
 };
 };
-const buildLineEndsWithRegexSource = (patterns, fuzzy) => {
-const processed = patterns.map((p) => processPattern(p, fuzzy));
+const buildLineEndsWithRegexSource = (patterns, fuzzy, capturePrefix) => {
+const processed = patterns.map((p) => processPattern(p, fuzzy, capturePrefix));
 const union = processed.map((p) => p.pattern).join("|");
 return {
 captureNames: processed.flatMap((p) => p.captureNames),
 regex: `(?:${union})$`
 };
 };
-const buildTemplateRegexSource = (template) => {
-const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template));
+const buildTemplateRegexSource = (template, capturePrefix) => {
+const { pattern, captureNames } = expandTokensWithCaptures(escapeTemplateBrackets(template), void 0, capturePrefix);
 return {
 captureNames,
 regex: pattern
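
For `lineStartsAfter`, the trailing rest-of-line capture also becomes a named group (`(?<…content>.*)`) when a prefix is supplied, which is what later lets the combined matcher compute `contentStartOffset`. The shape of the emitted regex, on an illustrative marker:

    // `^(?:<marker union>)(?<r0_content>.*)` — the marker is consumed and the
    // rest of the line is captured, so the split can drop the marker text.
    const re = /^(?:فصل\s*)(?<r0_content>.*)/u;
    console.log(re.exec("فصل في الطهارة").groups.r0_content); // "في الطهارة"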
@@ -1363,12 +1369,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
 *
 * Behavior mirrors the previous implementation in `segmenter.ts`.
 */
-const buildRuleRegex = (rule) => {
+const buildRuleRegex = (rule, capturePrefix) => {
 const s = { ...rule };
 const fuzzy = rule.fuzzy ?? false;
 let allCaptureNames = [];
 if (s.lineStartsAfter?.length) {
-const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy);
+const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
 allCaptureNames = captureNames;
 return {
 captureNames: allCaptureNames,
@@ -1378,17 +1384,17 @@ const buildRuleRegex = (rule) => {
 };
 }
 if (s.lineStartsWith?.length) {
-const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy);
+const { regex, captureNames } = buildLineStartsWithRegexSource(s.lineStartsWith, fuzzy, capturePrefix);
 s.regex = regex;
 allCaptureNames = captureNames;
 }
 if (s.lineEndsWith?.length) {
-const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy);
+const { regex, captureNames } = buildLineEndsWithRegexSource(s.lineEndsWith, fuzzy, capturePrefix);
 s.regex = regex;
 allCaptureNames = captureNames;
 }
 if (s.template) {
-const { regex, captureNames } = buildTemplateRegexSource(s.template);
+const { regex, captureNames } = buildTemplateRegexSource(s.template, capturePrefix);
 s.regex = regex;
 allCaptureNames = [...allCaptureNames, ...captureNames];
 }
@@ -1404,16 +1410,231 @@ const buildRuleRegex = (rule) => {
 };
 
 //#endregion
-//#region src/segmentation/
+//#region src/segmentation/fast-fuzzy-prefix.ts
+/**
+* Fast-path fuzzy prefix matching for common Arabic line-start markers.
+*
+* This exists to avoid running expensive fuzzy-expanded regex alternations over
+* a giant concatenated string. Instead, we match only at known line-start
+* offsets and perform a small deterministic comparison:
+* - Skip Arabic diacritics in the CONTENT
+* - Treat common equivalence groups as equal (ا/آ/أ/إ, ة/ه, ى/ي)
+*
+* This module is intentionally conservative: it only supports "literal"
+* token patterns (plain text alternation via `|`), not general regex.
+*/
+const isArabicDiacriticCode = (code) => code >= 1611 && code <= 1618;
+const equivKey = (ch) => {
+switch (ch) {
+case "آ":
+case "أ":
+case "إ": return "ا";
+case "ه": return "ة";
+case "ي": return "ى";
+default: return ch;
+}
+};
 /**
-*
+* Match a fuzzy literal prefix at a given offset.
+*
+* - Skips diacritics in the content
+* - Applies equivalence groups on both content and literal
 *
-* @
-* @returns Plain text content
+* @returns endOffset (exclusive) in CONTENT if matched; otherwise null.
 */
-const
-
+const matchFuzzyLiteralPrefixAt = (content, offset, literal) => {
+let i = offset;
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+for (let j = 0; j < literal.length; j++) {
+const litCh = literal[j];
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+if (i >= content.length) return null;
+const cCh = content[i];
+if (equivKey(cCh) !== equivKey(litCh)) return null;
+i++;
+}
+while (i < content.length && isArabicDiacriticCode(content.charCodeAt(i))) i++;
+return i;
+};
+const isLiteralOnly = (s) => {
+return !/[\\[\]{}()^$.*+?]/.test(s);
+};
+const compileLiteralAlternation = (pattern) => {
+if (!pattern) return null;
+if (!isLiteralOnly(pattern)) return null;
+const alternatives = pattern.split("|").map((s) => s.trim()).filter(Boolean);
+if (!alternatives.length) return null;
+return { alternatives };
+};
+/**
+* Attempt to compile a fast fuzzy rule from a single-token pattern like `{{kitab}}`.
+* Returns null if not eligible.
+*/
+const compileFastFuzzyTokenRule = (tokenTemplate) => {
+const m = tokenTemplate.match(/^\{\{(\w+)\}\}$/);
+if (!m) return null;
+const token = m[1];
+const tokenPattern = getTokenPattern(token);
+if (!tokenPattern) return null;
+const compiled = compileLiteralAlternation(tokenPattern);
+if (!compiled) return null;
+return {
+alternatives: compiled.alternatives,
+token
+};
+};
+/**
+* Try matching any alternative for a compiled token at a line-start offset.
+* Returns endOffset (exclusive) on match, else null.
+*/
+const matchFastFuzzyTokenAt = (content, offset, compiled) => {
+for (const alt of compiled.alternatives) {
+const end = matchFuzzyLiteralPrefixAt(content, offset, alt);
+if (end !== null) return end;
+}
+return null;
+};
+
+//#endregion
+//#region src/segmentation/segmenter-rule-utils.ts
+const partitionRulesForMatching = (rules) => {
+const combinableRules = [];
+const standaloneRules = [];
+const fastFuzzyRules = [];
+rules.forEach((rule, index) => {
+if (rule.fuzzy && "lineStartsWith" in rule) {
+const compiled = rule.lineStartsWith.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsWith[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsWith",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+if (rule.fuzzy && "lineStartsAfter" in rule) {
+const compiled = rule.lineStartsAfter.length === 1 ? compileFastFuzzyTokenRule(rule.lineStartsAfter[0]) : null;
+if (compiled) {
+fastFuzzyRules.push({
+compiled,
+kind: "startsAfter",
+rule,
+ruleIndex: index
+});
+return;
+}
+}
+let isCombinable = true;
+if ("regex" in rule && rule.regex) {
+const hasNamedCaptures = extractNamedCaptureNames(rule.regex).length > 0;
+const hasBackreferences = /\\[1-9]/.test(rule.regex);
+const hasAnonymousCaptures = hasCapturingGroup(rule.regex);
+if (hasNamedCaptures || hasBackreferences || hasAnonymousCaptures) isCombinable = false;
+}
+if (isCombinable) combinableRules.push({
+index,
+prefix: `r${index}_`,
+rule
+});
+else standaloneRules.push(rule);
+});
+return {
+combinableRules,
+fastFuzzyRules,
+standaloneRules
+};
+};
+const createPageStartGuardChecker = (matchContent, pageMap) => {
+const pageStartToBoundaryIndex = /* @__PURE__ */ new Map();
+for (let i = 0; i < pageMap.boundaries.length; i++) pageStartToBoundaryIndex.set(pageMap.boundaries[i].start, i);
+const compiledPageStartPrev = /* @__PURE__ */ new Map();
+const getPageStartPrevRegex = (rule, ruleIndex) => {
+if (compiledPageStartPrev.has(ruleIndex)) return compiledPageStartPrev.get(ruleIndex) ?? null;
+const pattern = rule.pageStartGuard;
+if (!pattern) {
+compiledPageStartPrev.set(ruleIndex, null);
+return null;
+}
+const expanded = processPattern(pattern, false).pattern;
+const re = new RegExp(`(?:${expanded})$`, "u");
+compiledPageStartPrev.set(ruleIndex, re);
+return re;
+};
+const getPrevPageLastNonWsChar = (boundaryIndex) => {
+if (boundaryIndex <= 0) return "";
+const prevBoundary = pageMap.boundaries[boundaryIndex - 1];
+for (let i = prevBoundary.end - 1; i >= prevBoundary.start; i--) {
+const ch = matchContent[i];
+if (!ch) continue;
+if (/\s/u.test(ch)) continue;
+return ch;
+}
+return "";
+};
+return (rule, ruleIndex, matchStart) => {
+const boundaryIndex = pageStartToBoundaryIndex.get(matchStart);
+if (boundaryIndex === void 0 || boundaryIndex === 0) return true;
+const prevReq = getPageStartPrevRegex(rule, ruleIndex);
+if (!prevReq) return true;
+const lastChar = getPrevPageLastNonWsChar(boundaryIndex);
+if (!lastChar) return false;
+return prevReq.test(lastChar);
+};
+};
+const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, passesPageStartGuard) => {
+const splitPointsByRule = /* @__PURE__ */ new Map();
+if (fastFuzzyRules.length === 0 || pageMap.boundaries.length === 0) return splitPointsByRule;
+let boundaryIdx = 0;
+let currentBoundary = pageMap.boundaries[boundaryIdx];
+const advanceBoundaryTo = (offset) => {
+while (currentBoundary && offset > currentBoundary.end && boundaryIdx < pageMap.boundaries.length - 1) {
+boundaryIdx++;
+currentBoundary = pageMap.boundaries[boundaryIdx];
+}
+};
+const recordSplitPoint = (ruleIndex, sp) => {
+const arr = splitPointsByRule.get(ruleIndex);
+if (!arr) {
+splitPointsByRule.set(ruleIndex, [sp]);
+return;
+}
+arr.push(sp);
+};
+const isPageStart = (offset) => offset === currentBoundary?.start;
+for (let lineStart = 0; lineStart <= matchContent.length;) {
+advanceBoundaryTo(lineStart);
+const pageId = currentBoundary?.id ?? 0;
+if (lineStart >= matchContent.length) break;
+for (const { compiled, kind, rule, ruleIndex } of fastFuzzyRules) {
+if (!((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude))) continue;
+if (isPageStart(lineStart) && !passesPageStartGuard(rule, ruleIndex, lineStart)) continue;
+const end = matchFastFuzzyTokenAt(matchContent, lineStart, compiled);
+if (end === null) continue;
+const splitIndex = (rule.split ?? "at") === "at" ? lineStart : end;
+if (kind === "startsWith") recordSplitPoint(ruleIndex, {
+index: splitIndex,
+meta: rule.meta
+});
+else {
+const markerLength = end - lineStart;
+recordSplitPoint(ruleIndex, {
+contentStartOffset: (rule.split ?? "at") === "at" ? markerLength : void 0,
+index: splitIndex,
+meta: rule.meta
+});
+}
+}
+const nextNl = matchContent.indexOf("\n", lineStart);
+if (nextNl === -1) break;
+lineStart = nextNl + 1;
+}
+return splitPointsByRule;
 };
+
+//#endregion
+//#region src/segmentation/textUtils.ts
 /**
 * Normalizes line endings to Unix-style (`\n`).
 *
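
The equivalence logic in `fast-fuzzy-prefix.ts` walks character by character, but its effect can be shown with plain string folding: diacritics (char codes 1611–1618, i.e. U+064B–U+0652) are skipped, and the hamza, taa-marbuta, and alif-maqsura groups collapse to one key. A rough standalone rendering of the same idea, for intuition only — the shipped code avoids allocating new strings:

    // Literal "كتاب" matches vocalized "كِتَاب" with no regex involved.
    const strip = (s) => s.replace(/[\u064B-\u0652]/gu, "");       // skip diacritics
    const fold = (s) => s.replace(/[آأإ]/gu, "ا").replace(/ه/gu, "ة").replace(/ي/gu, "ى");
    console.log(fold(strip("كِتَاب")) === fold("كتاب")); // true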
@@ -1423,7 +1644,9 @@ const stripHtmlTags = (html) => {
 * @param content - Raw content with potentially mixed line endings
 * @returns Content with all line endings normalized to `\n`
 */
-const normalizeLineEndings = (content) =>
+const normalizeLineEndings = (content) => {
+return content.includes("\r") ? content.replace(/\r\n?/g, "\n") : content;
+};
 
 //#endregion
 //#region src/segmentation/segmenter.ts
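
The rewritten `normalizeLineEndings` short-circuits on the common already-normalized case before paying for the regex; `/\r\n?/g` folds both CRLF and bare CR:

    console.log(JSON.stringify("a\r\nb\rc".replace(/\r\n?/g, "\n"))); // "a\nb\nc"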
@@ -1544,9 +1767,63 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 return [initialSeg];
 };
 const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
-const
+const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
+const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
+const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
+if (combinableRules.length > 0) {
+const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
+const built = buildRuleRegex(rule, prefix);
+return {
+prefix,
+source: `(?<${prefix}>${built.regex.source})`,
+...built
+};
+});
+const combinedSource = ruleRegexes.map((r) => r.source).join("|");
+const combinedRegex = new RegExp(combinedSource, "gm");
+combinedRegex.lastIndex = 0;
+let m = combinedRegex.exec(matchContent);
+while (m !== null) {
+const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
+if (matchedRuleIndex !== -1) {
+const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
+const ruleInfo = ruleRegexes[matchedRuleIndex];
+const namedCaptures = {};
+if (m.groups) {
+for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
+const cleanName = prefixedName.slice(prefix.length);
+namedCaptures[cleanName] = m.groups[prefixedName];
+}
+}
+let capturedContent;
+let contentStartOffset;
+if (ruleInfo.usesLineStartsAfter) {
+capturedContent = m.groups?.[`${prefix}content`];
+if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
+}
+const start = m.index;
+const end = m.index + m[0].length;
+const pageId = pageMap.getId(start);
+if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
+if (!passesPageStartGuard(rule, originalIndex, start)) continue;
+const sp = {
+capturedContent: void 0,
+contentStartOffset,
+index: (rule.split ?? "at") === "at" ? start : end,
+meta: rule.meta,
+namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
+};
+if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
+splitPointsByRule.get(originalIndex).push(sp);
+}
+}
+if (m[0].length === 0) combinedRegex.lastIndex++;
+m = combinedRegex.exec(matchContent);
+}
+}
+const collectSplitPointsFromRule = (rule, ruleIndex) => {
 const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-
+const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
 const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
 const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
 return {
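
The combined pass wraps each combinable rule's regex in a group named after its prefix (`(?<r0_>…)|(?<r1_>…)`) and joins them with `|`; whichever prefixed group is defined on a match identifies the rule. A self-contained sketch of that dispatch trick, with illustrative markers:

    // Two "rules" OR-ed into one scan; the defined group names the winner.
    const combined = /(?<r0_>باب (?<r0_title>.+))|(?<r1_>فصل (?<r1_title>.+))/u;
    const m = combined.exec("فصل في الصلاة");
    console.log(m.groups.r1_ !== undefined); // true → rule 1 matched
    console.log(m.groups.r1_title);          // "في الصلاة"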
@@ -1557,8 +1834,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
 namedCaptures: m.namedCaptures
 };
 });
+if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
+splitPointsByRule.get(ruleIndex).push(...points);
 };
-
+standaloneRules.forEach((rule) => {
+collectSplitPointsFromRule(rule, rules.indexOf(rule));
+});
+const finalSplitPoints = [];
+rules.forEach((rule, index) => {
+const points = splitPointsByRule.get(index);
+if (!points || points.length === 0) return;
+let filtered = points;
+if (rule.occurrence === "first") filtered = [points[0]];
+else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
+finalSplitPoints.push(...filtered);
+});
+return finalSplitPoints;
 };
 /**
 * Executes a regex against content and extracts match results with capture information.
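
Occurrence filtering — previously the standalone `filterByOccurrence` helper removed earlier in this diff — is now applied per rule only after split points from the fast-fuzzy, combined, and standalone passes have all been merged, so `first`/`last` see the complete set. The selection itself is unchanged in spirit:

    // Same semantics as the retired helper, applied to the merged points.
    const pick = (points, occurrence) =>
        occurrence === "first" ? [points[0]]
        : occurrence === "last" ? [points[points.length - 1]]
        : points;
    console.log(pick([{ index: 0 }, { index: 10 }], "last")); // [ { index: 10 } ]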
@@ -1686,12 +1977,11 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
 * });
 */
 const segmentPages = (pages, options) => {
-const { rules = [], maxPages, breakpoints, prefer = "longer", pageJoiner = "space", logger } = options;
-if (!pages.length) return [];
+const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
 const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(pages);
 let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
 segments = ensureFallbackSegment(segments, pages, normalizedContent, pageJoiner);
-if (maxPages
+if (maxPages >= 0 && breakpoints.length) {
 const patternProcessor = (p) => processPattern(p, false).pattern;
 return applyBreakpoints(segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
 }
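
`segmentPages` now defaults `maxPages` to 0 and `breakpoints` to an empty array, entering the breakpoint pass whenever `maxPages >= 0 && breakpoints.length` (the early `if (!pages.length) return []` guard is gone). An illustrative call — the element shape of `breakpoints` is not visible in this hunk; string patterns are assumed here because each one is fed through `processPattern`:

    // Values are illustrative; option names come from the destructuring above.
    const segments = segmentPages(pages, {
        rules: [{ lineStartsWith: ["{{bab}}"] }],
        breakpoints: ["{{tarqim}}"], // assumption: pattern strings
        maxPages: 2,
    });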
@@ -1766,7 +2056,225 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 
 //#endregion
-//#region src/
+//#region src/analysis.ts
+const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "");
+const computeSpecificity = (pattern) => {
+const tokenCount = countTokenMarkers(pattern);
+return {
+literalLen: stripWhitespacePlaceholders(pattern).length,
+tokenCount
+};
+};
+const DEFAULT_OPTIONS = {
+includeFirstWordFallback: true,
+lineFilter: void 0,
+maxExamples: 1,
+minCount: 3,
+minLineLength: 6,
+normalizeArabicDiacritics: true,
+prefixChars: 60,
+prefixMatchers: [/^#+/u],
+sortBy: "specificity",
+topK: 40
+};
+const escapeRegexLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+const TOKEN_PRIORITY_ORDER$1 = [
+"basmalah",
+"kitab",
+"bab",
+"fasl",
+"naql",
+"rumuz",
+"numbered",
+"raqms",
+"raqm",
+"dash",
+"bullet",
+"tarqim"
+];
+const buildTokenPriority = () => {
+const allTokens = new Set(getAvailableTokens());
+return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
+};
+const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+const compileTokenRegexes = (tokenNames) => {
+const compiled = [];
+for (const token of tokenNames) {
+const pat = TOKEN_PATTERNS[token];
+if (!pat) continue;
+try {
+compiled.push({
+re: new RegExp(pat, "uy"),
+token
+});
+} catch {}
+}
+return compiled;
+};
+const appendWs = (out) => out && !out.endsWith("\\s*") ? `${out}\\s*` : out;
+const consumeLeadingPrefixes = (s, pos, out, prefixMatchers) => {
+let matchedAny = false;
+let currentPos = pos;
+let currentOut = out;
+for (const re of prefixMatchers) {
+if (currentPos >= s.length) break;
+const m = re.exec(s.slice(currentPos));
+if (!m || m.index !== 0 || !m[0]) continue;
+currentOut += escapeRegexLiteral(m[0]);
+currentPos += m[0].length;
+matchedAny = true;
+const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
+if (wsAfter) {
+currentPos += wsAfter[0].length;
+currentOut = appendWs(currentOut);
+}
+}
+return {
+matchedAny,
+out: currentOut,
+pos: currentPos
+};
+};
+const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+let best = null;
+for (const { token, re } of compiled) {
+re.lastIndex = pos;
+const m = re.exec(s);
+if (!m || m.index !== pos) continue;
+if (!best || m[0].length > best.text.length) best = {
+text: m[0],
+token
+};
+}
+if (best?.token === "rumuz") {
+const end = pos + best.text.length;
+const next = end < s.length ? s[end] : "";
+if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+}
+return best;
+};
+const tokenizeLineStart = (line, tokenNames, prefixChars, includeFirstWordFallback, normalizeArabicDiacritics, prefixMatchers) => {
+const trimmed = collapseWhitespace(line);
+if (!trimmed) return null;
+const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
+let pos = 0;
+let out = "";
+let matchedAny = false;
+let matchedToken = false;
+const compiled = compileTokenRegexes(tokenNames);
+const isArabicLetter = (ch) => /[\u0600-\u06FF]/u.test(ch);
+const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+{
+const consumed = consumeLeadingPrefixes(s, pos, out, prefixMatchers);
+pos = consumed.pos;
+out = consumed.out;
+matchedAny = consumed.matchedAny;
+}
+for (let steps = 0; steps < 6 && pos < s.length; steps++) {
+const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
+if (wsMatch) {
+pos += wsMatch[0].length;
+out = appendWs(out);
+continue;
+}
+const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+if (best) {
+if (out && !out.endsWith("\\s*")) {}
+out += `{{${best.token}}}`;
+matchedAny = true;
+matchedToken = true;
+pos += best.text.length;
+continue;
+}
+if (matchedAny) {
+const ch = s[pos];
+if (ch && isCommonDelimiter(ch)) {
+out += escapeRegexLiteral(ch);
+pos += 1;
+continue;
+}
+}
+if (matchedAny) {
+if (includeFirstWordFallback && !matchedToken) {
+const firstWord$1 = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord$1) break;
+out += escapeRegexLiteral(firstWord$1);
+}
+break;
+}
+if (!includeFirstWordFallback) return null;
+const firstWord = (s.slice(pos).match(/^[^\s:،؛.?!؟]+/u) ?? [])[0];
+if (!firstWord) return null;
+out += escapeRegexLiteral(firstWord);
+return out;
+}
+if (!matchedAny) return null;
+while (out.endsWith("\\s*")) out = out.slice(0, -3);
+return out;
+};
+/**
+* Analyze pages and return the most common line-start patterns (top K).
+*
+* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
+* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
+*/
+const analyzeCommonLineStarts = (pages, options = {}) => {
+const o = {
+...DEFAULT_OPTIONS,
+...options,
+lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
+prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers
+};
+const tokenPriority = buildTokenPriority();
+const counts = /* @__PURE__ */ new Map();
+for (const page of pages) {
+const lines = normalizeLineEndings(page.content ?? "").split("\n");
+for (const line of lines) {
+const trimmed = collapseWhitespace(line);
+if (trimmed.length < o.minLineLength) continue;
+if (o.lineFilter && !o.lineFilter(trimmed, page.id)) continue;
+const sig = tokenizeLineStart(trimmed, tokenPriority, o.prefixChars, o.includeFirstWordFallback, o.normalizeArabicDiacritics, o.prefixMatchers);
+if (!sig) continue;
+const existing = counts.get(sig);
+if (!existing) counts.set(sig, {
+count: 1,
+examples: [{
+line: trimmed,
+pageId: page.id
+}]
+});
+else {
+existing.count++;
+if (existing.examples.length < o.maxExamples) existing.examples.push({
+line: trimmed,
+pageId: page.id
+});
+}
+}
+}
+const compareSpecificityThenCount = (a, b) => {
+const sa = computeSpecificity(a.pattern);
+const sb = computeSpecificity(b.pattern);
+if (sb.tokenCount !== sa.tokenCount) return sb.tokenCount - sa.tokenCount;
+if (sb.literalLen !== sa.literalLen) return sb.literalLen - sa.literalLen;
+if (b.count !== a.count) return b.count - a.count;
+return a.pattern.localeCompare(b.pattern);
+};
+const compareCountThenSpecificity = (a, b) => {
+if (b.count !== a.count) return b.count - a.count;
+return compareSpecificityThenCount(a, b);
+};
+return [...counts.entries()].map(([pattern, v]) => ({
+count: v.count,
+examples: v.examples,
+pattern
+})).filter((p) => p.count >= o.minCount).sort(o.sortBy === "count" ? compareCountThenSpecificity : compareSpecificityThenCount).slice(0, o.topK);
+};
+
+//#endregion
+//#region src/detection.ts
 /**
 * Pattern detection utilities for recognizing template tokens in Arabic text.
 * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
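
`analyzeCommonLineStarts` surveys a book for recurring line-start shapes and returns them as ready-to-edit token templates. A usage sketch; the option names and result shape are read off `DEFAULT_OPTIONS` and the return statement above, while the concrete output is invented for illustration:

    const suggestions = analyzeCommonLineStarts(pages, { minCount: 5, sortBy: "count", topK: 10 });
    // e.g. [{ pattern: "{{kitab}}\\s*{{raqms}}", count: 42, examples: [{ line: "...", pageId: 3 }] }, ...]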
@@ -1785,6 +2293,7 @@ const TOKEN_PRIORITY_ORDER = [
 "bab",
 "fasl",
 "naql",
+"rumuz",
 "numbered",
 "raqms",
 "raqm",
@@ -1921,5 +2430,5 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive,
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.mjs.map
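
Net effect on the public surface: `analyzeCommonLineStarts` joins the export list (the pre-change list is truncated by the diff viewer, so only the addition is certain):

    import { analyzeCommonLineStarts, segmentPages } from "flappa-doormal";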