flappa-doormal 2.9.0 → 2.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +86 -9
- package/README.md +73 -1
- package/dist/index.d.mts +138 -1
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +281 -16
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
@@ -155,6 +155,154 @@ const makeDiacriticInsensitive = (text) => {
   return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
 };
 
+//#endregion
+//#region src/segmentation/types.ts
+/**
+ * Pattern type key names for split rules.
+ *
+ * Use this array to dynamically iterate over pattern types in UIs,
+ * or use the `PatternTypeKey` type for type-safe string unions.
+ *
+ * @example
+ * // Build a dropdown/select in UI
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
+ *
+ * @example
+ * // Type-safe pattern key validation
+ * const validateKey = (k: string): k is PatternTypeKey =>
+ *   (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
+ */
+const PATTERN_TYPE_KEYS = [
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith",
+  "template",
+  "regex"
+];
+
+//#endregion
+//#region src/segmentation/optimize-rules.ts
+/**
+ * Rule optimization utilities for merging and sorting split rules.
+ *
+ * Provides `optimizeRules()` to:
+ * 1. Merge compatible rules with same pattern type and options
+ * 2. Deduplicate patterns within each rule
+ * 3. Sort rules by specificity (longer patterns first)
+ *
+ * @module optimize-rules
+ */
+const MERGEABLE_KEYS = new Set([
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith"
+]);
+/**
+ * Get the pattern type key for a rule.
+ */
+const getPatternKey = (rule) => {
+  for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
+  return "regex";
+};
+/**
+ * Get the pattern array for a mergeable rule.
+ */
+const getPatternArray = (rule, key) => {
+  const value = rule[key];
+  return Array.isArray(value) ? value : [];
+};
+/**
+ * Get a string representation of the pattern value (for specificity scoring).
+ */
+const getPatternString = (rule, key) => {
+  const value = rule[key];
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) return value.join("\n");
+  return "";
+};
+/**
+ * Deduplicate and sort patterns by length (longest first).
+ */
+const normalizePatterns = (patterns) => {
+  return [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
+};
+/**
+ * Calculate specificity score for a rule (higher = more specific).
+ * Based on the longest pattern length.
+ */
+const getSpecificityScore = (rule) => {
+  const key = getPatternKey(rule);
+  if (MERGEABLE_KEYS.has(key)) return getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0);
+  return getPatternString(rule, key).length;
+};
+/**
+ * Create a merge key for a rule based on pattern type and all non-pattern properties.
+ * Rules with the same merge key can have their patterns combined.
+ */
+const createMergeKey = (rule) => {
+  const patternKey = getPatternKey(rule);
+  const { [patternKey]: _pattern, ...rest } = rule;
+  return `${patternKey}|${JSON.stringify(rest)}`;
+};
+/**
+ * Optimize split rules by merging compatible rules and sorting by specificity.
+ *
+ * This function:
+ * 1. **Merges compatible rules**: Rules with the same pattern type and identical
+ *    options (meta, fuzzy, min/max, etc.) have their pattern arrays combined
+ * 2. **Deduplicates patterns**: Removes duplicate patterns within each rule
+ * 3. **Sorts by specificity**: Rules with longer patterns come first
+ *
+ * Only array-based pattern types (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`)
+ * can be merged. `template` and `regex` rules are kept separate.
+ *
+ * @param rules - Array of split rules to optimize
+ * @returns Optimized rules and count of merged rules
+ *
+ * @example
+ * import { optimizeRules } from 'flappa-doormal';
+ *
+ * const { rules, mergedCount } = optimizeRules([
+ *   { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
+ * ]);
+ *
+ * // rules[0] = { lineStartsWith: ['{{kitab}}', '{{bab}}'], fuzzy: true, meta: { type: 'header' } }
+ * // rules[1] = { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } }
+ * // mergedCount = 1
+ */
+const optimizeRules = (rules) => {
+  const output = [];
+  const indexByMergeKey = /* @__PURE__ */ new Map();
+  let mergedCount = 0;
+  for (const rule of rules) {
+    const patternKey = getPatternKey(rule);
+    if (!MERGEABLE_KEYS.has(patternKey)) {
+      output.push(rule);
+      continue;
+    }
+    const mergeKey = createMergeKey(rule);
+    const existingIndex = indexByMergeKey.get(mergeKey);
+    if (existingIndex === void 0) {
+      indexByMergeKey.set(mergeKey, output.length);
+      output.push({
+        ...rule,
+        [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
+      });
+      continue;
+    }
+    const existing = output[existingIndex];
+    existing[patternKey] = normalizePatterns([...getPatternArray(existing, patternKey), ...getPatternArray(rule, patternKey)]);
+    mergedCount++;
+  }
+  output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
+  return {
+    mergedCount,
+    rules: output
+  };
+};
+
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
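Taken together, the new `types.ts` and `optimize-rules.ts` regions give callers a preprocessing pass over rule lists before segmentation. A minimal usage sketch based on the JSDoc above; the rule shapes are illustrative, and the `PatternTypeKey` union is assumed to be exported from the accompanying `.d.mts`:

```ts
import { optimizeRules, PATTERN_TYPE_KEYS } from 'flappa-doormal';

// Two rules share a pattern type and identical options, so they should merge.
const { rules, mergedCount } = optimizeRules([
  { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
  { regex: '^---$' }, // non-mergeable pattern type, passed through as-is
]);
console.log(mergedCount);  // 1
console.log(rules.length); // 2

// PATTERN_TYPE_KEYS supports dynamic UIs and runtime key checks.
const isPatternKey = (k: string): boolean =>
  (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
console.log(isPatternKey('lineEndsWith')); // true
```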
@@ -626,6 +774,51 @@ const shouldDefaultToFuzzy = (patterns) => {
     return FUZZY_TOKEN_REGEX.test(p);
   });
 };
+/**
+ * Apply token mappings to a template string.
+ *
+ * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
+ * Useful for applying user-configured capture names to a raw template.
+ *
+ * - Only affects exact matches of `{{token}}`.
+ * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
+ * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
+ *
+ * @param template - The template string to transform
+ * @param mappings - Array of mappings from token name to capture name
+ * @returns Transformed template string with captures applied
+ *
+ * @example
+ * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
+ * // → '{{raqms:num}} {{dash}}'
+ */
+const applyTokenMappings = (template, mappings) => {
+  let result = template;
+  for (const { token, name } of mappings) {
+    if (!token || !name) continue;
+    const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
+    result = result.replace(regex, `{{${token}:${name}}}`);
+  }
+  return result;
+};
+/**
+ * Strip token mappings from a template string.
+ *
+ * Transforms `{{token:name}}` back into `{{token}}`.
+ * Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
+ *
+ * Useful for normalizing templates for storage or comparison.
+ *
+ * @param template - The template string to strip
+ * @returns Template string with capture names removed
+ *
+ * @example
+ * stripTokenMappings('{{raqms:num}} {{dash}}')
+ * // → '{{raqms}} {{dash}}'
+ */
+const stripTokenMappings = (template) => {
+  return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
+};
 
 //#endregion
 //#region src/segmentation/pattern-validator.ts
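The two helpers round-trip the simple `{{token}}` ↔ `{{token:name}}` cases. A short sketch following the JSDoc examples; note that `stripTokenMappings` removes every capture name, including ones `applyTokenMappings` did not add:

```ts
import { applyTokenMappings, stripTokenMappings } from 'flappa-doormal';

const raw = '{{raqms}} {{dash}} {{kitab:title}}';

// Name the bare {{raqms}} capture; the already-named {{kitab:title}} is left alone.
const named = applyTokenMappings(raw, [{ token: 'raqms', name: 'num' }]);
console.log(named); // '{{raqms:num}} {{dash}} {{kitab:title}}'

// Normalize for storage/comparison by dropping all capture names.
console.log(stripTokenMappings(named)); // '{{raqms}} {{dash}} {{kitab}}'
```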
@@ -651,6 +844,7 @@ const validatePattern = (pattern, seenPatterns) => {
   };
   if (seenPatterns.has(pattern)) return {
     message: `Duplicate pattern: "${pattern}"`,
+    pattern,
     type: "duplicate"
   };
   seenPatterns.add(pattern);
@@ -660,6 +854,7 @@ const validatePattern = (pattern, seenPatterns) => {
   if (!KNOWN_TOKENS.has(tokenName)) return {
     message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
     suggestion: `Check spelling or use a known token`,
+    token: tokenName,
     type: "unknown_token"
   };
 }
@@ -674,6 +869,7 @@ const validatePattern = (pattern, seenPatterns) => {
   if (before !== "{{" && after !== "}}") return {
     message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
     suggestion: `{{${fullMatch}}}`,
+    token: tokenName,
     type: "missing_braces"
   };
 }
@@ -742,6 +938,39 @@ const validateRules = (rules) => {
     return hasIssues ? result : void 0;
   });
 };
+/**
+ * Formats a validation result array into a list of human-readable error messages.
+ *
+ * Useful for displaying validation errors in UIs.
+ *
+ * @param results - The result array from `validateRules()`
+ * @returns Array of formatted error strings
+ *
+ * @example
+ * const issues = validateRules(rules);
+ * const errors = formatValidationReport(issues);
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
+ */
+const formatValidationReport = (results) => {
+  const errors = [];
+  results.forEach((result, ruleIndex) => {
+    if (!result) return;
+    const formatIssue = (issue, location) => {
+      if (!issue) return;
+      const type = issue.type;
+      if (type === "missing_braces" && issue.token) errors.push(`${location}: Missing {{}} around token "${issue.token}"`);
+      else if (type === "unknown_token" && issue.token) errors.push(`${location}: Unknown token "{{${issue.token}}}"`);
+      else if (type === "duplicate" && issue.pattern) errors.push(`${location}: Duplicate pattern "${issue.pattern}"`);
+      else if (issue.message) errors.push(`${location}: ${issue.message}`);
+      else errors.push(`${location}: ${type}`);
+    };
+    for (const [patternType, issues] of Object.entries(result)) {
+      const list = Array.isArray(issues) ? issues : [issues];
+      for (const issue of list) if (issue) formatIssue(issue, `Rule ${ruleIndex + 1}, ${patternType}`);
+    }
+  });
+  return errors;
+};
 
 //#endregion
 //#region src/segmentation/replace.ts
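With the `pattern` and `token` fields now attached to validation issues (the three small hunks above), `formatValidationReport` can emit issue-specific messages. A sketch of the intended flow; the exact message text and token names are illustrative:

```ts
import { validateRules, formatValidationReport } from 'flappa-doormal';

const results = validateRules([
  { lineStartsWith: ['{{kitab}}', '{{kitab}}'] }, // duplicate pattern
  { lineStartsWith: ['{{nope}}'] },               // unknown token
]);
const errors = formatValidationReport(results);
// e.g. ['Rule 1, lineStartsWith: Duplicate pattern "{{kitab}}"',
//       'Rule 2, lineStartsWith: Unknown token "{{nope}}"']
for (const e of errors) console.warn(e);
```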
@@ -1049,7 +1278,7 @@ const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, page
  * This is used to define breakpoint windows in terms of actual content being split, rather than
  * raw per-page offsets which can desync when structural rules strip markers.
  */
-const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
+const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages, logger) => {
   const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
   if (!targetPageData) return -1;
   const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
@@ -1059,13 +1288,45 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
   for (const len of WINDOW_PREFIX_LENGTHS) {
     const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
     if (!prefix) continue;
+    const candidates = [];
     let pos = remainingContent.indexOf(prefix, searchStart);
     while (pos !== -1 && pos <= searchEnd) {
-      if (pos > 0
+      if (pos > 0) {
+        const charBefore = remainingContent[pos - 1];
+        if (charBefore === "\n") candidates.push({
+          isNewline: true,
+          pos
+        });
+        else if (/\s/.test(charBefore)) candidates.push({
+          isNewline: false,
+          pos
+        });
+      }
       pos = remainingContent.indexOf(prefix, pos + 1);
     }
-
-
+    if (candidates.length > 0) {
+      const newlineCandidates = candidates.filter((c) => c.isNewline);
+      const pool = newlineCandidates.length > 0 ? newlineCandidates : candidates;
+      let bestCandidate = pool[0];
+      let bestDistance = Math.abs(pool[0].pos - expectedBoundary);
+      for (let i = 1; i < pool.length; i++) {
+        const dist = Math.abs(pool[i].pos - expectedBoundary);
+        if (dist < bestDistance) {
+          bestDistance = dist;
+          bestCandidate = pool[i];
+        }
+      }
+      const MAX_DEVIATION = 2e3;
+      if (bestDistance <= MAX_DEVIATION) return bestCandidate.pos;
+      logger?.debug?.("[breakpoints] findPageStartNearExpectedBoundary: Rejected match exceeding deviation", {
+        targetPageIdx,
+        expectedBoundary,
+        bestDistance,
+        maxDeviation: MAX_DEVIATION,
+        matchPos: bestCandidate.pos,
+        prefixLength: len
+      });
+    }
   }
   return -1;
 };
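The rewritten matcher no longer returns the first newline-anchored hit. It now collects all whitespace-preceded hits, prefers newline-anchored ones, picks the hit closest to the estimated boundary, and rejects anything more than 2,000 characters off. The function itself is internal (not exported), so here is a distilled standalone sketch of that selection policy with illustrative names:

```ts
// Distilled sketch of the candidate-selection policy above; not the library's API.
type Candidate = { pos: number; isNewline: boolean };

const pickBoundary = (candidates: Candidate[], expected: number, maxDeviation = 2000): number => {
  if (candidates.length === 0) return -1;
  // Prefer matches that start right after a newline; otherwise accept any
  // whitespace-preceded match.
  const newlinePool = candidates.filter((c) => c.isNewline);
  const pool = newlinePool.length > 0 ? newlinePool : candidates;
  // Among the pool, take the match closest to the expected boundary offset.
  const best = pool.reduce((a, b) =>
    Math.abs(b.pos - expected) < Math.abs(a.pos - expected) ? b : a);
  // Reject matches that drift too far from the estimate.
  return Math.abs(best.pos - expected) <= maxDeviation ? best.pos : -1;
};

console.log(pickBoundary(
  [{ pos: 180, isNewline: false }, { pos: 240, isNewline: true }],
  200,
)); // 240 (the newline-anchored hit wins even though 180 is closer)
```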
@@ -1085,6 +1346,7 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  * @param pageIds - Array of all page IDs
  * @param normalizedPages - Map of page ID to normalized content
  * @param cumulativeOffsets - Cumulative character offsets (for estimates)
+ * @param logger - Optional logger for debugging
  * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
  *   with a sentinel boundary at segmentContent.length as the last element
  *
@@ -1093,12 +1355,12 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
  * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
  */
-const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
   const boundaryPositions = [0];
   const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
   for (let i = fromIdx + 1; i <= toIdx; i++) {
     const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
-    const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
+    const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages, logger);
     const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
     if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
     else {
@@ -1142,18 +1404,20 @@ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
  * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
  * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
  */
-const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger) => {
   if (windowEndIdx >= toIdx) return remainingContent.length;
   const desiredNextIdx = windowEndIdx + 1;
   const minNextIdx = currentFromIdx + 1;
   const maxNextIdx = Math.min(desiredNextIdx, toIdx);
   const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
+  let bestExpectedBoundary = remainingContent.length;
   for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
     const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
-    const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
+    if (nextIdx === maxNextIdx) bestExpectedBoundary = expectedBoundary;
+    const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages, logger);
     if (pos > 0) return pos;
   }
-  return remainingContent.length;
+  return Math.min(bestExpectedBoundary, remainingContent.length);
 };
 /**
  * Finds exclusion-based break position using raw cumulative offsets.
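The `bestExpectedBoundary` change tightens the no-match fallback: previously the function gave up and returned the full remaining length, while 2.10.x falls back to the boundary estimated from cumulative page offsets for the widest window. A tiny illustration with made-up numbers:

```ts
// Illustrative values only.
const remainingLength = 8000;      // remainingContent.length
const bestExpectedBoundary = 5200; // estimate for the widest window (nextIdx === maxNextIdx)

// 2.9.x: no prefix match anywhere → window end = 8000 (the whole remainder)
// 2.10.x: no prefix match anywhere → window end = Math.min(5200, 8000) = 5200
console.log(Math.min(bestExpectedBoundary, remainingLength)); // 5200
```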
@@ -1231,7 +1495,8 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
     const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
     if (nextPageData) {
       const pos = findNextPagePosition(remainingContent, nextPageData);
-      if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
+      const tolerance = Math.max(2e3, windowEndPosition * .5);
+      if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) return Math.min(pos, windowEndPosition, remainingContent.length);
     }
   }
   return Math.min(windowEndPosition, remainingContent.length);
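`handlePageBoundaryBreak` now only trusts the located next-page position when it lies within a tolerance of the computed window end: half the window size, but at least 2,000 characters. A quick worked example of the guard:

```ts
// Illustrative values only.
const windowEndPosition = 10000;
const tolerance = Math.max(2000, windowEndPosition * 0.5); // 5000

const pos = 13000; // where the next page's content was found in remainingContent
if (pos > 0 && Math.abs(pos - windowEndPosition) <= tolerance) {
  // |13000 - 10000| = 3000 <= 5000, so the match is accepted and the break
  // is still clamped to the window: Math.min(pos, windowEndPosition) = 10000.
  console.log(Math.min(pos, windowEndPosition)); // 10000
}
```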
@@ -1255,14 +1520,14 @@ const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx
     if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
     if (skipWhenRegex?.test(remainingContent)) continue;
     if (regex === null) return {
-      breakpointIndex: i,
       breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+      breakpointIndex: i,
       rule
     };
     const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
     if (breakPos > 0) return {
-      breakpointIndex: i,
       breakPos,
+      breakpointIndex: i,
       rule
     };
   }
@@ -1433,7 +1698,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
   let currentFromIdx = fromIdx;
   let isFirstPiece = true;
   let lastBreakpoint = null;
-  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
   logger?.debug?.("[breakpoints] boundaryPositions built", {
     boundaryPositions,
     fromIdx,
@@ -1454,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
       break;
     }
     const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
-    const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+    const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, logger);
     logger?.debug?.(`[breakpoints] iteration=${i}`, {
       currentFromIdx,
       cursorPos,
@@ -2536,7 +2801,7 @@ const buildTokenPriority = () => {
   return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
 };
 const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
-const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
 const compileTokenRegexes = (tokenNames) => {
   const compiled = [];
   for (const token of tokenNames) {
@@ -3618,5 +3883,5 @@ function recoverMistakenMarkersForRuns(runs, opts) {
 }
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
+export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, applyTokenMappings, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, formatValidationReport, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map
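The export list gains six symbols: `PATTERN_TYPE_KEYS`, `applyTokenMappings`, `formatValidationReport`, `optimizeRules`, `shouldDefaultToFuzzy` (already defined in 2.9.x, newly exported here), and `stripTokenMappings`. A hypothetical end-to-end sketch combining them, assuming `shouldDefaultToFuzzy` takes an array of pattern strings as its 2.9.x signature suggests; rule shapes and outcomes are illustrative:

```ts
import {
  applyTokenMappings,
  formatValidationReport,
  optimizeRules,
  shouldDefaultToFuzzy,
  stripTokenMappings,
  validateRules,
} from 'flappa-doormal';

// Name a capture on a raw template, then build rules from it.
const template = applyTokenMappings('{{raqms}}', [{ token: 'raqms', name: 'num' }]);
const rules = [
  { fuzzy: shouldDefaultToFuzzy([template]), lineStartsWith: [template] },
  { lineStartsWith: ['{{dash}}'] },
];

// Surface any validation problems as human-readable strings.
const errors = formatValidationReport(validateRules(rules));
if (errors.length > 0) console.warn(errors);

// Merge/sort compatible rules, and normalize the template for storage.
const { rules: optimized, mergedCount } = optimizeRules(rules);
console.log(optimized, mergedCount);
console.log(stripTokenMappings(template)); // '{{raqms}}'
```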