flappa-doormal 2.8.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +59 -14
- package/README.md +163 -47
- package/dist/index.d.mts +155 -39
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +338 -23
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/dist/index.mjs
CHANGED
@@ -155,6 +155,154 @@ const makeDiacriticInsensitive = (text) => {
   return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
 };
 
+//#endregion
+//#region src/segmentation/types.ts
+/**
+ * Pattern type key names for split rules.
+ *
+ * Use this array to dynamically iterate over pattern types in UIs,
+ * or use the `PatternTypeKey` type for type-safe string unions.
+ *
+ * @example
+ * // Build a dropdown/select in UI
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
+ *
+ * @example
+ * // Type-safe pattern key validation
+ * const validateKey = (k: string): k is PatternTypeKey =>
+ *   (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
+ */
+const PATTERN_TYPE_KEYS = [
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith",
+  "template",
+  "regex"
+];
+
+//#endregion
+//#region src/segmentation/optimize-rules.ts
+/**
+ * Rule optimization utilities for merging and sorting split rules.
+ *
+ * Provides `optimizeRules()` to:
+ * 1. Merge compatible rules with same pattern type and options
+ * 2. Deduplicate patterns within each rule
+ * 3. Sort rules by specificity (longer patterns first)
+ *
+ * @module optimize-rules
+ */
+const MERGEABLE_KEYS = new Set([
+  "lineStartsWith",
+  "lineStartsAfter",
+  "lineEndsWith"
+]);
+/**
+ * Get the pattern type key for a rule.
+ */
+const getPatternKey = (rule) => {
+  for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
+  return "regex";
+};
+/**
+ * Get the pattern array for a mergeable rule.
+ */
+const getPatternArray = (rule, key) => {
+  const value = rule[key];
+  return Array.isArray(value) ? value : [];
+};
+/**
+ * Get a string representation of the pattern value (for specificity scoring).
+ */
+const getPatternString = (rule, key) => {
+  const value = rule[key];
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) return value.join("\n");
+  return "";
+};
+/**
+ * Deduplicate and sort patterns by length (longest first).
+ */
+const normalizePatterns = (patterns) => {
+  return [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
+};
+/**
+ * Calculate specificity score for a rule (higher = more specific).
+ * Based on the longest pattern length.
+ */
+const getSpecificityScore = (rule) => {
+  const key = getPatternKey(rule);
+  if (MERGEABLE_KEYS.has(key)) return getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0);
+  return getPatternString(rule, key).length;
+};
+/**
+ * Create a merge key for a rule based on pattern type and all non-pattern properties.
+ * Rules with the same merge key can have their patterns combined.
+ */
+const createMergeKey = (rule) => {
+  const patternKey = getPatternKey(rule);
+  const { [patternKey]: _pattern, ...rest } = rule;
+  return `${patternKey}|${JSON.stringify(rest)}`;
+};
+/**
+ * Optimize split rules by merging compatible rules and sorting by specificity.
+ *
+ * This function:
+ * 1. **Merges compatible rules**: Rules with the same pattern type and identical
+ *    options (meta, fuzzy, min/max, etc.) have their pattern arrays combined
+ * 2. **Deduplicates patterns**: Removes duplicate patterns within each rule
+ * 3. **Sorts by specificity**: Rules with longer patterns come first
+ *
+ * Only array-based pattern types (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`)
+ * can be merged. `template` and `regex` rules are kept separate.
+ *
+ * @param rules - Array of split rules to optimize
+ * @returns Optimized rules and count of merged rules
+ *
+ * @example
+ * import { optimizeRules } from 'flappa-doormal';
+ *
+ * const { rules, mergedCount } = optimizeRules([
+ *   { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
+ *   { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
+ * ]);
+ *
+ * // rules[0] = { lineStartsWith: ['{{kitab}}', '{{bab}}'], fuzzy: true, meta: { type: 'header' } }
+ * // rules[1] = { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } }
+ * // mergedCount = 1
+ */
+const optimizeRules = (rules) => {
+  const output = [];
+  const indexByMergeKey = /* @__PURE__ */ new Map();
+  let mergedCount = 0;
+  for (const rule of rules) {
+    const patternKey = getPatternKey(rule);
+    if (!MERGEABLE_KEYS.has(patternKey)) {
+      output.push(rule);
+      continue;
+    }
+    const mergeKey = createMergeKey(rule);
+    const existingIndex = indexByMergeKey.get(mergeKey);
+    if (existingIndex === void 0) {
+      indexByMergeKey.set(mergeKey, output.length);
+      output.push({
+        ...rule,
+        [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
+      });
+      continue;
+    }
+    const existing = output[existingIndex];
+    existing[patternKey] = normalizePatterns([...getPatternArray(existing, patternKey), ...getPatternArray(rule, patternKey)]);
+    mergedCount++;
+  }
+  output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
+  return {
+    mergedCount,
+    rules: output
+  };
+};
+
 //#endregion
 //#region src/segmentation/tokens.ts
 /**
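
The merge behavior is easiest to see end to end. A minimal sketch using the new `optimizeRules()` and `PATTERN_TYPE_KEYS` exports (the rule objects are illustrative, modeled on the JSDoc example above):

import { PATTERN_TYPE_KEYS, optimizeRules } from 'flappa-doormal';

// The two lineStartsWith rules share fuzzy + meta, so their patterns merge
// (deduplicated, longest first); the template rule is never merged.
const { rules, mergedCount } = optimizeRules([
  { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { lineStartsWith: ['{{bab}}', '{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { template: '{{raqms}} {{dash}}', meta: { type: 'entry' } },
]);
console.log(mergedCount); // 1 — one rule was folded into another
console.log(PATTERN_TYPE_KEYS.includes('template')); // true

Note that `createMergeKey` compares options via `JSON.stringify`, so rules only merge when their non-pattern properties serialize identically.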
@@ -626,6 +774,51 @@ const shouldDefaultToFuzzy = (patterns) => {
     return FUZZY_TOKEN_REGEX.test(p);
   });
 };
+/**
+ * Apply token mappings to a template string.
+ *
+ * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
+ * Useful for applying user-configured capture names to a raw template.
+ *
+ * - Only affects exact matches of `{{token}}`.
+ * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
+ * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
+ *
+ * @param template - The template string to transform
+ * @param mappings - Array of mappings from token name to capture name
+ * @returns Transformed template string with captures applied
+ *
+ * @example
+ * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
+ * // → '{{raqms:num}} {{dash}}'
+ */
+const applyTokenMappings = (template, mappings) => {
+  let result = template;
+  for (const { token, name } of mappings) {
+    if (!token || !name) continue;
+    const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
+    result = result.replace(regex, `{{${token}:${name}}}`);
+  }
+  return result;
+};
+/**
+ * Strip token mappings from a template string.
+ *
+ * Transforms `{{token:name}}` back into `{{token}}`.
+ * Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
+ *
+ * Useful for normalizing templates for storage or comparison.
+ *
+ * @param template - The template string to strip
+ * @returns Template string with capture names removed
+ *
+ * @example
+ * stripTokenMappings('{{raqms:num}} {{dash}}')
+ * // → '{{raqms}} {{dash}}'
+ */
+const stripTokenMappings = (template) => {
+  return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
+};
 
 //#endregion
 //#region src/segmentation/pattern-validator.ts
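
The two helpers are inverses for the simple case, which suits editor UIs that let users attach capture names and then normalize the template for storage. A small round trip (the capture name `num` is illustrative):

import { applyTokenMappings, stripTokenMappings } from 'flappa-doormal';

const named = applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }]);
// named === '{{raqms:num}} {{dash}}' — only the exact {{raqms}} token is renamed

const plain = stripTokenMappings(named);
// plain === '{{raqms}} {{dash}}' — capture names removed again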
@@ -645,8 +838,13 @@ const buildBareTokenRegex = () => {
  * Validates a single pattern for common issues.
  */
 const validatePattern = (pattern, seenPatterns) => {
+  if (!pattern.trim()) return {
+    message: "Empty pattern is not allowed",
+    type: "empty_pattern"
+  };
   if (seenPatterns.has(pattern)) return {
     message: `Duplicate pattern: "${pattern}"`,
+    pattern,
     type: "duplicate"
   };
   seenPatterns.add(pattern);
@@ -656,6 +854,7 @@ const validatePattern = (pattern, seenPatterns) => {
     if (!KNOWN_TOKENS.has(tokenName)) return {
       message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
       suggestion: `Check spelling or use a known token`,
+      token: tokenName,
       type: "unknown_token"
     };
   }
@@ -670,6 +869,7 @@ const validatePattern = (pattern, seenPatterns) => {
     if (before !== "{{" && after !== "}}") return {
       message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
       suggestion: `{{${fullMatch}}}`,
+      token: tokenName,
       type: "missing_braces"
     };
   }
@@ -727,7 +927,7 @@ const validateRules = (rules) => {
       hasIssues = true;
     }
   }
-  if ("template" in rule && rule.template) {
+  if ("template" in rule && rule.template !== void 0) {
     const seenPatterns = /* @__PURE__ */ new Set();
     const issue = validatePattern(rule.template, seenPatterns);
     if (issue) {
@@ -738,6 +938,39 @@ const validateRules = (rules) => {
     return hasIssues ? result : void 0;
   });
 };
+/**
+ * Formats a validation result array into a list of human-readable error messages.
+ *
+ * Useful for displaying validation errors in UIs.
+ *
+ * @param results - The result array from `validateRules()`
+ * @returns Array of formatted error strings
+ *
+ * @example
+ * const issues = validateRules(rules);
+ * const errors = formatValidationReport(issues);
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
+ */
+const formatValidationReport = (results) => {
+  const errors = [];
+  results.forEach((result, ruleIndex) => {
+    if (!result) return;
+    const formatIssue = (issue, location) => {
+      if (!issue) return;
+      const type = issue.type;
+      if (type === "missing_braces" && issue.token) errors.push(`${location}: Missing {{}} around token "${issue.token}"`);
+      else if (type === "unknown_token" && issue.token) errors.push(`${location}: Unknown token "{{${issue.token}}}"`);
+      else if (type === "duplicate" && issue.pattern) errors.push(`${location}: Duplicate pattern "${issue.pattern}"`);
+      else if (issue.message) errors.push(`${location}: ${issue.message}`);
+      else errors.push(`${location}: ${type}`);
+    };
+    for (const [patternType, issues] of Object.entries(result)) {
+      const list = Array.isArray(issues) ? issues : [issues];
+      for (const issue of list) if (issue) formatIssue(issue, `Rule ${ruleIndex + 1}, ${patternType}`);
+    }
+  });
+  return errors;
+};
 
 //#endregion
 //#region src/segmentation/replace.ts
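
Taken together, the validator hunks make three changes: issues now carry structured `token`/`pattern` fields, empty patterns are rejected with a new `empty_pattern` type, and the template check moved from a truthiness test to `!== void 0`, so an empty-string template is now validated (and flagged) rather than silently skipped. A sketch of the reporting flow, mirroring the JSDoc example (exact message strings may vary):

import { validateRules, formatValidationReport } from 'flappa-doormal';

const issues = validateRules([
  { lineStartsWith: ['{{kitab}}', '{{kitab}}'] }, // duplicate pattern
  { template: '' },                               // now caught: empty_pattern
]);
const errors = formatValidationReport(issues);
// e.g. ['Rule 1, lineStartsWith: Duplicate pattern "{{kitab}}"',
//       'Rule 2, template: Empty pattern is not allowed']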
@@ -1245,16 +1478,71 @@ const handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPositi
  */
 const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
   const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
-  for (
+  for (let i = 0; i < expandedBreakpoints.length; i++) {
+    const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
     if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
     if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
     if (skipWhenRegex?.test(remainingContent)) continue;
-    if (regex === null) return
+    if (regex === null) return {
+      breakpointIndex: i,
+      breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+      rule
+    };
     const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
-    if (breakPos > 0) return
+    if (breakPos > 0) return {
+      breakpointIndex: i,
+      breakPos,
+      rule
+    };
   }
-  return
+  return null;
+};
+
+//#endregion
+//#region src/segmentation/debug-meta.ts
+const resolveDebugConfig = (debug) => {
+  if (!debug) return null;
+  if (debug === true) return {
+    includeBreakpoint: true,
+    includeRule: true,
+    metaKey: "_flappa"
+  };
+  if (typeof debug !== "object") return null;
+  const metaKey = debug.metaKey;
+  const include = debug.include;
+  const includeRule = Array.isArray(include) ? include.includes("rule") : true;
+  return {
+    includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
+    includeRule,
+    metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
+  };
+};
+const getRulePatternType = (rule) => {
+  if ("lineStartsWith" in rule) return "lineStartsWith";
+  if ("lineStartsAfter" in rule) return "lineStartsAfter";
+  if ("lineEndsWith" in rule) return "lineEndsWith";
+  if ("template" in rule) return "template";
+  return "regex";
+};
+const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
+const mergeDebugIntoMeta = (meta, metaKey, patch) => {
+  const out = meta ? { ...meta } : {};
+  const existing = out[metaKey];
+  out[metaKey] = {
+    ...isPlainObject(existing) ? existing : {},
+    ...patch
+  };
+  return out;
 };
+const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
+  index: ruleIndex,
+  patternType: getRulePatternType(rule)
+} });
+const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
+  index: breakpointIndex,
+  kind: rule.pattern === "" ? "pageBoundary" : "pattern",
+  pattern: rule.pattern
+} });
 
 //#endregion
 //#region src/segmentation/breakpoint-processor.ts
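
None of the `debug-meta` helpers are exported — they are internal to the bundle — but they fix the shape of the new `debug` option. How the config resolves, read straight from `resolveDebugConfig` above:

// debug: true turns everything on under the default key.
resolveDebugConfig(true);
// → { includeBreakpoint: true, includeRule: true, metaKey: '_flappa' }

// The object form narrows what is recorded and can rename the key.
resolveDebugConfig({ include: ['rule'], metaKey: '_dbg' });
// → { includeBreakpoint: false, includeRule: true, metaKey: '_dbg' }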
@@ -1338,15 +1626,20 @@ const createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds,
 const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
   if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
     const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
-    if (exclusionBreak > 0) return exclusionBreak;
+    if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
   }
-  const
+  const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
     expandedBreakpoints,
     normalizedPages,
     pageIds,
     prefer
   });
-
+  if (patternMatch && patternMatch.breakPos > 0) return {
+    breakOffset: patternMatch.breakPos,
+    breakpointIndex: patternMatch.breakpointIndex,
+    breakpointRule: patternMatch.rule
+  };
+  return { breakOffset: windowEndPosition };
 };
 /**
  * Advances cursor position past any leading whitespace.
@@ -1362,12 +1655,13 @@ const skipWhitespace$1 = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
-const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
   const result = [];
   const fullContent = segment.content;
   let cursorPos = 0;
   let currentFromIdx = fromIdx;
   let isFirstPiece = true;
+  let lastBreakpoint = null;
   const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
   logger?.debug?.("[breakpoints] boundaryPositions built", {
     boundaryPositions,
@@ -1382,7 +1676,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
     const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
     const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
     if (remainingSpan <= maxPages && !remainingHasExclusions) {
-      const
+      const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+      const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
+      const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
       if (finalSeg) result.push(finalSeg);
       break;
     }
@@ -1393,8 +1689,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
       cursorPos,
       windowEndIdx
     });
-    const
-
+    const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+    if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
+      breakpointIndex: found.breakpointIndex,
+      rule: found.breakpointRule
+    };
+    const breakPos = cursorPos + found.breakOffset;
     const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
     const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
     logger?.trace?.("[breakpoints] piece", {
@@ -1403,7 +1703,8 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
       pieceLength: pieceContent.length
     });
     if (pieceContent) {
-      const
+      const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+      const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
       if (pieceSeg) result.push(pieceSeg);
     }
     cursorPos = skipWhitespace$1(fullContent, breakPos);
@@ -1418,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
-const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
   const pageIds = pages.map((p) => p.id);
   const pageIdToIndex = buildPageIdToIndexMap(pageIds);
   const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1747,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
       result.push(segment);
       continue;
     }
-    const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+    const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
     result.push(...broken.map((s) => {
       const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
       const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -2059,13 +2360,25 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
   }
   return matches;
 };
-const applyOccurrenceFilter = (rules, splitPointsByRule) => {
+const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
   const result = [];
   rules.forEach((rule, index) => {
     const points = splitPointsByRule.get(index);
     if (!points?.length) return;
     const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
-
+    if (!debugMetaKey) {
+      result.push(...filtered.map((p) => ({
+        ...p,
+        ruleIndex: index
+      })));
+      return;
+    }
+    const debugPatch = buildRuleDebugPatch(index, rule);
+    result.push(...filtered.map((p) => ({
+      ...p,
+      meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
+      ruleIndex: index
+    })));
   });
   return result;
 };
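
With a debug meta key set, every split point is stamped with the rule that produced it before occurrence filtering returns. The patch merged into the point's meta has a small fixed shape, per `buildRuleDebugPatch` above:

// For the third rule in a list (index 2), a lineStartsAfter rule:
buildRuleDebugPatch(2, { lineStartsAfter: ['{{numbered}}'] });
// → { rule: { index: 2, patternType: 'lineStartsAfter' } }
// mergeDebugIntoMeta then nests this under the configured metaKey,
// preserving any user meta already on the point.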
@@ -2203,7 +2516,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
   if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
   return [initialSeg];
 };
-const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
   logger?.debug?.("[segmenter] collecting split points from rules", {
     contentLength: matchContent.length,
     ruleCount: rules.length
@@ -2218,7 +2531,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
   const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
   if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
   for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
-  return applyOccurrenceFilter(rules, splitPointsByRule);
+  return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
 };
 /**
  * Finds page breaks within a given offset range using binary search.
@@ -2321,6 +2634,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
 const segmentPages = (pages, options) => {
   const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+  const debug = resolveDebugConfig(options.debug);
+  const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
   logger?.info?.("[segmenter] starting segmentation", {
     breakpointCount: breakpoints.length,
     maxPages,
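
This is the user-facing entry point for the feature: `segmentPages()` now accepts a `debug` option and threads the resolved meta key into both the rule matcher and the breakpoint engine (below). A usage sketch, assuming a `pages` array as elsewhere in the API; the meta shapes in the comments are inferred from the debug patches above, so treat them as illustrative:

import { segmentPages } from 'flappa-doormal';

const segments = segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{kitab}}'], meta: { type: 'header' } }],
  debug: true,
});
// A matched segment's meta may now look like:
// { type: 'header', _flappa: { rule: { index: 0, patternType: 'lineStartsWith' } } }

// Record only breakpoint attribution, under a custom key:
segmentPages(pages, {
  rules: [{ lineStartsWith: ['{{kitab}}'] }],
  debug: { include: ['breakpoint'], metaKey: '_dbg' },
});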
@@ -2334,7 +2649,7 @@ const segmentPages = (pages, options) => {
     pageIds: pageMap.pageIds,
     totalContentLength: matchContent.length
   });
-  const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
+  const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
   const unique = dedupeSplitPoints(splitPoints);
   logger?.debug?.("[segmenter] split points collected", {
     rawSplitPoints: splitPoints.length,
@@ -2353,7 +2668,7 @@ const segmentPages = (pages, options) => {
   if (maxPages >= 0 && breakpoints.length) {
     logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
     const patternProcessor = (p) => processPattern(p, false).pattern;
-    const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+    const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
     logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
     return result;
   }
@@ -2450,7 +2765,7 @@ const buildTokenPriority = () => {
   return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
 };
 const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
-const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED
+const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
 const compileTokenRegexes = (tokenNames) => {
   const compiled = [];
   for (const token of tokenNames) {
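
The character class in the new `stripArabicDiacritics` spans the harakat/tanwin block (U+064B–U+065F), the superscript alef (U+0670), and the Quranic annotation marks (U+06D6–U+06ED). The helper is module-internal, so this is illustrative only:

// Not exported from the bundle; reproduced here to show the effect.
const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, '');
stripArabicDiacritics('مُحَمَّد'); // → 'محمد' (fatha, damma, shadda removed)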
@@ -3532,5 +3847,5 @@ function recoverMistakenMarkersForRuns(runs, opts) {
 }
 
 //#endregion
-export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
+export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, applyTokenMappings, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, formatValidationReport, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map
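
Net new public API in this range, per the export diff: `PATTERN_TYPE_KEYS`, `applyTokenMappings`, `formatValidationReport`, `optimizeRules`, `shouldDefaultToFuzzy`, and `stripTokenMappings`, all importable from the package root:

import {
  PATTERN_TYPE_KEYS,
  applyTokenMappings,
  formatValidationReport,
  optimizeRules,
  shouldDefaultToFuzzy,
  stripTokenMappings,
} from 'flappa-doormal';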