flappa-doormal 2.8.0 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -155,6 +155,154 @@ const makeDiacriticInsensitive = (text) => {
  return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
  };
 
+ //#endregion
+ //#region src/segmentation/types.ts
+ /**
+ * Pattern type key names for split rules.
+ *
+ * Use this array to dynamically iterate over pattern types in UIs,
+ * or use the `PatternTypeKey` type for type-safe string unions.
+ *
+ * @example
+ * // Build a dropdown/select in UI
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
+ *
+ * @example
+ * // Type-safe pattern key validation
+ * const validateKey = (k: string): k is PatternTypeKey =>
+ * (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
+ */
+ const PATTERN_TYPE_KEYS = [
+ "lineStartsWith",
+ "lineStartsAfter",
+ "lineEndsWith",
+ "template",
+ "regex"
+ ];
+
+ //#endregion
+ //#region src/segmentation/optimize-rules.ts
+ /**
+ * Rule optimization utilities for merging and sorting split rules.
+ *
+ * Provides `optimizeRules()` to:
+ * 1. Merge compatible rules with same pattern type and options
+ * 2. Deduplicate patterns within each rule
+ * 3. Sort rules by specificity (longer patterns first)
+ *
+ * @module optimize-rules
+ */
+ const MERGEABLE_KEYS = new Set([
+ "lineStartsWith",
+ "lineStartsAfter",
+ "lineEndsWith"
+ ]);
+ /**
+ * Get the pattern type key for a rule.
+ */
+ const getPatternKey = (rule) => {
+ for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
+ return "regex";
+ };
+ /**
+ * Get the pattern array for a mergeable rule.
+ */
+ const getPatternArray = (rule, key) => {
+ const value = rule[key];
+ return Array.isArray(value) ? value : [];
+ };
+ /**
+ * Get a string representation of the pattern value (for specificity scoring).
+ */
+ const getPatternString = (rule, key) => {
+ const value = rule[key];
+ if (typeof value === "string") return value;
+ if (Array.isArray(value)) return value.join("\n");
+ return "";
+ };
+ /**
+ * Deduplicate and sort patterns by length (longest first).
+ */
+ const normalizePatterns = (patterns) => {
+ return [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
+ };
+ /**
+ * Calculate specificity score for a rule (higher = more specific).
+ * Based on the longest pattern length.
+ */
+ const getSpecificityScore = (rule) => {
+ const key = getPatternKey(rule);
+ if (MERGEABLE_KEYS.has(key)) return getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0);
+ return getPatternString(rule, key).length;
+ };
+ /**
+ * Create a merge key for a rule based on pattern type and all non-pattern properties.
+ * Rules with the same merge key can have their patterns combined.
+ */
+ const createMergeKey = (rule) => {
+ const patternKey = getPatternKey(rule);
+ const { [patternKey]: _pattern, ...rest } = rule;
+ return `${patternKey}|${JSON.stringify(rest)}`;
+ };
+ /**
+ * Optimize split rules by merging compatible rules and sorting by specificity.
+ *
+ * This function:
+ * 1. **Merges compatible rules**: Rules with the same pattern type and identical
+ * options (meta, fuzzy, min/max, etc.) have their pattern arrays combined
+ * 2. **Deduplicates patterns**: Removes duplicate patterns within each rule
+ * 3. **Sorts by specificity**: Rules with longer patterns come first
+ *
+ * Only array-based pattern types (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`)
+ * can be merged. `template` and `regex` rules are kept separate.
+ *
+ * @param rules - Array of split rules to optimize
+ * @returns Optimized rules and count of merged rules
+ *
+ * @example
+ * import { optimizeRules } from 'flappa-doormal';
+ *
+ * const { rules, mergedCount } = optimizeRules([
+ * { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
+ * { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
+ * { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
+ * ]);
+ *
+ * // rules[0] = { lineStartsWith: ['{{kitab}}', '{{bab}}'], fuzzy: true, meta: { type: 'header' } }
+ * // rules[1] = { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } }
+ * // mergedCount = 1
+ */
+ const optimizeRules = (rules) => {
+ const output = [];
+ const indexByMergeKey = /* @__PURE__ */ new Map();
+ let mergedCount = 0;
+ for (const rule of rules) {
+ const patternKey = getPatternKey(rule);
+ if (!MERGEABLE_KEYS.has(patternKey)) {
+ output.push(rule);
+ continue;
+ }
+ const mergeKey = createMergeKey(rule);
+ const existingIndex = indexByMergeKey.get(mergeKey);
+ if (existingIndex === void 0) {
+ indexByMergeKey.set(mergeKey, output.length);
+ output.push({
+ ...rule,
+ [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
+ });
+ continue;
+ }
+ const existing = output[existingIndex];
+ existing[patternKey] = normalizePatterns([...getPatternArray(existing, patternKey), ...getPatternArray(rule, patternKey)]);
+ mergedCount++;
+ }
+ output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
+ return {
+ mergedCount,
+ rules: output
+ };
+ };
+
  //#endregion
  //#region src/segmentation/tokens.ts
  /**
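For quick reference, a usage sketch of the new `optimizeRules` and `PATTERN_TYPE_KEYS` exports, following the JSDoc example above (the rule shapes are the ones shown there):

import { optimizeRules, PATTERN_TYPE_KEYS } from 'flappa-doormal';

// The two header rules share a pattern type and identical options,
// so optimizeRules merges their pattern arrays into a single rule.
const { rules, mergedCount } = optimizeRules([
  { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
  { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
  { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
]);
console.log(mergedCount); // 1
console.log(rules.length); // 2

// PATTERN_TYPE_KEYS enumerates the five rule pattern types:
// ['lineStartsWith', 'lineStartsAfter', 'lineEndsWith', 'template', 'regex']
console.log(PATTERN_TYPE_KEYS);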
@@ -626,6 +774,51 @@ const shouldDefaultToFuzzy = (patterns) => {
  return FUZZY_TOKEN_REGEX.test(p);
  });
  };
+ /**
+ * Apply token mappings to a template string.
+ *
+ * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
+ * Useful for applying user-configured capture names to a raw template.
+ *
+ * - Only affects exact matches of `{{token}}`.
+ * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
+ * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
+ *
+ * @param template - The template string to transform
+ * @param mappings - Array of mappings from token name to capture name
+ * @returns Transformed template string with captures applied
+ *
+ * @example
+ * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
+ * // → '{{raqms:num}} {{dash}}'
+ */
+ const applyTokenMappings = (template, mappings) => {
+ let result = template;
+ for (const { token, name } of mappings) {
+ if (!token || !name) continue;
+ const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
+ result = result.replace(regex, `{{${token}:${name}}}`);
+ }
+ return result;
+ };
+ /**
+ * Strip token mappings from a template string.
+ *
+ * Transforms `{{token:name}}` back into `{{token}}`.
+ * Also transforms `{{:name}}` patterns (capture-only) into `{{}}` (which is invalid/empty).
+ *
+ * Useful for normalizing templates for storage or comparison.
+ *
+ * @param template - The template string to strip
+ * @returns Template string with capture names removed
+ *
+ * @example
+ * stripTokenMappings('{{raqms:num}} {{dash}}')
+ * // → '{{raqms}} {{dash}}'
+ */
+ const stripTokenMappings = (template) => {
+ return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
+ };
 
  //#endregion
  //#region src/segmentation/pattern-validator.ts
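A round-trip sketch of the two new template helpers, using the token and capture names from the JSDoc examples above:

import { applyTokenMappings, stripTokenMappings } from 'flappa-doormal';

// Attach a capture name to the {{raqms}} token; {{dash}} has no mapping and is untouched.
const named = applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }]);
console.log(named); // '{{raqms:num}} {{dash}}'

// Strip capture names again, e.g. to normalize a template for storage or comparison.
console.log(stripTokenMappings(named)); // '{{raqms}} {{dash}}'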
@@ -645,8 +838,13 @@ const buildBareTokenRegex = () => {
  * Validates a single pattern for common issues.
  */
  const validatePattern = (pattern, seenPatterns) => {
+ if (!pattern.trim()) return {
+ message: "Empty pattern is not allowed",
+ type: "empty_pattern"
+ };
  if (seenPatterns.has(pattern)) return {
  message: `Duplicate pattern: "${pattern}"`,
+ pattern,
  type: "duplicate"
  };
  seenPatterns.add(pattern);
@@ -656,6 +854,7 @@ const validatePattern = (pattern, seenPatterns) => {
  if (!KNOWN_TOKENS.has(tokenName)) return {
  message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
  suggestion: `Check spelling or use a known token`,
+ token: tokenName,
  type: "unknown_token"
  };
  }
@@ -670,6 +869,7 @@ const validatePattern = (pattern, seenPatterns) => {
  if (before !== "{{" && after !== "}}") return {
  message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
  suggestion: `{{${fullMatch}}}`,
+ token: tokenName,
  type: "missing_braces"
  };
  }
@@ -727,7 +927,7 @@ const validateRules = (rules) => {
  hasIssues = true;
  }
  }
- if ("template" in rule && rule.template) {
+ if ("template" in rule && rule.template !== void 0) {
  const seenPatterns = /* @__PURE__ */ new Set();
  const issue = validatePattern(rule.template, seenPatterns);
  if (issue) {
@@ -738,6 +938,39 @@ const validateRules = (rules) => {
  return hasIssues ? result : void 0;
  });
  };
+ /**
+ * Formats a validation result array into a list of human-readable error messages.
+ *
+ * Useful for displaying validation errors in UIs.
+ *
+ * @param results - The result array from `validateRules()`
+ * @returns Array of formatted error strings
+ *
+ * @example
+ * const issues = validateRules(rules);
+ * const errors = formatValidationReport(issues);
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
+ */
+ const formatValidationReport = (results) => {
+ const errors = [];
+ results.forEach((result, ruleIndex) => {
+ if (!result) return;
+ const formatIssue = (issue, location) => {
+ if (!issue) return;
+ const type = issue.type;
+ if (type === "missing_braces" && issue.token) errors.push(`${location}: Missing {{}} around token "${issue.token}"`);
+ else if (type === "unknown_token" && issue.token) errors.push(`${location}: Unknown token "{{${issue.token}}}"`);
+ else if (type === "duplicate" && issue.pattern) errors.push(`${location}: Duplicate pattern "${issue.pattern}"`);
+ else if (issue.message) errors.push(`${location}: ${issue.message}`);
+ else errors.push(`${location}: ${type}`);
+ };
+ for (const [patternType, issues] of Object.entries(result)) {
+ const list = Array.isArray(issues) ? issues : [issues];
+ for (const issue of list) if (issue) formatIssue(issue, `Rule ${ruleIndex + 1}, ${patternType}`);
+ }
+ });
+ return errors;
+ };
 
  //#endregion
  //#region src/segmentation/replace.ts
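A minimal sketch of feeding `validateRules` output through the new `formatValidationReport`, following the JSDoc example above. The rule shape is assumed; the bare `kitab` token is assumed to be a known token, as the `optimizeRules` example suggests:

import { validateRules, formatValidationReport } from 'flappa-doormal';

// 'kitab' without braces should trigger the missing_braces issue, and ''
// should now trigger the new empty_pattern check added in this release.
const issues = validateRules([{ lineStartsWith: ['kitab', ''] }]);
for (const line of formatValidationReport(issues)) console.error(line);
// e.g. 'Rule 1, lineStartsWith: Missing {{}} around token "kitab"'
// e.g. 'Rule 1, lineStartsWith: Empty pattern is not allowed'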
@@ -1245,16 +1478,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
  */
  const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
  const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
- for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
+ for (let i = 0; i < expandedBreakpoints.length; i++) {
+ const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
  if (skipWhenRegex?.test(remainingContent)) continue;
- if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
+ if (regex === null) return {
+ breakpointIndex: i,
+ breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
+ rule
+ };
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
- if (breakPos > 0) return breakPos;
+ if (breakPos > 0) return {
+ breakpointIndex: i,
+ breakPos,
+ rule
+ };
  }
- return -1;
+ return null;
+ };
+
+ //#endregion
+ //#region src/segmentation/debug-meta.ts
+ const resolveDebugConfig = (debug) => {
+ if (!debug) return null;
+ if (debug === true) return {
+ includeBreakpoint: true,
+ includeRule: true,
+ metaKey: "_flappa"
+ };
+ if (typeof debug !== "object") return null;
+ const metaKey = debug.metaKey;
+ const include = debug.include;
+ const includeRule = Array.isArray(include) ? include.includes("rule") : true;
+ return {
+ includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
+ includeRule,
+ metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
+ };
+ };
+ const getRulePatternType = (rule) => {
+ if ("lineStartsWith" in rule) return "lineStartsWith";
+ if ("lineStartsAfter" in rule) return "lineStartsAfter";
+ if ("lineEndsWith" in rule) return "lineEndsWith";
+ if ("template" in rule) return "template";
+ return "regex";
+ };
+ const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
+ const mergeDebugIntoMeta = (meta, metaKey, patch) => {
+ const out = meta ? { ...meta } : {};
+ const existing = out[metaKey];
+ out[metaKey] = {
+ ...isPlainObject(existing) ? existing : {},
+ ...patch
+ };
+ return out;
  };
+ const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
+ index: ruleIndex,
+ patternType: getRulePatternType(rule)
+ } });
+ const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
+ index: breakpointIndex,
+ kind: rule.pattern === "" ? "pageBoundary" : "pattern",
+ pattern: rule.pattern
+ } });
 
  //#endregion
  //#region src/segmentation/breakpoint-processor.ts
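The debug-meta region above normalizes the user-facing `debug` option. A sketch of the mapping `resolveDebugConfig` performs, read off its branches (the helper itself is internal and not exported):

// resolveDebugConfig is internal; shown here only to illustrate the mapping.
// debug: true
//   → { includeBreakpoint: true, includeRule: true, metaKey: '_flappa' }
// debug: { include: ['rule'], metaKey: '_dbg' }
//   → { includeBreakpoint: false, includeRule: true, metaKey: '_dbg' }
// debug: {}              (no include array, no metaKey)
//   → { includeBreakpoint: true, includeRule: true, metaKey: '_flappa' }
// debug: false / 'yes'   (falsy, or truthy non-object other than true)
//   → null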
@@ -1338,15 +1626,20 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
  const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
  if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
  const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
- if (exclusionBreak > 0) return exclusionBreak;
+ if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
  }
- const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+ const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
  expandedBreakpoints,
  normalizedPages,
  pageIds,
  prefer
  });
- return patternBreak > 0 ? patternBreak : windowEndPosition;
+ if (patternMatch && patternMatch.breakPos > 0) return {
+ breakOffset: patternMatch.breakPos,
+ breakpointIndex: patternMatch.breakpointIndex,
+ breakpointRule: patternMatch.rule
+ };
+ return { breakOffset: windowEndPosition };
  };
  /**
  * Advances cursor position past any leading whitespace.
@@ -1362,12 +1655,13 @@ const skipWhitespace$1 = (content, startPos) => {
  *
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
  */
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
  const result = [];
  const fullContent = segment.content;
  let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
+ let lastBreakpoint = null;
  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
  logger?.debug?.("[breakpoints] boundaryPositions built", {
  boundaryPositions,
@@ -1382,7 +1676,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
  if (finalSeg) result.push(finalSeg);
  break;
  }
@@ -1393,8 +1689,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  cursorPos,
  windowEndIdx
  });
- const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
- const breakPos = cursorPos + breakOffset;
+ const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+ if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
+ breakpointIndex: found.breakpointIndex,
+ rule: found.breakpointRule
+ };
+ const breakPos = cursorPos + found.breakOffset;
  const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
  const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
  logger?.trace?.("[breakpoints] piece", {
@@ -1403,7 +1703,8 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  pieceLength: pieceContent.length
  });
  if (pieceContent) {
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
  if (pieceSeg) result.push(pieceSeg);
  }
  cursorPos = skipWhitespace$1(fullContent, breakPos);
@@ -1418,7 +1719,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
  *
  * Note: This is an internal engine used by `segmentPages()`.
  */
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
  const pageIds = pages.map((p) => p.id);
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1747,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
  result.push(segment);
  continue;
  }
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
  result.push(...broken.map((s) => {
  const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
  const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -2059,13 +2360,25 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
  }
  return matches;
  };
- const applyOccurrenceFilter = (rules, splitPointsByRule) => {
+ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
  const result = [];
  rules.forEach((rule, index) => {
  const points = splitPointsByRule.get(index);
  if (!points?.length) return;
  const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
- result.push(...filtered);
+ if (!debugMetaKey) {
+ result.push(...filtered.map((p) => ({
+ ...p,
+ ruleIndex: index
+ })));
+ return;
+ }
+ const debugPatch = buildRuleDebugPatch(index, rule);
+ result.push(...filtered.map((p) => ({
+ ...p,
+ meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
+ ruleIndex: index
+ })));
  });
  return result;
  };
@@ -2203,7 +2516,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
  return [initialSeg];
  };
- const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
  logger?.debug?.("[segmenter] collecting split points from rules", {
  contentLength: matchContent.length,
  ruleCount: rules.length
@@ -2218,7 +2531,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
  if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
  for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
- return applyOccurrenceFilter(rules, splitPointsByRule);
+ return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
  };
  /**
  * Finds page breaks within a given offset range using binary search.
@@ -2321,6 +2634,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
  const segmentPages = (pages, options) => {
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+ const debug = resolveDebugConfig(options.debug);
+ const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
  logger?.info?.("[segmenter] starting segmentation", {
  breakpointCount: breakpoints.length,
  maxPages,
@@ -2334,7 +2649,7 @@ const segmentPages = (pages, options) => {
  pageIds: pageMap.pageIds,
  totalContentLength: matchContent.length
  });
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
  const unique = dedupeSplitPoints(splitPoints);
  logger?.debug?.("[segmenter] split points collected", {
  rawSplitPoints: splitPoints.length,
@@ -2353,7 +2668,7 @@ const segmentPages = (pages, options) => {
  if (maxPages >= 0 && breakpoints.length) {
  logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
  const patternProcessor = (p) => processPattern(p, false).pattern;
- const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
  logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
  return result;
  }
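Putting the debug pieces together, a hedged end-to-end sketch. It assumes `pages`, `rules`, and `breakpoints` are already defined; the meta shapes follow `buildRuleDebugPatch` and `buildBreakpointDebugPatch` above:

import { segmentPages } from 'flappa-doormal';

// debug: true enables both rule and breakpoint attribution under meta._flappa.
const segments = segmentPages(pages, { rules, breakpoints, maxPages: 3, debug: true });
// A segment produced by rules[0] should carry something like:
//   meta: { ..., _flappa: { rule: { index: 0, patternType: 'lineStartsWith' } } }
// A piece split off by breakpoints[1] should carry something like:
//   meta: { ..., _flappa: { breakpoint: { index: 1, kind: 'pattern', pattern: '...' } } }

// Or restrict attribution and rename the meta key:
segmentPages(pages, { rules, debug: { include: ['rule'], metaKey: '_dbg' } });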
@@ -2450,7 +2765,7 @@ const buildTokenPriority = () => {
  return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
  };
  const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
- const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
+ const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
  const compileTokenRegexes = (tokenNames) => {
  const compiled = [];
  for (const token of tokenNames) {
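One behavioral change worth illustrating: the character class above no longer includes U+0640 (tatweel/kashida). The helper is internal and not exported, so it is re-declared here purely for the sketch:

// Internal helper re-declared for illustration; not part of the public API.
const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");

console.log(stripArabicDiacritics('كِتَاب')); // 'كتاب'  (harakat U+064B–U+065F still stripped)
console.log(stripArabicDiacritics('كـتـاب')); // 'كـتـاب' (tatweel U+0640 now preserved)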
@@ -3532,5 +3847,5 @@ function recoverMistakenMarkersForRuns(runs, opts) {
  }
 
  //#endregion
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
+ export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, applyTokenMappings, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, formatValidationReport, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules };
  //# sourceMappingURL=index.mjs.map