flappa-doormal 2.8.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -26,8 +26,14 @@ Traditional Arabic text segmentation requires:
26
26
  ```text
27
27
  src/
28
28
  ├── index.ts # Main entry point and exports
29
- ├── pattern-detection.ts # Token detection for auto-generating rules (NEW)
30
- ├── pattern-detection.test.ts # Pattern detection tests (22 tests)
29
+ ├── analysis/ # Analysis helpers module
30
+ │ ├── index.ts # Barrel exports for analysis functions
31
+ │ ├── shared.ts # Shared utilities for analysis
32
+ │ ├── line-starts.ts # analyzeCommonLineStarts (line-based patterns)
33
+ │ ├── repeating-sequences.ts # analyzeRepeatingSequences (continuous text N-grams)
34
+ │ └── *.test.ts # Analysis tests
35
+ ├── pattern-detection.ts # Token detection for auto-generating rules
36
+ ├── pattern-detection.test.ts # Pattern detection tests
31
37
  ├── recovery.ts # Marker recovery utility (recover mistaken lineStartsAfter)
32
38
  ├── recovery.test.ts # Marker recovery tests
33
39
  └── segmentation/
@@ -202,7 +208,7 @@ Raw `regex` patterns now support named capture groups for metadata extraction:
202
208
 
203
209
  ### Breakpoints Post-Processing Algorithm
204
210
 
205
- The `breakpoints` option provides a post-processing mechanism for limiting segment size. Unlike the deprecated `maxSpan` (which was per-rule), breakpoints runs AFTER all structural rules.
211
+ The `breakpoints` option provides a post-processing mechanism for limiting segment size. It runs AFTER all structural rules have been applied.
206
212
 
207
213
  **API Options:**
208
214
  ```typescript
@@ -244,7 +250,7 @@ segmentPages(pages, {
244
250
  - **`prefer: 'shorter'`**: Finds FIRST match (conservative)
245
251
  - **Recursive**: If split result still exceeds `maxPages`, breakpoints runs again
246
252
 
247
- > **Note**: The old `maxSpan` and `fallback` properties on `SplitRule` are deprecated and removed.
253
+ > **Note**: Older per-rule span limiting approaches were removed in favor of post-processing `breakpoints`.
248
254
 
249
255
  ## Design Decisions
250
256
 
@@ -357,7 +363,7 @@ bunx biome lint .
357
363
 
358
364
  5. **Rule order matters for specificity**: When multiple rules can match the same position, put specific patterns BEFORE generic ones. Example: `## {{raqms:num}} {{dash}}` must come before `##` to capture the number.
359
365
 
360
- 6. **Post-processing beats per-rule limits**: The `maxSpan` approach (per-rule page limits) caused premature splits. Moving to post-processing `breakpoints` preserves structural integrity while still limiting segment size.
366
+ 6. **Post-processing beats per-rule limits**: Per-rule span limiting caused premature splits. Moving to post-processing `breakpoints` preserves structural integrity while still limiting segment size.
361
367
 
362
368
  7. **Window padding matters**: When calculating approximate content windows, 50% padding is needed (not 20%) to ensure enough content is captured for `prefer: 'longer'` scenarios.
363
369
 
@@ -494,3 +500,32 @@ const quoted = analyzeCommonLineStarts(pages, {
494
500
  });
495
501
  ```
496
502
 
503
+ ## Repeating Sequence Analysis (`analyzeRepeatingSequences`)
504
+
505
+ For continuous text **without line breaks** (prose-like content), use `analyzeRepeatingSequences(pages)`. It scans for commonly repeating word/token sequences (N-grams) across pages.
506
+
507
+ Key options:
508
+ - `minElements` / `maxElements`: N-gram size range (default 1-3)
509
+ - `minCount`: Minimum occurrences to include (default 3)
510
+ - `topK`: Maximum patterns to return (default 20)
511
+ - `requireToken`: Only patterns containing `{{tokens}}` (default true)
512
+ - `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
513
+
514
+ Example:
515
+ ```typescript
516
+ import { analyzeRepeatingSequences } from 'flappa-doormal';
517
+
518
+ const patterns = analyzeRepeatingSequences(pages, { minCount: 3, topK: 20 });
519
+ // [{ pattern: '{{naql}}', count: 42, examples: [...] }, ...]
520
+ ```
521
+
522
+ ## Analysis → Segmentation Workflow
523
+
524
+ Use analysis functions to discover patterns, build rules from those patterns, then pass the rules to `segmentPages()`:
525
+
526
+ 1. **Continuous text**: `analyzeRepeatingSequences()` → build rules → `segmentPages()`
527
+ 2. **Structured text**: `analyzeCommonLineStarts()` → build rules → `segmentPages()`
528
+
529
+ See README.md for complete examples.
530
+
531
+
package/README.md CHANGED
@@ -228,7 +228,6 @@ Control which matches to use:
228
228
  lineEndsWith: ['\\.'],
229
229
  split: 'after',
230
230
  occurrence: 'last', // Only split at LAST period on page
231
- maxSpan: 1, // Apply per-page
232
231
  }
233
232
  ```
234
233
 
@@ -406,8 +405,99 @@ Key options:
406
405
  - If you paste these signatures into `lineStartsWith` / `lineStartsAfter` / `template`, that’s fine: those template pattern types **auto-escape `()[]`** outside `{{tokens}}`.
407
406
  - If you paste them into a raw `regex` rule, you may need to escape literal brackets yourself.
408
407
 
408
+ ### Repeating Sequence Analysis (continuous text)
409
+
410
+ For texts without line breaks (continuous prose), use `analyzeRepeatingSequences()`:
411
+
412
+ ```typescript
413
+ import { analyzeRepeatingSequences } from 'flappa-doormal';
414
+
415
+ const patterns = analyzeRepeatingSequences(pages, {
416
+ minElements: 2,
417
+ maxElements: 4,
418
+ minCount: 3,
419
+ topK: 20,
420
+ });
421
+ // [{ pattern: "{{naql}}\\s*{{harf}}", count: 42, examples: [...] }, ...]
422
+ ```
423
+
424
+ Key options:
425
+ - `minElements` / `maxElements`: N-gram size range (default 1-3)
426
+ - `minCount`: Minimum occurrences to include (default 3)
427
+ - `topK`: Maximum patterns to return (default 20)
428
+ - `requireToken`: Only patterns containing `{{tokens}}` (default true)
429
+ - `normalizeArabicDiacritics`: Ignore diacritics when matching (default true)
430
+
431
+ ## Analysis → Segmentation Workflow
432
+
433
+ Use analysis functions to discover patterns, build rules from those patterns, then pass the rules to `segmentPages()`.
434
+
435
+ ### Example A: Continuous Text (No Punctuation)
436
+
437
+ For prose-like text without structural line breaks:
438
+
439
+ ```typescript
440
+ import { analyzeRepeatingSequences, segmentPages, type Page } from 'flappa-doormal';
441
+
442
+ // Continuous Arabic text with narrator phrases
443
+ const pages: Page[] = [
444
+ { id: 1, content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي حدثنا خالد بن زيد عن علي' },
445
+ { id: 2, content: 'حدثنا سعيد بن جبير عن ابن عباس أخبرنا يوسف عن أنس' },
446
+ ];
447
+
448
+ // Step 1: Discover repeating patterns
449
+ const patterns = analyzeRepeatingSequences(pages, { minCount: 2, topK: 10 });
450
+ // [{ pattern: '{{naql}}', count: 5, examples: [...] }, ...]
451
+
452
+ // Step 2: Build rules from discovered patterns
453
+ const rules = patterns.filter(p => p.count >= 3).map(p => ({
454
+ lineStartsWith: [p.pattern],
455
+ split: 'at' as const,
456
+ fuzzy: true,
457
+ }));
458
+
459
+ // Step 3: Segment
460
+ const segments = segmentPages(pages, { rules });
461
+ // [{ content: 'حدثنا أحمد بن محمد عن عمر قال سمعت النبي', from: 1 }, ...]
462
+ ```
463
+
464
+ ### Example B: Structured Text (With Numbering)
465
+
466
+ For hadith-style numbered entries:
467
+
468
+ ```typescript
469
+ import { analyzeCommonLineStarts, segmentPages, type Page } from 'flappa-doormal';
470
+
471
+ // Numbered hadith text
472
+ const pages: Page[] = [
473
+ { id: 1, content: '٦٦٩٦ - حَدَّثَنَا أَبُو بَكْرٍ عَنِ النَّبِيِّ\n٦٦٩٧ - أَخْبَرَنَا عُمَرُ قَالَ' },
474
+ { id: 2, content: '٦٦٩٨ - حَدَّثَنِي مُحَمَّدٌ عَنْ عَائِشَةَ' },
475
+ ];
476
+
477
+ // Step 1: Discover common line-start patterns
478
+ const patterns = analyzeCommonLineStarts(pages, { topK: 10, minCount: 2 });
479
+ // [{ pattern: '{{raqms}}\\s*{{dash}}', count: 3, examples: [...] }, ...]
480
+
481
+ // Step 2: Build rules (add named capture for hadith number)
482
+ const topPattern = patterns[0]?.pattern ?? '{{raqms}} {{dash}} ';
483
+ const rules = [{
484
+ lineStartsAfter: [topPattern.replace('{{raqms}}', '{{raqms:num}}')],
485
+ split: 'at' as const,
486
+ meta: { type: 'hadith' }
487
+ }];
488
+
489
+ // Step 3: Segment
490
+ const segments = segmentPages(pages, { rules });
491
+ // [
492
+ // { content: 'حَدَّثَنَا أَبُو بَكْرٍ...', from: 1, meta: { type: 'hadith', num: '٦٦٩٦' } },
493
+ // { content: 'أَخْبَرَنَا عُمَرُ قَالَ', from: 1, meta: { type: 'hadith', num: '٦٦٩٧' } },
494
+ // { content: 'حَدَّثَنِي مُحَمَّدٌ...', from: 2, meta: { type: 'hadith', num: '٦٦٩٨' } },
495
+ // ]
496
+ ```
497
+
409
498
  ## Rule Validation
410
499
 
500
+
411
501
  Use `validateRules()` to detect common mistakes in rule patterns before running segmentation:
412
502
 
413
503
  ```typescript
@@ -619,32 +709,10 @@ const segments = segmentPages(pages, {
619
709
  lineEndsWith: ['\\.'],
620
710
  split: 'after',
621
711
  occurrence: 'last',
622
- maxSpan: 1
623
- }]
624
- });
625
- ```
626
-
627
- ### Page Fallback for Unmatched Content
628
-
629
- When using `maxSpan` to group matches per page, use `fallback: 'page'` to prevent unmatched pages from merging with adjacent segments:
630
-
631
- ```typescript
632
- const segments = segmentPages(pages, {
633
- rules: [{
634
- template: '{{tarqim}}', // Match punctuation marks
635
- split: 'after',
636
- occurrence: 'last',
637
- maxSpan: 1,
638
- fallback: 'page' // If no punctuation found, segment the page anyway
639
712
  }]
640
713
  });
641
714
  ```
642
715
 
643
- **Without `fallback`**: Pages without matches merge into the next segment
644
- **With `fallback: 'page'`**: Each page becomes its own segment even without matches
645
-
646
- > **Future extensions**: The `fallback` option may support additional values like `'skip'` (omit unmatched content) or `'line'` (split at line breaks) in future versions.
647
-
648
716
  ### Multiple Rules with Priority
649
717
 
650
718
  ```typescript
@@ -912,9 +980,7 @@ type SplitRule = {
912
980
  // Split behavior
913
981
  split?: 'at' | 'after'; // Default: 'at'
914
982
  occurrence?: 'first' | 'last' | 'all';
915
- maxSpan?: number;
916
983
  fuzzy?: boolean;
917
- fallback?: 'page'; // NEW: Page-boundary fallback
918
984
 
919
985
  // Constraints
920
986
  min?: number;
@@ -1042,28 +1108,6 @@ The library concatenates all pages into a single string for pattern matching acr
1042
1108
 
1043
1109
  For typical book processing (up to 6,000 pages), memory usage is well within Node.js defaults. For very large books (40,000+ pages), ensure adequate heap size.
1044
1110
 
1045
- ### `maxSpan` Sliding Window Behavior
1046
-
1047
- The `maxSpan` option uses a **sliding window algorithm** based on page ID difference:
1048
-
1049
- ```typescript
1050
- // maxSpan = maximum page ID difference when looking ahead for split points
1051
- // Algorithm prefers LONGER segments by looking as far ahead as allowed
1052
-
1053
- // Pages [1, 2, 3, 4] with maxSpan: 1, occurrence: 'last'
1054
- // Window from page 1: pages 1-2 (diff <= 1), splits at page 2's last match
1055
- // Window from page 3: pages 3-4 (diff <= 1), splits at page 4's last match
1056
- // Result: 2 segments spanning pages 1-2 and 3-4
1057
-
1058
- // Pages [1, 5, 10] with maxSpan: 1, occurrence: 'last'
1059
- // Window from page 1: only page 1 (5-1=4 > 1), splits at page 1
1060
- // Window from page 5: only page 5 (10-5=5 > 1), splits at page 5
1061
- // Window from page 10: only page 10, splits at page 10
1062
- // Result: 3 segments (pages too far apart to merge)
1063
- ```
1064
-
1065
- This is intentional for books where page IDs represent actual page numbers. With `occurrence: 'last'`, the algorithm finds the last match within the lookahead window, creating longer segments where possible.
1066
-
1067
1111
  ## For AI Agents
1068
1112
 
1069
1113
  See [AGENTS.md](./AGENTS.md) for:
package/dist/index.d.mts CHANGED
@@ -238,38 +238,9 @@ type SplitBehavior = {
238
238
  * - `'first'`: Only split at the first match
239
239
  * - `'last'`: Only split at the last match
240
240
  *
241
- * When `maxSpan` is set, occurrence filtering is applied per sliding
242
- * window rather than globally. With `'last'`, the algorithm prefers
243
- * longer segments by looking as far ahead as allowed before selecting
244
- * the last match in the window.
245
- *
246
241
  * @default 'all'
247
242
  */
248
243
  occurrence?: 'first' | 'last' | 'all';
249
- /**
250
- * Maximum page ID difference allowed when looking ahead for split points.
251
- *
252
- * Uses a sliding window algorithm that prefers longer segments:
253
- * 1. Start from the first page of the current segment
254
- * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
255
- * 3. Apply occurrence filter (e.g., 'last') to select a match
256
- * 4. Next window starts from the page after the match
257
- *
258
- * Examples:
259
- * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
260
- * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
261
- * - `undefined` = no limit (entire content treated as one group)
262
- *
263
- * Note: With non-consecutive page IDs, the algorithm uses actual ID
264
- * difference, not array index. Pages 1 and 5 have a difference of 4.
265
- *
266
- * @example
267
- * // Split at last period, looking up to 1 page ahead
268
- * // Pages 1,2: split at page 2's last period
269
- * // Page 3: split at page 3's last period
270
- * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
271
- */
272
- maxSpan?: number;
273
244
  /**
274
245
  * Enable diacritic-insensitive matching for Arabic text.
275
246
  *
@@ -354,12 +325,6 @@ type RuleConstraints = {
354
325
  * { lineStartsWith: ['{{bab}}'], split: 'at', meta: { type: 'chapter' } }
355
326
  */
356
327
  meta?: Record<string, unknown>;
357
- /**
358
- * Fallback behavior when no matches are found within a maxSpan boundary.
359
- * - 'page': Create split points at page boundaries
360
- * - undefined: No fallback (current behavior)
361
- */
362
- fallback?: 'page';
363
328
  /**
364
329
  * Page-start guard: only allow this rule to match at the START of a page if the
365
330
  * previous page's last non-whitespace character matches this pattern.
@@ -388,7 +353,7 @@ type RuleConstraints = {
388
353
  * Each rule must specify:
389
354
  * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
390
355
  * `lineStartsAfter`, or `lineEndsWith`
391
- * - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `maxSpan`, `fuzzy`
356
+ * - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `fuzzy`
392
357
  * - **Constraints** (optional): `min`, `max`, `meta`
393
358
  *
394
359
  * @example
@@ -424,7 +389,6 @@ type SplitRule = PatternType & SplitBehavior & RuleConstraints;
424
389
  type Page = {
425
390
  /**
426
391
  * Unique page/entry ID used for:
427
- * - `maxSpan` grouping (segments spanning multiple pages)
428
392
  * - `min`/`max` constraint filtering
429
393
  * - `from`/`to` tracking in output segments
430
394
  */
@@ -625,6 +589,21 @@ type SegmentationOptions = {
625
589
  * rule's metadata is used for each segment.
626
590
  */
627
591
  rules?: SplitRule[];
592
+ /**
593
+ * Attach debugging provenance into `segment.meta` indicating which rule and/or breakpoint
594
+ * created the segment boundary.
595
+ *
596
+ * This is opt-in because it increases output size.
597
+ *
598
+ * When enabled (default metaKey: `_flappa`), segments may include:
599
+ * `meta._flappa.rule` and/or `meta._flappa.breakpoint`.
600
+ */
601
+ debug?: boolean | {
602
+ /** Where to store provenance in meta. @default '_flappa' */
603
+ metaKey?: string;
604
+ /** Which kinds of provenance to include. @default ['rule','breakpoint'] */
605
+ include?: Array<'rule' | 'breakpoint'>;
606
+ };
628
607
  /**
629
608
  * Maximum pages per segment before breakpoints are applied.
630
609
  *
@@ -770,7 +749,7 @@ type Segment = {
770
749
  /**
771
750
  * Types of validation issues that can be detected.
772
751
  */
773
- type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate';
752
+ type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern';
774
753
  /**
775
754
  * A validation issue found in a pattern.
776
755
  */
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/replace.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/recovery.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EY,cD/bC,WC+bqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAuBA;AA+CA;;;;;;AAsIA;;;;AC9tBA;AAKA;AAUA;;;;;;AAwGA;;;;ACxHA;AA2DA;;;;;;;;AC6SA;;AAAqD,cJnNxC,wBImNwC,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJlTrD;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA,KApXK,YAAA,GAoXW;EAqCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAuBA;AA+CA;;;;;;AAsIA;;;;AC9tBA;AAKA;AAUA;;;;;;AAwGA;;;;ACxHA,KF8CK,eAAA,GE9CkB;EA2DV;EAA4B,QAAA,EAAA,MAAA;CAAgB;;;;;;AC6SzD;;;;;;;;AChTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;;KJlnBK,qBAAA;EKnEO;EAcA,cAAA,EAAA,MAAA,EAAA;AAEZ,CAAA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;;;AC9QA;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA;AAKA,KR8GK,sBAAA,GQ9GwB;EAChB;EACF,eAAA,EAAA,MAAA,EAAA;CACG;;;AAId;AA2BE;AAmnBF;;;;;;;;;AAsDA;;;;;;KRrkBK,mBAAA;;;;;;;;;;;;;;KAeA,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+EO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA8DC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBL,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CA,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA8CM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;WAwDL;;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;AA1uBM;AA4BG;AA8BM;AAyDrB,KCpIO,mBAAA,GDoIY,gBAAA,GAAA,eAAA,GAAA,WAAA;AAAA;;;AAkBlB,KCjJM,eAAA,GDiJN;EACA,IAAA,ECjJI,mBDiJJ;EACA,OAAA,EAAA,MAAA;EAAmB,UAAA,CAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;AAAsC,KCpW1B,oBAAA,GDoW0B;EAAgB,cAAA,CAAA,EAAA,CCnWhC,eDmWgC,GAAA,SAAA,CAAA,EAAA;EAAe,eAAA,CAAA,EAAA,CClW9C,eDkW8C,GAAA,SAAA,CAAA,EAAA;EAkBzD,YAAI,CAAA,EAAA,CCnXI,eDmXJ,GAAA,SAAA,CAAA,EAAA;EAqCJ,QAAA,CAAA,ECvZG,eDuZW;AA0E1B,CAAA;AA8BA;AAuBA;AA+CA;;;;;;AAsIA;;;;AC9tBA;AAKA;AAUA;;;;;AAI8B,cAoGjB,aApGiB,EAAA,CAAA,KAAA,EAoGO,SApGP,EAAA,EAAA,GAAA,CAoGsB,oBApGtB,GAAA,SAAA,CAAA,EAAA;;;AFkC9B;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA2FzB;AAAkD;AAgIlD;AAAwB,cEzTX,iBFyTW,EAAA,CAAA,KAAA,EEzTiB,IFyTjB,EAAA,EAAA,KAAA,CAAA,EEzTiC,WFyTjC,EAAA,EAAA,GEzTiD,IFyTjD,EAAA;;;;;AAkBxB;AAqCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;AAsIA;;;;AC9tBA;AAKA;AAUA;;;;;;AAwGA;;;;ACxHA;AA2DA;;;;;;;;AC6SA;;;;;;;;AChTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;cDtVa,sBAAuB,iBAAiB,wBAAmB;;;;AJlTxE;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;AAsIA;;;;AC9tBA;AAKA;AAUA;;;;;;AAwGA;;;;ACxHA;AA2DA;;;;;;;;AC6SA;AAAoC,cChTvB,sBDgTuB,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;AChTpC;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBa,cA5WA,+BA4WsD,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAgBnE;;;;ACrrBA;AAcA;AAEA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;;cF0Ga,gBAAgB;AGxX7B;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA;AAKA;;;;;;AAOY,cJ6ZC,cI7ZmB,EAAA,CAAA,KAQpB,EAAA,MAMC,EAAK,GAAA,OAAA;AAahB;AAmnBF;;;;;AAO2B,KJ7Of,YAAA,GI6Oe;EAEd;;;AA6Cb;;EAEgF,OAAA,EAAA,MAAA;EACnE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cJxHA,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;ALxoBA,KM7CD,wBAAA,GN6C8E;EA+F7E,IAAA,CAAA,EAAA,MAAA;;;;ECnIR,WAAA,CAA
A,EAAA,MAAY;EA4BZ,wBAAe,CAAA,EAAA,OAAA;EA8Bf,yBAAqB,CAAA,EAAA,OAAA;EAiCrB,MAAA,CAAA,EAAA,aAAA,GAAsB,OAAA;EAwBtB,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAmB,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAenB,cAAW,CAAA,EKjIK,MLiIL,EAAA;EACV,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACA;AACA,KKhIM,uBAAA,GLgIN;EACA,IAAA,EAAA,MAAA;EACA,MAAA,EAAA,MAAA;CAAmB;AAYpB,KK5IO,sBAAA,GL4IM;EA+EN,OAAA,EAAA,MAAS;EAYhB,KAAA,EAAA,MAAA;EAoHO,QAAA,EKxVE,uBLwVO,EAAA;CAAG;;;;AAkBZ,cKrGC,uBLqGG,EAAA,CAAA,KAAA,EKpGL,ILoGK,EAAA,EAAA,OAAA,CAAA,EKnGH,wBLmGG,EAAA,GKlGb,sBLkGa,EAAA;;;AAjQX,KMtHO,wBAAA,GNsHY;EAenB,WAAA,CAAA,EAAW,MAAA;EACV,WAAA,CAAA,EAAA,MAAA;EACA,QAAA,CAAA,EAAA,MAAA;EACA,IAAA,CAAA,EAAA,MAAA;EACA,yBAAA,CAAA,EAAA,OAAA;EACA,YAAA,CAAA,EAAA,OAAA;EAAmB,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;EAYpB,WAAA,CAAA,EAAA,MAAa;EA+EN,YAAS,CAAA,EAAA,MAAA;EAYhB,iBAAA,CAAe,EAAA,MAAA;AAoHpB,CAAA;AAAwB,KMxVZ,wBAAA,GNwVY;EAAc,IAAA,EAAA,MAAA;EAAgB,OAAA,EAAA,MAAA;EAAe,MAAA,EAAA,MAAA;EAkBzD,YAAI,EAAA,MAAA,EAAA;AAqChB,CAAA;AA0EY,KMldA,wBAAA,GNkdsB;EA8BjB,OAAA,EAAM,MAAA;EAuBX,KAAA,EAAA,MAAA;EA+CA,QAAA,EMnjBE,wBNmjBiB,EAAA;CAMjB;;;AAgId;;;;AC9tBY,cK6QC,yBL7QkB,EAAA,CAAA,KAAA,EK8QpB,IL9QoB,EAAA,EAAA,OAAA,CAAA,EK+QjB,wBL/QiB,EAAA,GKgR5B,wBLhR4B,EAAA;;;;AFqD/B;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KO7GO,eAAA,GP6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA2FzB;AAAkD;AAgIlD;;;;;AAkBA;AAqCA;AA0EA;AA8BA;AAuBA;AA+CA;;;AA6DkB,cOvkBL,mBPukBK,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GOvkB8B,ePukB9B,EAAA;;;AAyElB;;;;AC9tBA;AAKA;AAUA;;;;;AAI8B,cM2HjB,wBN3HiB,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EM2HmC,eN3HnC,EAAA,EAAA,GAAA,MAAA;AAoG9B;;;;ACxHA;AA2DA;AAAyC,cK2G5B,oBL3G4B,EAAA,CAAA,QAAA,EK4G3B,eL5G2B,EAAA,EAAA,GAAA;EAAgB,WAAA,EAAA,gBAAA,GAAA,iBAAA;EAAgB,KAAA,EAAA,OAAA;EAAI,QAAA,CAAA,EAAA,MAAA;;;;AC6S7E;;;;AAAwE,cIjK3D,kBJiK2D,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;;;;EChT3D,QAAA,CAAA,EAAA,MAAA;EA0QA,QAAA,EGpHC,eHoHD,EAAA;AAsDb,CAAA,GAAa,IAAA;;;ALlUA,KS5DD,sBAAA,GT4D8E;EA+F7E,IAAA,EAAA,cAAA;;;;ECnIR,KAAA,CAAA,EAAA,
OAAY,GAAA,YAAA;EA4BZ,QAAA,EAAA,MAAA,EAAe;AAAA,CAAA,GA8Bf;EAiCA,IAAA,EAAA,WAAA;EAwBA,SAAA,EAAA,CAAA,IAAA,EQxIwC,SRwIrB,EAAA,KAAA,EAAA,MAAA,EAAA,GAAA,OAAA;AAAA,CAAA;AAgBlB,KQtJM,iBAAA,GRsJN;EACA,OAAA,EQtJO,mBRsJP;EACA,KAAA,EQtJK,IRsJL,EAAA;EACA,QAAA,EQtJQ,ORsJR,EAAA;EACA,QAAA,EQtJQ,sBRsJR;CAAmB;AAYpB,KQ/JO,oBAAA,GR+JM;EA+EN,OAAA,EAAA;IAYP,IAAA,EAAA,YAAe,GAAA,wBAsDH;IA8DL,SAAS,EAAA,MAAA;IAAG,aAAA,EAAA,MAAA;IAAc,SAAA,EAAA,MAAA;IAAgB,UAAA,EAAA,MAAA;EAAe,CAAA;EAkBzD,KAAA,CAAA,EQxXA,KRwXI,CAAA;IAqCJ,SAAA,EAAA,MAAc;IA0Ed,QAAA,EAAU,MAAA;IA8BL,aAAM,EAAA,MAAA;IAuBX,UAAW,EAAA,MAAA;EA+CX,CAAA,CAAA;EAME,OAAA,EQ3kBD,KR2kBC,CAAA;IASF,IAAA,EAAA,MAAA;IA8CM,oBAAA,EAAA,MAAA;IAwDL,sBAAA,CAAA,EAAA,MAAA;IAAM,qBAAA,CAAA,EAAA,MAAA;IAiBP,YAAO,EAAA,MA6BR;;;;IC3vBC,KAAA,CAAA,EAAA,MAAA,EAAA;EAKA,CAAA,CAAA;EAUA,MAAA,EAAA,MAAA,EAAA;EACU,QAAA,EAAA,MAAA,EAAA;CACC;KOiBlB,oBAAA,GPhBe,MAAA,GAAA,YAAA,GAAA,qBAAA;AACL,iBOgoBC,qCAAA,CPhoBD,KAAA,EOioBJ,IPjoBI,EAAA,EAAA,QAAA,EOkoBD,OPloBC,EAAA,EAAA,OAAA,EOmoBF,mBPnoBE,EAAA,QAAA,EOooBD,sBPpoBC,EAAA,KAAA,EAAA;EAAe,IAAA,CAAA,EAAA,YAAA,GAAA,wBAAA;EAoGjB,gBA0CZ,CAAA,EOyf0B,oBPniByB;;UOqiBvC;YAAgC;AN7pB7C,CAAA;AA2Da,iBM+oBG,6BAAA,CNznBf,IAAA,EM0nBS,iBN1nBT,EAAA,EAAA,IAtBwE,CAsBxE,EAAA;EAtBwC,IAAA,CAAA,EAAA,YAAA,GAAA,wBAAA;EAAgB,gBAAA,CAAA,EMipBuB,oBNjpBvB;CAAgB,CAAA,EAAA;EAAI,MAAA,EMkpBhE,oBNlpBgE;YMkpBhC"}
1
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/segmentation/fuzzy.ts","../src/segmentation/types.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/replace.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/recovery.ts"],"sourcesContent":[],"mappings":";;AAkEA;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EY,cDzZC,WCyZqB,EAAA,CAAA,CAAA,EAAA,MAAc,EAAA,GAAA,MAAA;AA8BhD;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+SA;AAAoC,cJrNvB,wBIqNuB,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;AJpTpC;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA,KA/UK,YAAA,GA+UW;EAoCJ;EA0EA,KAAA,EAAA,MAAU;AA8BtB,CAAA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;KD7EK,eAAA;EE9CO;EA2DC,QAAA,EAAA,MAAA;CAA4B;;;;;;;AC+SzC;;;;;;;;AClTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBA;;KJlnBK,qBAAA;;EKnEO,cAAA,EAAA,MAAA,EAAA;AAcZ,CAAA;AAEA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;;;AC9QA;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA,KRmHK,sBAAA,GQnH6B;EAKtB;EACC,eAAA,EAAA,MAAA,EAAA;CACF;;;;AAKX;AA2BE;AAmnBF;;;;;;;;;AAsDA;;;;;KRrkBK,mBAAA,GQwkB+C;;;;;;;;;;;;;;KRzjB/C,WAAA,GACC,eACA,kBACA,wBACA,yBACA;;;;;;;KAYD,aAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAiDO,SAAA;;;;;;;KAYP,eAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAyCS;;;;;;;;;;;;SAaH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAuDC,SAAA,GAAY,cAAc,gBAAgB;;;;;;;;;;;;;KAkB1C,IAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAoCA,cAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAqCE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAqCF,UAAA,YAAsB;;;;;;;;;;;;;;;;;;;;;;;;;UA8BjB,MAAA;;;;;;;;;;;;;;;;;;;;;;KAuBL,WAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KA+CA,mBAAA;;;;;;YAME;;;;;;;;UASF;;;;;;;;;;;;;;cAiBY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBA+CN;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDL;
;;;;;;;;;;;;;;;KAiBD,OAAA;;;;;;;;;;;;;;;;;;;;;;;;;;SA6BD;;;;AAttBM;AA4BG;AA8BM;AAyDrB,KCpIO,mBAAA,GDoIY,gBAAA,GAAA,eAAA,GAAA,WAAA,GAAA,eAAA;AAAA;;;AAkBlB,KCjJM,eAAA,GDiJN;EACA,IAAA,ECjJI,mBDiJJ;EACA,OAAA,EAAA,MAAA;EAAmB,UAAA,CAAA,EAAA,MAAA;AAAA,CAAA;AA6DzB;AAAkD;AAyHlD;;AAAsC,KC/T1B,oBAAA,GD+T0B;EAAgB,cAAA,CAAA,EAAA,CC9ThC,eD8TgC,GAAA,SAAA,CAAA,EAAA;EAAe,eAAA,CAAA,EAAA,CC7T9C,eD6T8C,GAAA,SAAA,CAAA,EAAA;EAkBzD,YAAI,CAAA,EAAA,CC9UI,eD8UJ,GAAA,SAAA,CAAA,EAAA;EAoCJ,QAAA,CAAA,ECjXG,eDiXW;AA0E1B,CAAA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;AAIe,cAuGF,aAvGE,EAAA,CAAA,KAAA,EAuGsB,SAvGtB,EAAA,EAAA,GAAA,CAuGqC,oBAvGrC,GAAA,SAAA,CAAA,EAAA;;;AFkCf;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAuCtB,KEpJO,WAAA,GAAc,WFoJV,CEpJsB,mBFoJtB,CAAA,SAAA,CAAA,CAAA,CAAA,MAAA,CAAA;;;;;;;AAKS;AA6DzB;AAAkD;AAyHlD;AAAwB,cEpRX,iBFoRW,EAAA,CAAA,KAAA,EEpRiB,IFoRjB,EAAA,EAAA,KAAA,CAAA,EEpRiC,WFoRjC,EAAA,EAAA,GEpRiD,IFoRjD,EAAA;;;;;AAkBxB;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+SA;;;;;;;;AClTA;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBA;AAqBA;AAgBa,cDpVA,YCoVsF,EAAA,CAAA,KAAA,EDpV/D,ICoV+D,EAAA,EAAA,OAAA,EDpV9C,mBCoV8C,EAAA,GDpV3B,OCoV2B,EAAA;;;;ALxoBnG;AA+FA;;;;;ACnIiB;AA4BG;AA8BM;AAiCC;AAwBH;;;;;;;AAoBC;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;;;;;AAwJA;;;;AC1sBA;AAKA;AAUA;;;;;;AA2GA;;;;AC3HA;AA2DA;;;;;;;;AC+Sa,cClTA,sBDmXZ,EAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,MAAA;;;;;;;;ACnXD;AA0QA;AAsDA;AA2CA;AAWA;AAuKA;AA6CA;AAuBa,cAvVA,+BAuVmC,EAAA,CAAA,QAAA,EAAA,MAAA,EAAA,GAAA,MAAA;AAqBhD;AAgBA;;;;ACrrBA;AAcA;AAEA;AAwQA;;;;;;;;AClRA;AAaA;AAOA;AA2OA;;;;;;cF0Ga,gBAAgB;;AGxX7B;AA+EA;AAgEA;AAuBA;AAiCA;;;;AC7MA;AAKA;;;;;AAIoC,cJgavB,cIhauB,EAAA,CAAA,KAAA,EAAA,MAAA,EAAA,GAAA,OAAA;AAGpC;AA2BE;AAmnBF;;;;AAIc,KJ1OF,YAAA,GI0OE;EAGa;;;;AA+C3B;EACU,OAAA,EAAA,MAAA;EACsE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;cJvHnE,mHAIV;;;;;;;;;;;;;;;;;;;;cAyCU;;;;;;;;;;;;;;;;;;;;;;cAuBA,uCAAmC;;;;;;;;;;;;;cAqBnC;;;;;;;;;;;;;;;cAgBA;;;ALxoBA,KM7CD,wB
AAA,GN6C8E;EA+F7E,IAAA,CAAA,EAAA,MAAA;;;;ECnIR,WAAA,CAAA,EAAA,MAAY;EA4BZ,wBAAe,CAAA,EAAA,OAAA;EA8Bf,yBAAqB,CAAA,EAAA,OAAA;EAiCrB,MAAA,CAAA,EAAA,aAAA,GAAsB,OAAA;EAwBtB,UAAA,CAAA,EAAA,CAAA,IAAA,EAAA,MAAmB,EAAA,MAAA,EAAA,MAAA,EAAA,GAAA,OAAA;EAenB,cAAW,CAAA,EKjIK,MLiIL,EAAA;EACV,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;CACA;AACA,KKhIM,uBAAA,GLgIN;EACA,IAAA,EAAA,MAAA;EACA,MAAA,EAAA,MAAA;CAAmB;AAYpB,KK5IO,sBAAA,GL4IM;EAiDN,OAAA,EAAA,MAAS;EAYhB,KAAA,EAAA,MAAA;EA6GO,QAAA,EKnTE,uBLmTO,EAAA;CAAG;;;;AAkBZ,cKhEC,uBLgEG,EAAA,CAAA,KAAA,EK/DL,IL+DK,EAAA,EAAA,OAAA,CAAA,EK9DH,wBL8DG,EAAA,GK7Db,sBL6Da,EAAA;;;AA5NX,KMtHO,wBAAA,GNsHY;EAenB,WAAA,CAAA,EAAW,MAAA;EACV,WAAA,CAAA,EAAA,MAAA;EACA,QAAA,CAAA,EAAA,MAAA;EACA,IAAA,CAAA,EAAA,MAAA;EACA,yBAAA,CAAA,EAAA,OAAA;EACA,YAAA,CAAA,EAAA,OAAA;EAAmB,UAAA,CAAA,EAAA,OAAA,GAAA,OAAA;EAYpB,WAAA,CAAA,EAAA,MAAa;EAiDN,YAAS,CAAA,EAAA,MAAA;EAYhB,iBAAA,CAAe,EAAA,MAAA;AA6GpB,CAAA;AAAwB,KMnTZ,wBAAA,GNmTY;EAAc,IAAA,EAAA,MAAA;EAAgB,OAAA,EAAA,MAAA;EAAe,MAAA,EAAA,MAAA;EAkBzD,YAAI,EAAA,MAAA,EAAA;AAoChB,CAAA;AA0EY,KM5aA,wBAAA,GN4asB;EA8BjB,OAAA,EAAM,MAAA;EAuBX,KAAA,EAAA,MAAA;EA+CA,QAAA,EM7gBE,wBN6gBiB,EAAA;CAMjB;;;;AAkJd;;;cM7ba,mCACF,kBACG,6BACX;;;;AP3NH;AA+FA;;;;;ACnIiB;AA4BG;AA+Df,KO7GO,eAAA,GP6Ge;EAwBtB;EAeA,KAAA,EAAA,MAAA;EACC;EACA,KAAA,EAAA,MAAA;EACA;EACA,KAAA,EAAA,MAAA;EACA;EAAmB,QAAA,EAAA,MAAA;AAAA,CAAA;AA6DzB;AAAkD;AAyHlD;;;;;AAkBA;AAoCA;AA0EA;AA8BA;AAuBA;AA+CA;;;AAgCwB,cOpgBX,mBPogBW,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GOpgBwB,ePogBxB,EAAA;;;;AAwHxB;;;;AC1sBA;AAKA;AAUA;;;;AAIe,cM2HF,wBN3HE,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,QAAA,EM2HkD,eN3HlD,EAAA,EAAA,GAAA,MAAA;;AAuGf;;;;AC3HA;AA2Da,cK2GA,oBLrFZ,EAAA,CAAA,QAAA,EKsFa,eLtFb,EAAA,EAAA,GAAA;EAtBwC,WAAA,EAAA,gBAAA,GAAA,iBAAA;EAAgB,KAAA,EAAA,OAAA;EAAgB,QAAA,CAAA,EAAA,MAAA;CAAI;;;;AC+S7E;;;AAAwE,cInK3D,kBJmK2D,EAAA,CAAA,IAAA,EAAA,MAAA,EAAA,GAAA;EAAA,QAAA,EAAA,MAAA;;;;EClT3D,QAAA,EGsJC,eH7Ib,EAAA;AAiQD,CAAA,GAAa,IAAA;;;AL5QA,KS5DD,sBAAA,GT4D8E;EA+F7E,IAAA,EAAA,cAAA;;;;ECnIR,KAAA,CAAA,EAAA,OAAY,GAAA,YAAA;EA4BZ,QAAA,EAAA,MAAA,EAAe;AAAA
,CAAA,GA8Bf;EAiCA,IAAA,EAAA,WAAA;EAwBA,SAAA,EAAA,CAAA,IAAA,EQxIwC,SRwIrB,EAAA,KAAA,EAAA,MAAA,EAAA,GAAA,OAAA;AAAA,CAAA;AAgBlB,KQtJM,iBAAA,GRsJN;EACA,OAAA,EQtJO,mBRsJP;EACA,KAAA,EQtJK,IRsJL,EAAA;EACA,QAAA,EQtJQ,ORsJR,EAAA;EACA,QAAA,EQtJQ,sBRsJR;CAAmB;AAYpB,KQ/JO,oBAAA,GR+JM;EAiDN,OAAA,EAAA;IAYP,IAAA,EAAA,YAAe,GAAA,wBAsDH;IAuDL,SAAS,EAAA,MAAA;IAAG,aAAA,EAAA,MAAA;IAAc,SAAA,EAAA,MAAA;IAAgB,UAAA,EAAA,MAAA;EAAe,CAAA;EAkBzD,KAAA,CAAA,EQnVA,KRmVI,CAAA;IAoCJ,SAAA,EAAA,MAAc;IA0Ed,QAAA,EAAU,MAAA;IA8BL,aAAM,EAAA,MAAA;IAuBX,UAAW,EAAA,MAAA;EA+CX,CAAA,CAAA;EAME,OAAA,EQriBD,KRqiBC,CAAA;IASF,IAAA,EAAA,MAAA;IAiBY,oBAAA,EAAA,MAAA;IA+CN,sBAAA,CAAA,EAAA,MAAA;IAwDL,qBAAA,CAAA,EAAA,MAAA;IAAM,YAAA,EAAA,MAAA;IAiBP,MAAO,EAAA,WA6BR,GAAA,oBAAM,GAAA,WAAA,GAAA,sBAAA,GAAA,qBAAA;;;;ECvuBL,CAAA,CAAA;EAKA,MAAA,EAAA,MAAA,EAAA;EAUA,QAAA,EAAA,MAAA,EAAA;CACU;KOkBjB,oBAAA,GPjBkB,MAAA,GAAA,YAAA,GAAA,qBAAA;AACH,iBOioBJ,qCAAA,CPjoBI,KAAA,EOkoBT,IPloBS,EAAA,EAAA,QAAA,EOmoBN,OPnoBM,EAAA,EAAA,OAAA,EOooBP,mBPpoBO,EAAA,QAAA,EOqoBN,sBProBM,EAAA,IAwGpB,CAxGoB,EAAA;EACL,IAAA,CAAA,EAAA,YAAA,GAAA,wBAAA;EAAe,gBAAA,CAAA,EOuoBH,oBPvoBG;AAuG9B,CAAA,CAAA,EAAa;UOkiBA;YAAgC;;AN7pBjC,iBM0sBI,6BAAA,CN1sBU,IAAA,EM2sBhB,iBN3sB2B,EAAA,EAAA,IA2DoB,CA3DpB,EAAA;EA2DxB,IAAA,CAAA,EAAA,YAAA,GAsBZ,wBAAA;EAtBwC,gBAAA,CAAA,EMipBuC,oBNjpBvC;CAAgB,CAAA,EAAA;EAAgB,MAAA,EMkpB5D,oBNlpB4D;EAAI,QAAA,EMkpBhC,ONlpBgC,EAAA"}
package/dist/index.mjs CHANGED
@@ -645,6 +645,10 @@ const buildBareTokenRegex = () => {
645
645
  * Validates a single pattern for common issues.
646
646
  */
647
647
  const validatePattern = (pattern, seenPatterns) => {
648
+ if (!pattern.trim()) return {
649
+ message: "Empty pattern is not allowed",
650
+ type: "empty_pattern"
651
+ };
648
652
  if (seenPatterns.has(pattern)) return {
649
653
  message: `Duplicate pattern: "${pattern}"`,
650
654
  type: "duplicate"
@@ -727,7 +731,7 @@ const validateRules = (rules) => {
727
731
  hasIssues = true;
728
732
  }
729
733
  }
730
- if ("template" in rule && rule.template) {
734
+ if ("template" in rule && rule.template !== void 0) {
731
735
  const seenPatterns = /* @__PURE__ */ new Set();
732
736
  const issue = validatePattern(rule.template, seenPatterns);
733
737
  if (issue) {
@@ -1245,16 +1249,71 @@ const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPositi
1245
1249
  */
1246
1250
  const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
1247
1251
  const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
1248
- for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
1252
+ for (let i = 0; i < expandedBreakpoints.length; i++) {
1253
+ const { rule, regex, excludeSet, skipWhenRegex } = expandedBreakpoints[i];
1249
1254
  if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
1250
1255
  if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
1251
1256
  if (skipWhenRegex?.test(remainingContent)) continue;
1252
- if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
1257
+ if (regex === null) return {
1258
+ breakpointIndex: i,
1259
+ breakPos: handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages),
1260
+ rule
1261
+ };
1253
1262
  const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
1254
- if (breakPos > 0) return breakPos;
1263
+ if (breakPos > 0) return {
1264
+ breakpointIndex: i,
1265
+ breakPos,
1266
+ rule
1267
+ };
1255
1268
  }
1256
- return -1;
1269
+ return null;
1270
+ };
1271
+
1272
+ //#endregion
1273
+ //#region src/segmentation/debug-meta.ts
1274
+ const resolveDebugConfig = (debug) => {
1275
+ if (!debug) return null;
1276
+ if (debug === true) return {
1277
+ includeBreakpoint: true,
1278
+ includeRule: true,
1279
+ metaKey: "_flappa"
1280
+ };
1281
+ if (typeof debug !== "object") return null;
1282
+ const metaKey = debug.metaKey;
1283
+ const include = debug.include;
1284
+ const includeRule = Array.isArray(include) ? include.includes("rule") : true;
1285
+ return {
1286
+ includeBreakpoint: Array.isArray(include) ? include.includes("breakpoint") : true,
1287
+ includeRule,
1288
+ metaKey: typeof metaKey === "string" && metaKey ? metaKey : "_flappa"
1289
+ };
1290
+ };
1291
+ const getRulePatternType = (rule) => {
1292
+ if ("lineStartsWith" in rule) return "lineStartsWith";
1293
+ if ("lineStartsAfter" in rule) return "lineStartsAfter";
1294
+ if ("lineEndsWith" in rule) return "lineEndsWith";
1295
+ if ("template" in rule) return "template";
1296
+ return "regex";
1297
+ };
1298
+ const isPlainObject = (v) => Boolean(v) && typeof v === "object" && !Array.isArray(v);
1299
+ const mergeDebugIntoMeta = (meta, metaKey, patch) => {
1300
+ const out = meta ? { ...meta } : {};
1301
+ const existing = out[metaKey];
1302
+ out[metaKey] = {
1303
+ ...isPlainObject(existing) ? existing : {},
1304
+ ...patch
1305
+ };
1306
+ return out;
1257
1307
  };
1308
+ const buildRuleDebugPatch = (ruleIndex, rule) => ({ rule: {
1309
+ index: ruleIndex,
1310
+ patternType: getRulePatternType(rule)
1311
+ } });
1312
+ const buildBreakpointDebugPatch = (breakpointIndex, rule) => ({ breakpoint: {
1313
+ index: breakpointIndex,
1314
+ kind: rule.pattern === "" ? "pageBoundary" : "pattern",
1315
+ pattern: rule.pattern
1316
+ } });
1258
1317
 
1259
1318
  //#endregion
1260
1319
  //#region src/segmentation/breakpoint-processor.ts
@@ -1338,15 +1397,20 @@ const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds,
1338
1397
  const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
1339
1398
  if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
1340
1399
  const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
1341
- if (exclusionBreak > 0) return exclusionBreak;
1400
+ if (exclusionBreak > 0) return { breakOffset: exclusionBreak };
1342
1401
  }
1343
- const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
1402
+ const patternMatch = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
1344
1403
  expandedBreakpoints,
1345
1404
  normalizedPages,
1346
1405
  pageIds,
1347
1406
  prefer
1348
1407
  });
1349
- return patternBreak > 0 ? patternBreak : windowEndPosition;
1408
+ if (patternMatch && patternMatch.breakPos > 0) return {
1409
+ breakOffset: patternMatch.breakPos,
1410
+ breakpointIndex: patternMatch.breakpointIndex,
1411
+ breakpointRule: patternMatch.rule
1412
+ };
1413
+ return { breakOffset: windowEndPosition };
1350
1414
  };
1351
1415
  /**
1352
1416
  * Advances cursor position past any leading whitespace.
@@ -1362,12 +1426,13 @@ const skipWhitespace$1 = (content, startPos) => {
1362
1426
  *
1363
1427
  * Uses precomputed boundary positions for O(log n) page attribution lookups.
1364
1428
  */
1365
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
1429
+ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey) => {
1366
1430
  const result = [];
1367
1431
  const fullContent = segment.content;
1368
1432
  let cursorPos = 0;
1369
1433
  let currentFromIdx = fromIdx;
1370
1434
  let isFirstPiece = true;
1435
+ let lastBreakpoint = null;
1371
1436
  const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
1372
1437
  logger?.debug?.("[breakpoints] boundaryPositions built", {
1373
1438
  boundaryPositions,
@@ -1382,7 +1447,9 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1382
1447
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
1383
1448
  const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
1384
1449
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
1385
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
1450
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
1451
+ const meta = debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0;
1452
+ const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta);
1386
1453
  if (finalSeg) result.push(finalSeg);
1387
1454
  break;
1388
1455
  }
@@ -1393,8 +1460,12 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1393
1460
  cursorPos,
1394
1461
  windowEndIdx
1395
1462
  });
1396
- const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
1397
- const breakPos = cursorPos + breakOffset;
1463
+ const found = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
1464
+ if (found.breakpointIndex !== void 0 && found.breakpointRule) lastBreakpoint = {
1465
+ breakpointIndex: found.breakpointIndex,
1466
+ rule: found.breakpointRule
1467
+ };
1468
+ const breakPos = cursorPos + found.breakOffset;
1398
1469
  const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
1399
1470
  const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
1400
1471
  logger?.trace?.("[breakpoints] piece", {
@@ -1403,7 +1474,8 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1403
1474
  pieceLength: pieceContent.length
1404
1475
  });
1405
1476
  if (pieceContent) {
1406
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
1477
+ const includeMeta = isFirstPiece || Boolean(debugMetaKey);
1478
+ const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, debugMetaKey && lastBreakpoint ? mergeDebugIntoMeta(includeMeta ? segment.meta : void 0, debugMetaKey, buildBreakpointDebugPatch(lastBreakpoint.breakpointIndex, lastBreakpoint.rule)) : includeMeta ? segment.meta : void 0, includeMeta);
1407
1479
  if (pieceSeg) result.push(pieceSeg);
1408
1480
  }
1409
1481
  cursorPos = skipWhitespace$1(fullContent, breakPos);
@@ -1418,7 +1490,7 @@ const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPag
1418
1490
  *
1419
1491
  * Note: This is an internal engine used by `segmentPages()`.
1420
1492
  */
1421
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
1493
+ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space", debugMetaKey) => {
1422
1494
  const pageIds = pages.map((p) => p.id);
1423
1495
  const pageIdToIndex = buildPageIdToIndexMap(pageIds);
1424
1496
  const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
@@ -1446,7 +1518,7 @@ const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoi
1446
1518
  result.push(segment);
1447
1519
  continue;
1448
1520
  }
1449
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
1521
+ const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger, debugMetaKey);
1450
1522
  result.push(...broken.map((s) => {
1451
1523
  const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
1452
1524
  const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
@@ -2059,13 +2131,25 @@ const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
2059
2131
  }
2060
2132
  return matches;
2061
2133
  };
2062
- const applyOccurrenceFilter = (rules, splitPointsByRule) => {
2134
+ const applyOccurrenceFilter = (rules, splitPointsByRule, debugMetaKey) => {
2063
2135
  const result = [];
2064
2136
  rules.forEach((rule, index) => {
2065
2137
  const points = splitPointsByRule.get(index);
2066
2138
  if (!points?.length) return;
2067
2139
  const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
2068
- result.push(...filtered);
2140
+ if (!debugMetaKey) {
2141
+ result.push(...filtered.map((p) => ({
2142
+ ...p,
2143
+ ruleIndex: index
2144
+ })));
2145
+ return;
2146
+ }
2147
+ const debugPatch = buildRuleDebugPatch(index, rule);
2148
+ result.push(...filtered.map((p) => ({
2149
+ ...p,
2150
+ meta: mergeDebugIntoMeta(p.meta, debugMetaKey, debugPatch),
2151
+ ruleIndex: index
2152
+ })));
2069
2153
  });
2070
2154
  return result;
2071
2155
  };
@@ -2203,7 +2287,7 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
2203
2287
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
2204
2288
  return [initialSeg];
2205
2289
  };
2206
- const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
2290
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap, debugMetaKey, logger) => {
2207
2291
  logger?.debug?.("[segmenter] collecting split points from rules", {
2208
2292
  contentLength: matchContent.length,
2209
2293
  ruleCount: rules.length
@@ -2218,7 +2302,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
2218
2302
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
2219
2303
  if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
2220
2304
  for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
2221
- return applyOccurrenceFilter(rules, splitPointsByRule);
2305
+ return applyOccurrenceFilter(rules, splitPointsByRule, debugMetaKey);
2222
2306
  };
2223
2307
  /**
2224
2308
  * Finds page breaks within a given offset range using binary search.
@@ -2321,6 +2405,8 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
2321
2405
  */
2322
2406
  const segmentPages = (pages, options) => {
2323
2407
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
2408
+ const debug = resolveDebugConfig(options.debug);
2409
+ const debugMetaKey = debug?.includeRule ? debug.metaKey : void 0;
2324
2410
  logger?.info?.("[segmenter] starting segmentation", {
2325
2411
  breakpointCount: breakpoints.length,
2326
2412
  maxPages,
@@ -2334,7 +2420,7 @@ const segmentPages = (pages, options) => {
2334
2420
  pageIds: pageMap.pageIds,
2335
2421
  totalContentLength: matchContent.length
2336
2422
  });
2337
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
2423
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, debugMetaKey, logger);
2338
2424
  const unique = dedupeSplitPoints(splitPoints);
2339
2425
  logger?.debug?.("[segmenter] split points collected", {
2340
2426
  rawSplitPoints: splitPoints.length,
@@ -2353,7 +2439,7 @@ const segmentPages = (pages, options) => {
2353
2439
  if (maxPages >= 0 && breakpoints.length) {
2354
2440
  logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
2355
2441
  const patternProcessor = (p) => processPattern(p, false).pattern;
2356
- const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
2442
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner, debug?.includeBreakpoint ? debug.metaKey : void 0);
2357
2443
  logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
2358
2444
  return result;
2359
2445
  }