flappa-doormal 2.6.2 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -48,16 +48,6 @@ src/
  ├── fuzzy.test.ts # Fuzzy matching tests
  ├── textUtils.test.ts # Text utility tests
  └── match-utils.test.ts # Utility function tests
-
- test/
- ├── 2576.json # Test data for book 2576 (Sahih Bukhari)
- └── 2588.json # Test data for book 2588 (Al-Mughni)
-
- docs/
- ├── checkpoints/ # AI agent handoff documentation
- │ └── 2025-12-09-handoff.md
- └── reviews/ # Performance analysis reports
- └── 2025-12-10/
  ```
 
  ### Core Components
@@ -69,7 +59,9 @@ docs/
  2. **`tokens.ts`** - Template system
  - `TOKEN_PATTERNS` - Map of token names to regex patterns
  - `expandTokensWithCaptures()` - Expands `{{token:name}}` syntax
+ - `shouldDefaultToFuzzy()` - Checks if patterns contain fuzzy-default tokens (bab, basmalah, fasl, kitab, naql)
  - Supports fuzzy transform for diacritic-insensitive matching
+ - **Fuzzy-default tokens**: `bab`, `basmalah`, `fasl`, `kitab`, `naql` - auto-enable fuzzy matching unless `fuzzy: false` is set
 
  3. **`match-utils.ts`** - Extracted utilities (for testability)
  - `extractNamedCaptures()` - Get named groups from regex match
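The fuzzy default is resolved per rule as `rule.fuzzy ?? shouldDefaultToFuzzy(patterns)` (see the `buildRuleRegex` hunk in the dist diff below). A minimal sketch of the resulting behavior — only the `lineStartsWith`/`fuzzy` fields are shown, other rule fields omitted:

```js
const rules = [
    // {{bab}} is a fuzzy-default token → fuzzy matching turns on without being asked for
    { lineStartsWith: ['{{bab}}'] },
    // An explicit opt-out always wins over the token default
    { lineStartsWith: ['{{kitab}}'], fuzzy: false },
    // No fuzzy-default token here → fuzzy stays off unless set
    { lineStartsWith: ['{{raqms}} {{dash}}'] },
];
```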
@@ -92,7 +84,9 @@ docs/
  - `buildExcludeSet()` - Create Set from PageRange[] for O(1) lookups
  - `createSegment()` - Create segment with optional to/meta fields
  - `expandBreakpoints()` - Expand patterns with pre-compiled regexes
- - `findActualEndPage()` - Search backwards for ending page using progressive prefix matching (handles mid-page splits)
+ - `buildBoundaryPositions()` - Build position map of page boundaries for O(log n) lookups
+ - `findPageIndexForPosition()` - Binary search to find page index for a character position
+ - `estimateStartOffsetInCurrentPage()` - Estimate offset when segment starts mid-page
  - `findBreakpointWindowEndPosition()` - Compute window boundary in content-space (robust to marker stripping)
  - `applyPageJoinerBetweenPages()` - Normalize page-boundary join in output segments (`space` vs `newline`)
  - `findBreakPosition()` - Find break position using breakpoint patterns
@@ -362,6 +356,8 @@ bunx biome lint .
 
  10. **Page boundary detection needs progressive prefixes**: When breakpoints split content mid-page, checking only the first N characters of a page to detect if the segment ends on that page can fail. Solution: try progressively shorter prefixes (`[80, 60, 40, 30, 20, 15, 12, 10, 8, 6]`) via `JOINER_PREFIX_LENGTHS`. The check uses `indexOf(...) > 0` (not `>= 0`) to avoid false positives when a page prefix appears at position 0 (which indicates the segment *starts* with that page, not *ends* on it).
 
+ 11. **Boundary-position algorithm improves page attribution**: Building a position map of page boundaries once per segment (O(n)) enables binary search for O(log n) lookups per piece. Key insight: when a segment starts mid-page (common after structural rules), expected boundary estimates must account for the offset into the starting page. Without this adjustment, position-based lookups can return the wrong page when pages have identical content prefixes.
+
  ### Architecture Insights
 
  - **Declarative > Imperative**: Users describe patterns, library handles regex
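To make learning #11 concrete, here is the offset correction with assumed numbers (the arithmetic mirrors `buildBoundaryPositions` in the dist diff below):

```js
// Hypothetical values: cumulative start offsets of pages 5 and 6 in the full book content.
const offsetPage5 = 1000;
const offsetPage6 = 1400;
// The structural rule matched 250 chars into page 5, so the segment's own
// content space begins mid-page.
const startOffsetInFromPage = 250;
// Expected start of page 6 inside the segment's content space:
const expected = Math.max(0, offsetPage6 - offsetPage5 - startOffsetInFromPage); // 150
// Without subtracting startOffsetInFromPage the estimate would be 400 —
// off by exactly the mid-page offset, enough to attribute a piece to the wrong page.
```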
package/README.md CHANGED
@@ -50,7 +50,7 @@ Working with Arabic hadith and Islamic text collections requires splitting conti
 
  ✅ **Readable templates**: `{{raqms}} {{dash}}` instead of cryptic regex
  ✅ **Named captures**: `{{raqms:hadithNum}}` auto-extracts to `meta.hadithNum`
- ✅ **Fuzzy matching**: Ignore diacritics with `fuzzy: true`
+ ✅ **Fuzzy matching**: Auto-enabled for `{{bab}}`, `{{kitab}}`, `{{basmalah}}`, `{{fasl}}`, `{{naql}}` (override with `fuzzy: false`)
  ✅ **Page tracking**: Know which page each segment came from
  ✅ **Declarative rules**: Describe *what* to match, not *how*
 
@@ -345,6 +345,10 @@ const segments = segmentPages(pages, {
  // content: '...' (rumuz stripped)
  ```
 
+ **Supported codes**: Single-letter (`ع`, `خ`, `م`, `د`, etc.), two-letter (`خت`, `عس`, `سي`, etc.), digit `٤`, and the word `تمييز` (used in jarḥ wa taʿdīl books).
+
+ > **Note**: Single-letter rumuz like `ع` are only matched when they appear as standalone codes, not as the first letter of words like `عَن`. The pattern is diacritic-safe.
+
  If your data uses *only single-letter codes separated by spaces* (e.g., `د ت س ي ق`), you can also use `{{harfs}}`.
 
  ## Analysis Helpers (no LLM required)
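The standalone-code behavior comes from a negative lookahead in the shipped `RUMUZ_ATOM` (visible in the dist diff below). A quick sketch of what it accepts and rejects — the regex here copies that lookahead for illustration, it is not a public API:

```js
// ع matches only when NOT followed by an Arabic letter or diacritic.
const atom = /ع(?![\u064B-\u0652\u0670أ-ي])/u;
atom.test('ع د ت'); // true  — standalone code followed by a space
atom.test('عَن');    // false — the fatha after ع trips the lookahead
```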
package/dist/index.d.mts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.mts",...} (minified sourcemap; mappings omitted)
+ {"version":3,"file":"index.d.mts",...} (regenerated minified sourcemap; mappings omitted)
package/dist/index.mjs CHANGED
@@ -411,6 +411,72 @@ const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, ta
  return -1;
  };
  /**
+ * Builds a boundary position map for pages within the given range.
+ *
+ * This function computes page boundaries once per segment and enables
+ * O(log n) page lookups via binary search with `findPageIndexForPosition`.
+ *
+ * Boundaries are derived from segmentContent (post-structural-rules).
+ * When the segment starts mid-page, an offset correction is applied to
+ * keep boundary estimates aligned with the segment's actual content space.
+ *
+ * @param segmentContent - Full segment content (already processed by structural rules)
+ * @param fromIdx - Starting page index
+ * @param toIdx - Ending page index
+ * @param pageIds - Array of all page IDs
+ * @param normalizedPages - Map of page ID to normalized content
+ * @param cumulativeOffsets - Cumulative character offsets (for estimates)
+ * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
+ * with a sentinel boundary at segmentContent.length as the last element
+ *
+ * @example
+ * // For a 3-page segment:
+ * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
+ * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
+ */
+ const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
+ const boundaryPositions = [0];
+ const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
+ for (let i = fromIdx + 1; i <= toIdx; i++) {
+ const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
+ const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
+ const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
+ if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
+ else {
+ const estimate = Math.max(prevBoundary + 1, expectedBoundary);
+ boundaryPositions.push(Math.min(estimate, segmentContent.length));
+ }
+ }
+ boundaryPositions.push(segmentContent.length);
+ return boundaryPositions;
+ };
+ /**
+ * Binary search to find which page a position falls within.
+ * Uses "largest i where boundaryPositions[i] <= position" semantics.
+ *
+ * @param position - Character position in segmentContent
+ * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
+ * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
+ * @returns Page index in pageIds array
+ *
+ * @example
+ * // With boundaries [0, 20, 40, 60] and fromIdx=0:
+ * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
+ * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
+ * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
+ */
+ const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
+ if (boundaryPositions.length <= 1) return fromIdx;
+ let left = 0;
+ let right = boundaryPositions.length - 2;
+ while (left < right) {
+ const mid = Math.ceil((left + right) / 2);
+ if (boundaryPositions[mid] <= position) left = mid;
+ else right = mid - 1;
+ }
+ return fromIdx + left;
+ };
+ /**
  * Finds the end position of a breakpoint window inside `remainingContent`.
  *
  * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
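Aside on the binary search above: `Math.ceil` in the midpoint is what makes the "largest i where boundaryPositions[i] <= position" loop terminate — with a floor midpoint, `left = mid` would stall once `right === left + 1`. A comment-only illustration with assumed indices:

```js
// left=2, right=3: floor((2+3)/2) = 2, so `left = mid` makes no progress.
// ceil((2+3)/2) = 3 probes the upper index instead; either it satisfies the
// predicate (left = 3) or it does not (right = 2) — both shrink the interval.
```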
@@ -446,59 +512,6 @@ const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds
  return -1;
  };
  /**
- * Finds the actual ending page index by searching backwards for page content prefix.
- * Used to determine which page a segment actually ends on based on content matching.
- *
- * @param pieceContent - Content of the segment piece
- * @param currentFromIdx - Current starting index in pageIds
- * @param toIdx - Maximum ending index to search
- * @param pageIds - Array of page IDs
- * @param normalizedPages - Map of page ID to normalized content
- * @returns The actual ending page index
- */
- const findActualEndPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
- for (let pi = toIdx; pi > currentFromIdx; pi--) {
- const pageData = normalizedPages.get(pageIds[pi]);
- if (!pageData) continue;
- const trimmedContent = pageData.content.trimStart();
- for (const len of JOINER_PREFIX_LENGTHS) {
- const checkPortion = trimmedContent.slice(0, Math.min(len, trimmedContent.length)).trim();
- if (checkPortion.length > 0 && pieceContent.indexOf(checkPortion) > 0) return pi;
- }
- }
- return currentFromIdx;
- };
- /**
- * Finds the actual starting page index by searching forwards for page content prefix.
- * Used to determine which page content actually starts from based on content matching.
- *
- * This is the counterpart to findActualEndPage - it searches forward to find which
- * page the content starts on, rather than which page it ends on.
- *
- * @param pieceContent - Content of the segment piece
- * @param currentFromIdx - Current starting index in pageIds
- * @param toIdx - Maximum ending index to search
- * @param pageIds - Array of page IDs
- * @param normalizedPages - Map of page ID to normalized content
- * @returns The actual starting page index
- */
- const findActualStartPage = (pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) => {
- const trimmedPiece = pieceContent.trimStart();
- if (!trimmedPiece) return currentFromIdx;
- for (let pi = currentFromIdx; pi <= toIdx; pi++) {
- const pageData = normalizedPages.get(pageIds[pi]);
- if (pageData) {
- const pagePrefix = pageData.content.slice(0, Math.min(30, pageData.length)).trim();
- const piecePrefix = trimmedPiece.slice(0, Math.min(30, trimmedPiece.length));
- if (pagePrefix.length > 0) {
- if (trimmedPiece.startsWith(pagePrefix)) return pi;
- if (pageData.content.trimStart().startsWith(piecePrefix)) return pi;
- }
- }
- }
- return currentFromIdx;
- };
- /**
  * Checks if any page in a range is excluded by the given exclude set.
  *
  * @param excludeSet - Set of excluded page IDs
@@ -630,10 +643,22 @@ const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
  };
  const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
  const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
- const computePiecePages = (pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages) => {
- const actualStartIdx = pieceContent ? findActualStartPage(pieceContent, currentFromIdx, toIdx, pageIds, normalizedPages) : currentFromIdx;
+ /**
+ * Computes the actual start and end page indices for a piece using
+ * precomputed boundary positions and binary search.
+ *
+ * @param pieceStartPos - Start position of the piece in the full segment content
+ * @param pieceEndPos - End position (exclusive) of the piece
+ * @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
+ * @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
+ * @param toIdx - Maximum page index
+ * @returns Object with actualStartIdx and actualEndIdx
+ */
+ const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
+ const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
+ const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
  return {
- actualEndIdx: pieceContent ? findActualEndPage(pieceContent, actualStartIdx, windowEndIdx, pageIds, normalizedPages) : currentFromIdx,
+ actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
  actualStartIdx
  };
  };
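Why `pieceEndPos - 1` above: the end position is exclusive, so the piece's last character sits at `pieceEndPos - 1`. Probing the boundary map at `pieceEndPos` itself would attribute a piece to the *next* page whenever a break lands exactly on a page boundary (a position equal to a boundary belongs to the page that starts there). A sketch with assumed boundaries:

```js
// Pages start at [0, 20, 40], sentinel 60 = content length; fromIdx = 0.
// A piece spanning [10, 40) ends at character 39, still on the second page:
findPageIndexForPosition(39, [0, 20, 40, 60], 0); // → 1 (correct end page)
findPageIndexForPosition(40, [0, 20, 40, 60], 0); // → 2 (the page the NEXT piece starts on)
```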
@@ -650,79 +675,87 @@ const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, norm
  return nextFromIdx;
  };
  const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
+ /**
+ * Finds the break offset within a window, trying exclusions first, then patterns.
+ *
+ * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
+ */
+ const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
+ if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
+ const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
+ if (exclusionBreak > 0) return exclusionBreak;
+ }
+ const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
+ expandedBreakpoints,
+ normalizedPages,
+ pageIds,
+ prefer
+ });
+ return patternBreak > 0 ? patternBreak : windowEndPosition;
+ };
+ /**
+ * Advances cursor position past any leading whitespace.
+ */
+ const skipWhitespace = (content, startPos) => {
+ let pos = startPos;
+ while (pos < content.length && /\s/.test(content[pos])) pos++;
+ return pos;
+ };
+ /**
+ * Processes an oversized segment by iterating through the content and
+ * breaking it into smaller pieces that fit within maxPages constraints.
+ *
+ * Uses precomputed boundary positions for O(log n) page attribution lookups.
+ */
  const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
  const result = [];
- let remainingContent = segment.content;
+ const fullContent = segment.content;
+ let cursorPos = 0;
  let currentFromIdx = fromIdx;
  let isFirstPiece = true;
- let iterationCount = 0;
+ const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
+ logger?.debug?.("[breakpoints] boundaryPositions built", {
+ boundaryPositions,
+ fromIdx,
+ fullContentLength: fullContent.length,
+ toIdx
+ });
  const maxIterations = 1e4;
- while (currentFromIdx <= toIdx) {
- iterationCount++;
- if (iterationCount > maxIterations) {
- logger?.error?.("INFINITE LOOP DETECTED! Breaking out, you should report this bug", { iterationCount: maxIterations });
- break;
- }
- const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
+ for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
+ const remainingContent = fullContent.slice(cursorPos);
+ if (!remainingContent.trim()) break;
  const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
+ const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
  if (remainingSpan <= maxPages && !remainingHasExclusions) {
  const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
  if (finalSeg) result.push(finalSeg);
  break;
  }
  const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
- logger?.debug?.(`[breakpoints] iteration=${iterationCount}`, {
- currentFromIdx,
- currentFromPageId: pageIds[currentFromIdx],
- remainingContentStart: remainingContent.slice(0, 50),
- remainingContentLength: remainingContent.length,
- remainingSpan,
- toIdx,
- toPageId: pageIds[toIdx],
- windowEndIdx,
- windowEndPageId: pageIds[windowEndIdx]
- });
  const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
- const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
- let breakPosition = -1;
- if (windowHasExclusions) breakPosition = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
- if (breakPosition <= 0) breakPosition = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
- expandedBreakpoints,
- normalizedPages,
- pageIds,
- prefer
+ logger?.debug?.(`[breakpoints] iteration=${i}`, {
+ currentFromIdx,
+ cursorPos,
+ windowEndIdx
  });
- if (breakPosition <= 0) breakPosition = windowEndPosition;
- const pieceContent = remainingContent.slice(0, breakPosition).trim();
- logger?.debug?.("[breakpoints] selectedBreak", {
- breakPosition,
- pieceContentEnd: pieceContent.slice(-50),
- pieceContentLength: pieceContent.length,
- windowEndPosition
+ const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
+ const breakPos = cursorPos + breakOffset;
+ const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
+ const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
+ logger?.trace?.("[breakpoints] piece", {
+ actualEndIdx,
+ actualStartIdx,
+ pieceLength: pieceContent.length
  });
- const { actualEndIdx, actualStartIdx } = computePiecePages(pieceContent, currentFromIdx, toIdx, windowEndIdx, pageIds, normalizedPages);
  if (pieceContent) {
  const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
  if (pieceSeg) result.push(pieceSeg);
  }
- remainingContent = remainingContent.slice(breakPosition).trim();
- logger?.debug?.("[breakpoints] afterSlice", {
- actualEndIdx,
- remainingContentLength: remainingContent.length,
- remainingContentStart: remainingContent.slice(0, 60)
- });
- if (!remainingContent) {
- logger?.debug?.("[breakpoints] done: no remaining content");
- break;
- }
- currentFromIdx = computeNextFromIdx(remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages);
- logger?.debug?.("[breakpoints] nextIteration", {
- currentFromIdx,
- currentFromPageId: pageIds[currentFromIdx]
- });
+ cursorPos = skipWhitespace(fullContent, breakPos);
+ currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
  isFirstPiece = false;
  }
- logger?.debug?.("[breakpoints] processOversizedSegmentDone", { resultCount: result.length });
+ logger?.debug?.("[breakpoints] done", { resultCount: result.length });
  return result;
  };
  /**
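Design note on the rewrite above: pieces are now addressed by absolute positions (`cursorPos`, `breakPos`) into an unchanged `fullContent`, instead of repeatedly slicing and re-trimming a shrinking `remainingContent`. Keeping every position in one coordinate space is what lets the precomputed `boundaryPositions` stay valid across iterations, and it folds the old hand-rolled infinite-loop guard into the `for` loop's bound.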
@@ -905,6 +938,77 @@ const anyRuleAllowsId = (rules, pageId) => {
  });
  };
 
+ //#endregion
+ //#region src/segmentation/replace.ts
+ const DEFAULT_REPLACE_FLAGS = "gu";
+ const normalizeReplaceFlags = (flags) => {
+ if (!flags) return DEFAULT_REPLACE_FLAGS;
+ const allowed = new Set([
+ "g",
+ "i",
+ "m",
+ "s",
+ "u",
+ "y"
+ ]);
+ const set = /* @__PURE__ */ new Set();
+ for (const ch of flags) {
+ if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
+ set.add(ch);
+ }
+ set.add("g");
+ set.add("u");
+ return [
+ "g",
+ "i",
+ "m",
+ "s",
+ "y",
+ "u"
+ ].filter((c) => set.has(c)).join("");
+ };
+ const compileReplaceRules = (rules) => {
+ const compiled = [];
+ for (const r of rules) {
+ if (r.pageIds && r.pageIds.length === 0) continue;
+ const flags = normalizeReplaceFlags(r.flags);
+ const re = new RegExp(r.regex, flags);
+ compiled.push({
+ pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
+ re,
+ replacement: r.replacement
+ });
+ }
+ return compiled;
+ };
+ /**
+ * Applies ordered regex replacements to page content (per page).
+ *
+ * - Replacement rules are applied in array order.
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
+ *
+ * This function is intentionally **pure**:
+ * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
+ */
+ const applyReplacements = (pages, rules) => {
+ if (!rules || rules.length === 0 || pages.length === 0) return pages;
+ const compiled = compileReplaceRules(rules);
+ if (compiled.length === 0) return pages;
+ return pages.map((p) => {
+ let content = p.content;
+ for (const rule of compiled) {
+ if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
+ content = content.replace(rule.re, rule.replacement);
+ }
+ if (content === p.content) return p;
+ return {
+ ...p,
+ content
+ };
+ });
+ };
+
  //#endregion
  //#region src/segmentation/tokens.ts
  /**
@@ -977,6 +1081,7 @@ const escapeTemplateBrackets = (pattern) => {
  });
  };
  const RUMUZ_ATOM = `(?:${[
+ "تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
  "خت",
  "خغ",
  "بخ",
@@ -1000,7 +1105,7 @@ const RUMUZ_ATOM = `(?:${[
  "تم",
  "فق",
  "دق",
- "[خرزيمنصسدفلتقع]",
+ "[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
  "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
  ].join("|")})`;
  const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
@@ -1344,6 +1449,37 @@ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
  * getTokenPattern('unknown') // → undefined
  */
  const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
+ /**
+ * Regex to detect fuzzy-default tokens in a pattern string.
+ * Matches {{token}} or {{token:name}} syntax.
+ */
+ const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
+ "bab",
+ "basmalah",
+ "fasl",
+ "kitab",
+ "naql"
+ ].join("|")})(?::\\w+)?\\}\\}`, "g");
+ /**
+ * Checks if a pattern (or array of patterns) contains tokens that should
+ * default to fuzzy matching.
+ *
+ * Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
+ *
+ * @param patterns - Single pattern string or array of pattern strings
+ * @returns `true` if any pattern contains a fuzzy-default token
+ *
+ * @example
+ * shouldDefaultToFuzzy('{{bab}} الإيمان') // true
+ * shouldDefaultToFuzzy('{{raqms}} {{dash}}') // false
+ * shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
+ */
+ const shouldDefaultToFuzzy = (patterns) => {
+ return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
+ FUZZY_TOKEN_REGEX.lastIndex = 0;
+ return FUZZY_TOKEN_REGEX.test(p);
+ });
+ };
 
  //#endregion
  //#region src/segmentation/rule-regex.ts
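One detail in `shouldDefaultToFuzzy` worth calling out: `FUZZY_TOKEN_REGEX` is built with the `g` flag, and a `g`-flagged regex's `test()` is stateful (it advances `lastIndex` on a match), so the explicit `lastIndex = 0` reset prevents alternating calls from spuriously returning `false`.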
@@ -1444,7 +1580,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
  */
  const buildRuleRegex = (rule, capturePrefix) => {
  const s = { ...rule };
- const fuzzy = rule.fuzzy ?? false;
+ const allPatterns = [
+ ...s.lineStartsWith ?? [],
+ ...s.lineStartsAfter ?? [],
+ ...s.lineEndsWith ?? []
+ ];
+ const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(allPatterns);
  let allCaptureNames = [];
  if (s.lineStartsAfter?.length) {
  const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
@@ -1482,77 +1623,6 @@ const buildRuleRegex = (rule, capturePrefix) => {
  };
  };
 
- //#endregion
- //#region src/segmentation/replace.ts
- const DEFAULT_REPLACE_FLAGS = "gu";
- const normalizeReplaceFlags = (flags) => {
- if (!flags) return DEFAULT_REPLACE_FLAGS;
- const allowed = new Set([
- "g",
- "i",
- "m",
- "s",
- "u",
- "y"
- ]);
- const set = /* @__PURE__ */ new Set();
- for (const ch of flags) {
- if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
- set.add(ch);
- }
- set.add("g");
- set.add("u");
- return [
- "g",
- "i",
- "m",
- "s",
- "y",
- "u"
- ].filter((c) => set.has(c)).join("");
- };
- const compileReplaceRules = (rules) => {
- const compiled = [];
- for (const r of rules) {
- if (r.pageIds && r.pageIds.length === 0) continue;
- const flags = normalizeReplaceFlags(r.flags);
- const re = new RegExp(r.regex, flags);
- compiled.push({
- pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
- re,
- replacement: r.replacement
- });
- }
- return compiled;
- };
- /**
- * Applies ordered regex replacements to page content (per page).
- *
- * - Replacement rules are applied in array order.
- * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
- * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
- *
- * This function is intentionally **pure**:
- * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
- */
- const applyReplacements = (pages, rules) => {
- if (!rules || rules.length === 0 || pages.length === 0) return pages;
- const compiled = compileReplaceRules(rules);
- if (compiled.length === 0) return pages;
- return pages.map((p) => {
- let content = p.content;
- for (const rule of compiled) {
- if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
- content = content.replace(rule.re, rule.replacement);
- }
- if (content === p.content) return p;
- return {
- ...p,
- content
- };
- });
- };
-
  //#endregion
  //#region src/segmentation/fast-fuzzy-prefix.ts
  /**
@@ -2122,14 +2192,43 @@ const convertPageBreaks = (content, startOffset, pageBreaks) => {
  */
  const segmentPages = (pages, options) => {
  const { rules = [], maxPages = 0, breakpoints = [], prefer = "longer", pageJoiner = "space", logger } = options;
+ logger?.info?.("[segmenter] starting segmentation", {
+ breakpointCount: breakpoints.length,
+ maxPages,
+ pageCount: pages.length,
+ prefer,
+ ruleCount: rules.length
+ });
  const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
  const { content: matchContent, normalizedPages: normalizedContent, pageMap } = buildPageMap(processedPages);
- let segments = buildSegments(dedupeSplitPoints(collectSplitPointsFromRules(rules, matchContent, pageMap)), matchContent, pageMap, rules);
+ logger?.debug?.("[segmenter] content built", {
+ pageIds: pageMap.pageIds,
+ totalContentLength: matchContent.length
+ });
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
+ const unique = dedupeSplitPoints(splitPoints);
+ logger?.debug?.("[segmenter] split points collected", {
+ rawSplitPoints: splitPoints.length,
+ uniqueSplitPoints: unique.length
+ });
+ let segments = buildSegments(unique, matchContent, pageMap, rules);
+ logger?.debug?.("[segmenter] structural segments built", {
+ segmentCount: segments.length,
+ segments: segments.map((s) => ({
+ contentLength: s.content.length,
+ from: s.from,
+ to: s.to
+ }))
+ });
  segments = ensureFallbackSegment(segments, processedPages, normalizedContent, pageJoiner);
  if (maxPages >= 0 && breakpoints.length) {
+ logger?.debug?.("[segmenter] applying breakpoints to oversized segments");
  const patternProcessor = (p) => processPattern(p, false).pattern;
- return applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+ const result = applyBreakpoints(segments, processedPages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner);
+ logger?.info?.("[segmenter] segmentation complete (with breakpoints)", { finalSegmentCount: result.length });
+ return result;
  }
+ logger?.info?.("[segmenter] segmentation complete (structural only)", { finalSegmentCount: segments.length });
  return segments;
  };
  /**
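The logger threaded through `segmentPages` is duck-typed: every call site uses optional chaining (`logger?.info?.(...)`, `logger?.debug?.(...)`, `logger?.trace?.(...)`), so any partial object works. A minimal console-backed sketch (hypothetical usage; `pages`, `rules`, and `breakpoints` assumed to be defined):

```js
const logger = {
    debug: (msg, data) => console.debug(msg, data),
    error: (msg, data) => console.error(msg, data),
    info: (msg, data) => console.info(msg, data),
    trace: (msg, data) => console.debug(msg, data), // console.trace prints stacks, so map trace to debug
};
const segments = segmentPages(pages, { breakpoints, logger, maxPages: 3, rules });
```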