flappa-doormal 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +407 -205
- package/dist/index.d.mts +722 -332
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1363 -396
- package/dist/index.mjs.map +1 -1
- package/package.json +11 -9
package/dist/index.d.mts
CHANGED
@@ -1,460 +1,850 @@
-//#region src/
+//#region src/segmentation/fuzzy.d.ts
 /**
- *
-
-
-
- *
+ * Fuzzy matching utilities for Arabic text.
+ *
+ * Provides diacritic-insensitive and character-equivalence matching for Arabic text.
+ * This allows matching text regardless of:
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
+ *
+ * @module fuzzy
+ *
+ * @example
+ * // Make a pattern diacritic-insensitive
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
  */
-type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';
 /**
- *
+ * Escapes a string for safe inclusion in a regular expression.
+ *
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('[test]') // → '\\[test\\]'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
  */
-
+declare const escapeRegex: (s: string) => string;
 /**
- *
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-   * For 'hadith-chain', defaults to common narrator patterns if not provided.
-   */
-  phrases?: string[];
-  /**
-   * Optional: Only apply this marker after a specific page number.
-   * Useful for books with different formatting in front matter vs main content.
-   */
-  minPage?: number;
-  /**
-   * Optional: Arbitrary metadata to attach to entries matched by this marker.
-   * This allows for agnostic handling of entry properties.
-   * Example: { type: 0, category: 'hadith' }
-   */
-  metadata?: Record<string, any>;
-};
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
+ *
+ * Transforms input text into a regex pattern that matches the text regardless
+ * of diacritical marks (harakat) and character variations. Each character in
+ * the input is:
+ * 1. Expanded to its equivalence class (if applicable)
+ * 2. Followed by an optional diacritics matcher
+ *
+ * This allows matching:
+ * - `حدثنا` with `حَدَّثَنَا` (with full diacritics)
+ * - `الإيمان` with `الايمان` (alef variants)
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
+ *
+ * @param text - Input Arabic text to make diacritic-insensitive
+ * @returns Regex pattern string that matches the text with or without diacritics
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * // Each char gets equivalence class + optional diacritics
+ * // Result matches: حدثنا, حَدَّثَنَا, حَدَثَنَا, etc.
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('باب');
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
+ * new RegExp(pattern, 'u').test('باب') // → true
+ *
+ * @example
+ * // Using with split rules
+ * {
+ *   lineStartsWith: ['باب'],
+ *   split: 'at',
+ *   fuzzy: true // Applies makeDiacriticInsensitive internally
+ * }
+ */
+declare const makeDiacriticInsensitive: (text: string) => string;
 //#endregion
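The fuzzy helpers above are what the `fuzzy: true` flag on split rules applies internally. A minimal usage sketch, assuming the declarations are imported from the package's main entry (`flappa-doormal`):

```ts
import { escapeRegex, makeDiacriticInsensitive } from 'flappa-doormal';

// Escape literal text before embedding it in a larger pattern.
const literal = escapeRegex('(٥)'); // '\\(٥\\)'

// Build a diacritic-insensitive pattern for the chapter marker 'باب'.
const pattern = makeDiacriticInsensitive('باب');
const re = new RegExp(pattern, 'u');

re.test('باب');   // true
re.test('بَابٌ'); // true — diacritics are ignored
```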
-//#region src/
+//#region src/segmentation/types.d.ts
 /**
- *
-
-
-
- *
-
-
-
- *
+ * Literal regex pattern rule - no token expansion is applied.
+ *
+ * Use this when you need full control over the regex pattern.
+ * If the regex contains capturing groups, the captured content
+ * will be used as the segment content.
+ *
+ * @example
+ * // Match Arabic-Indic numbers followed by a dash
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
+ *
+ * @example
+ * // Capture group - content after the marker becomes segment content
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
  */
-
+type RegexPattern = {
+  /** Raw regex pattern string (no token expansion) */
+  regex: string;
+};
 /**
- *
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
+ *
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
+ *
+ * @example
+ * // Using tokens for Arabic-Indic digits
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
+ *
+ * @example
+ * // Named capture to extract hadith number into metadata
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
+ *
+ * @see TOKEN_PATTERNS for available tokens
  */
-
+type TemplatePattern = {
+  /** Template string with `{{token}}` or `{{token:name}}` placeholders */
+  template: string;
+};
 /**
- *
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
+ *
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker
+ * is **included** in the segment content.
+ *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * // Split at chapter headings (marker included in content)
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
+ *
+ * @example
+ * // Split at Arabic book/chapter markers with fuzzy matching
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
  */
-
-
-
+type LineStartsWithPattern = {
+  /** Array of patterns that mark line beginnings (marker included in content) */
+  lineStartsWith: string[];
+};
 /**
- *
- *
- *
- *
- *
+ * Line-start-after pattern rule - matches lines starting with patterns,
+ * but **excludes** the marker from the segment content.
+ *
+ * Behaves like `lineStartsWith` but strips the marker from the output.
+ * The segment content starts after the marker and extends to the next split point
+ * (not just the end of the matching line).
  *
- *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
  *
- * @
- *
+ * @example
+ * // Split at numbered hadiths, capturing content without the number prefix
+ * // Content extends to next split, not just end of that line
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
  *
  * @example
- *
- *
- * match.groups.full // "٥ - نص"
- * match.groups.marker // "٥ -"
- * match.groups.content // "نص"
+ * // Extract hadith number to metadata while stripping the prefix
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
  */
-
-
-
+type LineStartsAfterPattern = {
+  /** Array of patterns that mark line beginnings (marker excluded from content) */
+  lineStartsAfter: string[];
+};
 /**
- *
- *
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
+ *
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`.
+ *
+ * Token expansion is applied to each pattern. Use `fuzzy: true` for
+ * diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * // Split at lines ending with Arabic sentence-ending punctuation
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
  */
+type LineEndsWithPattern = {
+  /** Array of patterns that mark line endings */
+  lineEndsWith: string[];
+};
 /**
- *
- *
+ * Union of all pattern types for split rules.
+ *
+ * Each rule must have exactly ONE pattern type:
+ * - `regex` - Raw regex pattern (no token expansion)
+ * - `template` - Pattern with `{{token}}` expansion
+ * - `lineStartsWith` - Match line beginnings (marker included)
+ * - `lineStartsAfter` - Match line beginnings (marker excluded)
+ * - `lineEndsWith` - Match line endings
  */
-
+type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
 /**
- *
- *
+ * Configuration for how and where to split content when a pattern matches.
+ *
+ * Controls the split position relative to matches, which occurrences to
+ * split on, page span limits, and fuzzy matching for Arabic text.
  */
-
-
-
+type SplitBehavior = {
+  /**
+   * Where to split relative to the match.
+   * - `'at'`: New segment starts at the match position
+   * - `'after'`: New segment starts after the match ends
+   */
+  split: 'at' | 'after';
+  /**
+   * Which occurrence(s) to split on.
+   * - `'all'`: Split at every match (default)
+   * - `'first'`: Only split at the first match
+   * - `'last'`: Only split at the last match
+   *
+   * When `maxSpan` is set, occurrence filtering is applied per sliding
+   * window rather than globally. With `'last'`, the algorithm prefers
+   * longer segments by looking as far ahead as allowed before selecting
+   * the last match in the window.
+   *
+   * @default 'all'
+   */
+  occurrence?: 'first' | 'last' | 'all';
+  /**
+   * Maximum page ID difference allowed when looking ahead for split points.
+   *
+   * Uses a sliding window algorithm that prefers longer segments:
+   * 1. Start from the first page of the current segment
+   * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
+   * 3. Apply occurrence filter (e.g., 'last') to select a match
+   * 4. Next window starts from the page after the match
+   *
+   * Examples:
+   * - `maxSpan: 1` = look 1 page ahead (segments span at most 2 pages)
+   * - `maxSpan: 2` = look 2 pages ahead (segments span at most 3 pages)
+   * - `undefined` = no limit (entire content treated as one group)
+   *
+   * Note: With non-consecutive page IDs, the algorithm uses actual ID
+   * difference, not array index. Pages 1 and 5 have a difference of 4.
+   *
+   * @example
+   * // Split at last period, looking up to 1 page ahead
+   * // Pages 1,2: split at page 2's last period
+   * // Page 3: split at page 3's last period
+   * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
+   */
+  maxSpan?: number;
+  /**
+   * Enable diacritic-insensitive matching for Arabic text.
+   *
+   * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
+   * `lineStartsAfter` are transformed to match text regardless of:
+   * - Diacritics (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، etc.
+   * - Character equivalences: ا/آ/أ/إ, ة/ه, ى/ي
+   *
+   * **Note**: Does NOT apply to `regex` or `template` patterns.
+   * For templates, apply fuzzy manually using `makeDiacriticInsensitive()`.
+   *
+   * @default false
+   */
+  fuzzy?: boolean;
+};
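The `occurrence`/`maxSpan` interaction documented above is easiest to see on a concrete rule. A sketch with illustrative page contents, following the `lineEndsWith` example in the JSDoc:

```ts
import { segmentPages, type Page } from 'flappa-doormal';

const pages: Page[] = [
  { id: 1, content: 'First sentence. Second sentence.' },
  { id: 2, content: 'Third sentence. Fourth sentence.' },
  { id: 3, content: 'Fifth sentence.' },
];

// Split after the last period in each sliding window:
// maxSpan: 1 means a window covers the current page plus one page ahead,
// so pages 1–2 split at page 2's last period, then page 3 is handled alone.
const segments = segmentPages(pages, {
  rules: [{ lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }],
});
```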
 /**
- *
- *
+ * A single page ID or a range of page IDs.
+ *
+ * - `number`: A single page ID
+ * - `[number, number]`: A range from first to second (inclusive)
+ *
+ * @example
+ * 5          // Single page 5
+ * [10, 20]   // Pages 10 through 20 (inclusive)
  */
+type PageRange = number | [number, number];
 /**
- *
- *
+ * Optional constraints and metadata for a split rule.
+ *
+ * Use constraints to limit which pages a rule applies to, and
+ * metadata to attach arbitrary data to resulting segments.
  */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+type RuleConstraints = {
+  /**
+   * Minimum page ID for this rule to apply.
+   *
+   * Matches on pages with `id < min` are ignored.
+   *
+   * @example
+   * // Only apply rule starting from page 10
+   * { min: 10, lineStartsWith: ['##'], split: 'before' }
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this rule to apply.
+   *
+   * Matches on pages with `id > max` are ignored.
+   *
+   * @example
+   * // Only apply rule up to page 100
+   * { max: 100, lineStartsWith: ['##'], split: 'before' }
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this rule.
+   *
+   * Use this to skip the rule for specific pages without needing
+   * to repeat the rule with different min/max values.
+   *
+   * @example
+   * // Exclude specific pages
+   * { exclude: [1, 2, 5] }
+   *
+   * @example
+   * // Exclude page ranges
+   * { exclude: [[1, 10], [50, 100]] }
+   *
+   * @example
+   * // Mix single pages and ranges
+   * { exclude: [1, [5, 10], 50] }
+   */
+  exclude?: PageRange[];
+  /**
+   * Arbitrary metadata attached to segments matching this rule.
+   *
+   * This metadata is merged with any named captures from the pattern.
+   * Named captures (e.g., `{{raqms:num}}`) take precedence over
+   * static metadata with the same key.
+   *
+   * @example
+   * // Tag segments as chapters
+   * { lineStartsWith: ['{{bab}}'], split: 'before', meta: { type: 'chapter' } }
+   */
+  meta?: Record<string, unknown>;
+  /**
+   * Fallback behavior when no matches are found within a maxSpan boundary.
+   * - 'page': Create split points at page boundaries
+   * - undefined: No fallback (current behavior)
+   */
+  fallback?: 'page';
 };
-type TokenMap = Record<string, string>;
-//#endregion
-//#region src/markers/template-parser.d.ts
 /**
- *
+ * A complete split rule combining pattern, behavior, and constraints.
+ *
+ * Each rule must specify:
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
+ *   `lineStartsAfter`, or `lineEndsWith`
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
+ * - **Constraints** (optional): `min`, `max`, `meta`
+ *
+ * @example
+ * // Basic rule: split at markdown headers
+ * const rule: SplitRule = {
+ *   lineStartsWith: ['## ', '### '],
+ *   split: 'at',
+ *   meta: { type: 'section' }
+ * };
+ *
+ * @example
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
+ * const rule: SplitRule = {
+ *   lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *   split: 'at',
+ *   fuzzy: true,
+ *   min: 5,
+ *   max: 500,
+ *   meta: { type: 'hadith' }
+ * };
  */
-
-  valid: boolean;
-  errors?: string[];
-}
+type SplitRule = PatternType & SplitBehavior & RuleConstraints;
 /**
- *
+ * Input page structure for segmentation.
+ *
+ * Each page represents a logical unit of content (e.g., a book page,
+ * a document section) that can be tracked across segment boundaries.
+ *
+ * @example
+ * const pages: Page[] = [
+ *   { id: 1, content: '## Chapter 1\nFirst paragraph...' },
+ *   { id: 2, content: 'Continued text...\n## Chapter 2' },
+ * ];
  */
-
-/**
-
-
+type Page = {
+  /**
+   * Unique page/entry ID used for:
+   * - `maxSpan` grouping (segments spanning multiple pages)
+   * - `min`/`max` constraint filtering
+   * - `from`/`to` tracking in output segments
+   */
+  id: number;
+  /**
+   * Raw page content (may contain HTML).
+   *
+   * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
+   * Use a utility to convert html to markdown or `stripHtmlTags()` to preprocess HTML.
+   */
+  content: string;
+};
 /**
- *
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
+ * A breakpoint pattern with optional page constraints.
  *
- *
+ * Use this to control which pages a breakpoint pattern applies to.
+ * Patterns outside the specified range are skipped, allowing
+ * the next breakpoint pattern (or fallback) to be tried.
  *
- * @
- *
- *
+ * @example
+ * // Only apply punctuation-based breaking from page 10 onwards
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
 *
  * @example
- *
- *
+ * // Apply to specific page range (pages 10-50)
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
  */
-
+type BreakpointRule = {
+  /**
+   * Regex pattern for breaking (supports token expansion).
+   * Empty string `''` means fall back to page boundary.
+   */
+  pattern: string;
+  /**
+   * Minimum page ID for this breakpoint to apply.
+   * Segments starting before this page skip this pattern.
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this breakpoint to apply.
+   * Segments starting after this page skip this pattern.
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this breakpoint.
+   *
+   * Use this to skip the breakpoint for specific pages without needing
+   * to repeat the breakpoint with different min/max values.
+   *
+   * @example
+   * // Exclude specific pages
+   * { pattern: '\\.\\s*', exclude: [1, 2, 5] }
+   *
+   * @example
+   * // Exclude page ranges (front matter pages 1-10)
+   * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] }
+   *
+   * @example
+   * // Mix single pages and ranges
+   * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] }
+   */
+  exclude?: PageRange[];
+  /**
+   * Skip this breakpoint if the segment content matches this pattern.
+   *
+   * Supports token expansion (e.g., `{{kitab}}`). When the segment's
+   * remaining content matches this regex, the breakpoint pattern is
+   * skipped and the next breakpoint in the array is tried.
+   *
+   * Useful for excluding title pages or front matter without needing
+   * to specify explicit page ranges.
+   *
+   * @example
+   * // Skip punctuation breakpoint for short content (likely titles)
+   * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
+   *
+   * @example
+   * // Skip for content containing "kitab" (book) marker
+   * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
+   */
+  skipWhen?: string;
+};
 /**
- *
+ * A breakpoint can be a simple string pattern or an object with constraints.
  *
- *
- *
+ * String breakpoints apply to all pages. Object breakpoints can specify
+ * `min`/`max` to limit which pages they apply to.
  *
  * @example
- *
- *
- *
- *
+ * // String (applies everywhere)
+ * '{{tarqim}}\\s*'
+ *
+ * @example
+ * // Object with constraints (only from page 10+)
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
  */
-
+type Breakpoint = string | BreakpointRule;
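Putting the pattern, behavior, and constraint pieces together: a sketch of a single `SplitRule` that strips a numbered marker, captures the number into metadata, and is limited by page constraints (the page numbers are illustrative):

```ts
import type { SplitRule } from 'flappa-doormal';

const hadithRule: SplitRule = {
  // Pattern: marker excluded from content, number captured as `num`
  lineStartsAfter: ['{{raqms:num}} {{dash}} '],
  // Behavior: new segment starts at the match; Arabic matching is diacritic-insensitive
  split: 'at',
  fuzzy: true,
  // Constraints: skip the front matter and one excluded range
  min: 5,
  exclude: [[200, 210]],
  // Static metadata; the named capture `num` is merged into each segment's meta
  meta: { type: 'hadith' },
};
```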
 /**
- *
+ * Segmentation options controlling how pages are split.
  *
- * @
- *
- *
+ * @example
+ * // Basic structural rules only
+ * const options: SegmentationOptions = {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
+ *     { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
+ *   ]
+ * };
  *
  * @example
- *
- *
+ * // With breakpoints for oversized segments
+ * const options: SegmentationOptions = {
+ *   rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
+ *   maxPages: 2,
+ *   breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
+ *   prefer: 'longer'
+ * };
+ */
+type SegmentationOptions = {
+  /**
+   * Rules applied in order to find split points.
+   *
+   * All rules are evaluated against the content, and their matches
+   * are combined to determine final split points. The first matching
+   * rule's metadata is used for each segment.
+   */
+  rules?: SplitRule[];
+  /**
+   * Maximum pages per segment before breakpoints are applied.
+   *
+   * When a segment spans more pages than this limit, the `breakpoints`
+   * patterns are tried (in order) to find a suitable break point within
+   * the allowed window.
+   *
+   * Structural markers (from rules) always take precedence - segments
+   * are only broken within their rule-defined boundaries, never across them.
+   *
+   * @example
+   * // Break segments that exceed 2 pages
+   * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
+   */
+  maxPages?: number;
+  /**
+   * Patterns tried in order to break oversized segments.
+   *
+   * Each pattern is tried until one matches within the allowed page window.
+   * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
+   * matches the page boundary (always succeeds as ultimate fallback).
+   *
+   * Patterns can be simple strings (apply everywhere) or objects with
+   * `min`/`max` constraints to limit which pages they apply to.
+   *
+   * Patterns are checked in order - put preferred break styles first:
+   * - `{{tarqim}}\\s*` - Break at sentence-ending punctuation
+   * - `\\n` - Break at line breaks (useful for OCR content)
+   * - `''` - Break at page boundary (always works)
+   *
+   * Only applied to segments exceeding `maxPages`.
+   *
+   * @example
+   * // Simple patterns (backward compatible)
+   * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
+   *
+   * @example
+   * // Object patterns with page constraints
+   * breakpoints: [
+   *   { pattern: '{{tarqim}}\\s*', min: 10 }, // Only from page 10+
+   *   '' // Fallback for pages 1-9
+   * ]
+   */
+  breakpoints?: Breakpoint[];
+  /**
+   * When multiple matches exist for a breakpoint pattern, select:
+   * - `'longer'` - Last match in window (prefers longer segments)
+   * - `'shorter'` - First match in window (prefers shorter segments)
+   *
+   * @default 'longer'
+   */
+  prefer?: 'longer' | 'shorter';
+};
+/**
+ * Output segment produced by `segmentPages()`.
+ *
+ * Each segment contains extracted content, page references, and
+ * optional metadata from the matched rule and captured groups.
  *
- *
- * //
+ * @example
+ * // Simple segment on a single page
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
+ *
+ * @example
+ * // Segment spanning pages 5-7 with captured hadith number
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
  */
-
+type Segment = {
+  /**
+   * Segment content with:
+   * - Leading/trailing whitespace trimmed
+   * - Page breaks converted to spaces (for multi-page segments)
+   * - Markers stripped (for `lineStartsAfter` patterns)
+   */
+  content: string;
+  /**
+   * Starting page ID (from `Page.id`).
+   */
+  from: number;
+  /**
+   * Ending page ID if segment spans multiple pages.
+   *
+   * Only present when the segment content extends across page boundaries.
+   * When `undefined`, the segment is contained within a single page.
+   */
+  to?: number;
+  /**
+   * Combined metadata from:
+   * 1. Rule's `meta` property (static metadata)
+   * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
+   *
+   * Named captures override static metadata with the same key.
+   */
+  meta?: Record<string, unknown>;
+};
 //#endregion
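How the options compose in practice: a sketch combining a structural rule with `maxPages` and `breakpoints`, including one constrained `BreakpointRule` (the tokens and page numbers are illustrative):

```ts
import { segmentPages, type Page, type SegmentationOptions } from 'flappa-doormal';

const pages: Page[] = [{ id: 1, content: 'باب الإيمان\n...' }];

const options: SegmentationOptions = {
  rules: [{ lineStartsWith: ['{{bab}}'], split: 'at', fuzzy: true, meta: { type: 'chapter' } }],
  // Break any chapter spanning more than 2 pages...
  maxPages: 2,
  breakpoints: [
    // ...preferably at sentence-ending punctuation, skipping short
    // title-like content, and only from page 10 onwards
    { pattern: '{{tarqim}}\\s*', min: 10, skipWhen: '^.{1,20}$' },
    '', // ultimate fallback: break at the page boundary
  ],
  prefer: 'longer',
};

const segments = segmentPages(pages, options);
```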
-//#region src/
+//#region src/segmentation/segmenter.d.ts
 /**
- *
+ * Segments pages of content based on pattern-matching rules.
  *
- *
- *
- *
+ * This is the main entry point for the segmentation engine. It takes an array
+ * of pages and applies the provided rules to identify split points, producing
+ * an array of segments with content, page references, and metadata.
  *
- * @param
- * @
- * @
+ * @param pages - Array of pages with id and content
+ * @param options - Segmentation options including splitting rules
+ * @returns Array of segments with content, from/to page references, and optional metadata
  *
  * @example
- * //
- * const
+ * // Split markdown by headers
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }
+ *   ]
+ * });
 *
  * @example
- * //
- * const
+ * // Split Arabic hadith text with number extraction
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     {
+ *       lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *       split: 'at',
+ *       fuzzy: true,
+ *       meta: { type: 'hadith' }
+ *     }
+ *   ]
+ * });
 *
  * @example
- * //
- * const
- *
- *
- *
+ * // Multiple rules with page constraints
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
+ *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
+ *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
+ *   ]
  * });
  */
-declare
+declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
+//#endregion
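A minimal end-to-end run of `segmentPages`, following the first JSDoc example above; the exact segment contents depend on the implementation, but the output shape follows the `Segment` type:

```ts
import { segmentPages } from 'flappa-doormal';

const segments = segmentPages(
  [
    { id: 1, content: '## Intro\nSome text.' },
    { id: 2, content: 'More text.\n## Next chapter\nBody.' },
  ],
  { rules: [{ lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }] },
);

for (const s of segments) {
  // Each segment reports its starting page, optional ending page, and merged metadata.
  console.log(s.from, s.to, s.meta, s.content.slice(0, 40));
}
```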
+//#region src/segmentation/textUtils.d.ts
 /**
- *
+ * Strip all HTML tags from content, keeping only text.
  *
- *
- *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+declare const stripHtmlTags: (html: string) => string;
+/**
+ * Normalizes line endings to Unix-style (`\n`).
 *
- *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
 *
- * @
- *
- * const match = regex.exec('باب الصلاة');
- * // match.groups.marker -> 'باب'
- * // match.groups.content -> ' الصلاة'
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
  */
-declare
+declare const normalizeLineEndings: (content: string) => string;
+//#endregion
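The two text utilities above are intended as preprocessing steps for HTML-ish sources before segmentation; a sketch:

```ts
import { normalizeLineEndings, segmentPages, stripHtmlTags } from 'flappa-doormal';

const rawPages = [{ id: 1, content: '<p>١ - حديث أول.</p>\r\n<p>٢ - حديث ثان.</p>' }];

// Strip tags and normalize line endings before handing pages to the segmenter.
// (Per the Page docs, line endings are also normalized internally, so the
// explicit call here is belt-and-braces.)
const pages = rawPages.map((p) => ({
  id: p.id,
  content: normalizeLineEndings(stripHtmlTags(p.content)),
}));

const segments = segmentPages(pages, {
  rules: [{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }],
});
```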
+//#region src/segmentation/tokens.d.ts
 /**
- *
+ * Token-based template system for Arabic text pattern matching.
 *
- *
- *
- *
+ * This module provides a human-readable way to define regex patterns using
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
+ * named capture groups for extracting matched values into metadata.
 *
- * @
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
+ * @module tokens
 *
  * @example
- * //
- *
- *
+ * // Simple token expansion
+ * expandTokens('{{raqms}} {{dash}}')
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
 *
  * @example
- * //
- *
- *
- *   phrases: ['قَالَ', 'رَوَى']
- * });
+ * // Named capture groups
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
  */
-declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
 /**
- *
+ * Token definitions mapping human-readable token names to regex patterns.
 *
- *
- * -
- * -
- * -
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
 *
- * @
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
 *
  * @example
- *
- *
- * // match.groups.marker -> 'بسم الله'
- */
-declare function generateBasmalaRegex(): RegExp;
-/**
- * Generates a regular expression for custom phrase markers.
- *
- * Similar to hadith-chain markers but requires explicit phrase list.
- * All phrases are made diacritic-insensitive.
+ * // Using tokens in a split rule
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
 *
- * @
- *
- *
+ * @example
+ * // Using tokens with named captures
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
 *
  * @example
- *
- *
- *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
- * });
+ * // Using the numbered convenience token
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
  */
-declare
+declare const TOKEN_PATTERNS: Record<string, string>;
 /**
- *
+ * Checks if a query string contains template tokens.
 *
- *
- *
- *
- * - ° [٦٥] - With degree prefix
+ * Performs a quick test for `{{token}}` patterns without actually
+ * expanding them. Useful for determining whether to apply token
+ * expansion to a string.
 *
- * @
+ * @param query - String to check for tokens
+ * @returns `true` if the string contains at least one `{{token}}` pattern
 *
  * @example
- *
- *
- * //
+ * containsTokens('{{raqms}} {{dash}}') // → true
+ * containsTokens('plain text') // → false
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
+ */
+declare const containsTokens: (query: string) => boolean;
+/**
+ * Result from expanding tokens with capture information.
+ *
+ * Contains the expanded pattern string along with metadata about
+ * any named capture groups that were created.
  */
-
+type ExpandResult = {
+  /**
+   * The expanded regex pattern string with all tokens replaced.
+   *
+   * Named captures use the `(?<name>pattern)` syntax.
+   */
+  pattern: string;
+  /**
+   * Names of captured groups extracted from `{{token:name}}` syntax.
+   *
+   * Empty array if no named captures were found.
+   */
+  captureNames: string[];
+  /**
+   * Whether the pattern has any named capturing groups.
+   *
+   * Equivalent to `captureNames.length > 0`.
+   */
+  hasCaptures: boolean;
+};
 /**
- *
+ * Expands template tokens with support for named captures.
+ *
+ * This is the primary token expansion function that handles all token syntax:
+ * - `{{token}}` → Expands to the token's pattern (no capture group)
+ * - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
+ * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
 *
- *
- * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
- * - 5 ب. (Latin number, Arabic letter, dot)
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
 *
- * @param
- * @
+ * @param query - The template string containing tokens
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+ *   Applied to both token patterns and plain Arabic text between tokens.
+ *   Typically `makeDiacriticInsensitive` from the fuzzy module.
+ * @returns Object with expanded pattern, capture names, and capture flag
 *
  * @example
- *
- *
- *
- * });
- * const match = regex.exec('٥ أ - نص');
- */
-declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
-/**
- * Generates a regular expression for number-parenthetical-separator markers.
+ * // Simple token expansion
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
 *
- *
- *
- *
+ * @example
+ * // Named capture
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
 *
- * @
- *
+ * @example
+ * // Capture-only token
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
 *
  * @example
- *
- *
- *
- * });
- * const match = regex.exec('٥ (أ) - نص');
+ * // With fuzzy transform
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
  */
-declare
+declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
 /**
- *
+ * Expands template tokens in a query string to their regex equivalents.
 *
- *
- *
- * - ٥ - (single number, separator)
+ * This is the simple version without capture support. It returns only the
+ * expanded pattern string, not capture metadata.
 *
- *
+ * Unknown tokens are left as-is, allowing for partial templates.
 *
- * @param
- * @returns
+ * @param query - Template string containing `{{token}}` placeholders
+ * @returns Expanded regex pattern string
 *
  * @example
- *
- *
- *
- * })
- *
- *
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
+ *
+ * @see expandTokensWithCaptures for full capture group support
  */
-declare
+declare const expandTokens: (query: string) => string;
 /**
- *
+ * Converts a template string to a compiled RegExp.
 *
- *
- *
- * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
+ * Expands all tokens and attempts to compile the result as a RegExp
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
 *
- *
- *
- *
+ * @remarks
+ * This function dynamically compiles regular expressions from template strings.
+ * If templates may come from untrusted sources, be aware of potential ReDoS
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+ * Consider validating pattern complexity or applying execution timeouts when
+ * running user-submitted patterns.
 *
- * @param
- * @returns
+ * @param template - Template string containing `{{token}}` placeholders
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
 *
  * @example
- * //
- *
- *
- *   separator: 'dash',
- *   format: '{bullet}+ {num} {dash}'
- * });
- *
- * @example
- * // Using default pattern
- * const regex = generateNumberedRegex({
- *   numbering: 'arabic-indic',
- *   separator: 'dash'
- * });
- * const match = regex.exec('٥ - نص');
- *
- * @example
- * // With 'none' separator
- * const regex = generateNumberedRegex({
- *   numbering: 'latin',
- *   separator: 'none'
- * });
- * const match = regex.exec('5 text');
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
+ * templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
+ * templateToRegex('(((') // → null (invalid regex)
  */
-declare
+declare const templateToRegex: (template: string) => RegExp | null;
 /**
- *
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
 *
- *
- *
- * - * (asterisk)
- * - ° (degree)
- * - - (dash)
+ * Useful for documentation, validation, or building user interfaces
+ * that show available tokens.
 *
- * @returns
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
 *
  * @example
- *
- *
- * // match.groups.content -> 'نقطة'
+ * getAvailableTokens()
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
  */
-declare
+declare const getAvailableTokens: () => string[];
 /**
- *
+ * Gets the regex pattern for a specific token name.
 *
- *
- *
- * - ## Heading 2
- * - ### Heading 3
- * - etc.
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
+ * without any expansion or capture group wrapping.
 *
- * @
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
 *
  * @example
- *
- *
- * //
- * // match.groups.content -> 'عنوان فرعي'
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash') // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
  */
-declare
+declare const getTokenPattern: (tokenName: string) => string | undefined;
 //#endregion
-export {
+export { type Breakpoint, type BreakpointRule, type ExpandResult, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, containsTokens, escapeRegex, expandTokens, expandTokensWithCaptures, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, templateToRegex };
 //# sourceMappingURL=index.d.mts.map
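For completeness, the token helpers exported above can also be used directly when building patterns outside of split rules. A sketch; the exact expansion of each token is defined by `TOKEN_PATTERNS`:

```ts
import {
  containsTokens,
  expandTokens,
  expandTokensWithCaptures,
  getAvailableTokens,
  getTokenPattern,
  templateToRegex,
} from 'flappa-doormal';

getAvailableTokens();            // e.g. ['bab', 'basmala', 'bullet', 'dash', ...]
getTokenPattern('dash');         // raw pattern string behind {{dash}}
containsTokens('{{raqms}} -');   // true

// Expand a template to a plain pattern string, or keep the capture metadata.
const plain = expandTokens('^{{raqms}} {{dash}} ');
const { pattern, captureNames } = expandTokensWithCaptures('^{{raqms:num}} {{dash}} ');

// Or compile directly; null is returned if the expanded pattern is not a valid regex.
const re = templateToRegex('^{{raqms}} {{dash}} ');
re?.test('٥ - نص'); // true: Arabic-Indic number, dash, text
```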