flappa-doormal 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +325 -0
- package/README.md +477 -199
- package/dist/index.d.mts +871 -327
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1611 -393
- package/dist/index.mjs.map +1 -1
- package/package.json +13 -10
package/dist/index.d.mts
CHANGED
@@ -1,460 +1,1004 @@
-//#region src/
+//#region src/segmentation/fuzzy.d.ts
+/**
+ * Fuzzy matching utilities for Arabic text.
+ *
+ * Provides diacritic-insensitive and character-equivalence matching, so text
+ * matches regardless of:
+ * - Diacritical marks (harakat/tashkeel): فَتْحَة، ضَمَّة، كَسْرَة، سُكُون، شَدَّة، تَنْوين
+ * - Character equivalences: ا↔آ↔أ↔إ, ة↔ه, ى↔ي
+ *
+ * @module fuzzy
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('حدثنا');
+ * new RegExp(pattern, 'u').test('حَدَّثَنَا') // → true
+ */
+/**
+ * Escapes a string for safe inclusion in a regular expression.
+ * Escapes all regex metacharacters: `.*+?^${}()|[\]\\`
+ *
+ * @param s - Any string to escape
+ * @returns String with regex metacharacters escaped
+ *
+ * @example
+ * escapeRegex('hello.world') // → 'hello\\.world'
+ * escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
+ */
+declare const escapeRegex: (s: string) => string;
+/**
+ * Creates a diacritic-insensitive regex pattern for Arabic text matching.
+ *
+ * Each character in the input is expanded to its equivalence class (if any)
+ * and followed by an optional diacritics matcher, so the result matches:
+ * - `حدثنا` with `حَدَّثَنَا` (full diacritics)
+ * - `الإيمان` with `الايمان` (alef variants)
+ * - `صلاة` with `صلاه` (ta marbuta ↔ ha)
+ *
+ * @param text - Input Arabic text to make diacritic-insensitive
+ * @returns Regex pattern string that matches the text with or without diacritics
+ *
+ * @example
+ * const pattern = makeDiacriticInsensitive('باب');
+ * new RegExp(pattern, 'u').test('بَابٌ') // → true
+ *
+ * @example
+ * // Applied internally by split rules with `fuzzy: true`
+ * { lineStartsWith: ['باب'], split: 'at', fuzzy: true }
+ */
+declare const makeDiacriticInsensitive: (text: string) => string;
+//#endregion
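A minimal usage sketch for the fuzzy helpers above; the `flappa-doormal` import specifier is assumed from the package name.

    import { escapeRegex, makeDiacriticInsensitive } from 'flappa-doormal';

    // Diacritic-insensitive matcher for a chapter marker
    const babPattern = makeDiacriticInsensitive('باب');
    new RegExp(`^${babPattern}`, 'u').test('بَابُ الصَّلَاةِ'); // true

    // Embed literal user text safely inside a larger pattern
    const literal = new RegExp(`^${escapeRegex('(1.2)')} `, 'u');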
+//#region src/segmentation/types.d.ts
+/**
+ * Literal regex pattern rule - no token expansion is applied.
+ *
+ * Use this when you need full control over the regex pattern. If the regex
+ * contains capturing groups, the captured content is used as the segment content.
+ *
+ * @example
+ * { regex: '^[٠-٩]+ - ', split: 'at' }
+ * @example
+ * // Capture group - content after the marker becomes segment content
+ * { regex: '^[٠-٩]+ - (.*)', split: 'at' }
+ */
+type RegexPattern = {
+  /** Raw regex pattern string (no token expansion) */
+  regex: string;
+};
+/**
+ * Template pattern rule - expands `{{tokens}}` before compiling to regex.
+ * Supports all tokens defined in `TOKEN_PATTERNS` and named capture syntax.
+ *
+ * @example
+ * { template: '^{{raqms}} {{dash}}', split: 'at' }
+ * @example
+ * // Named capture to extract the hadith number into metadata
+ * { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }
+ *
+ * @see TOKEN_PATTERNS for available tokens
+ */
+type TemplatePattern = {
+  /** Template string with `{{token}}` or `{{token:name}}` placeholders */
+  template: string;
+};
+/**
+ * Line-start pattern rule - matches lines starting with any of the given patterns.
+ *
+ * Syntactic sugar for `^(?:pattern1|pattern2|...)`. The matched marker is
+ * **included** in the segment content. Token expansion is applied to each
+ * pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineStartsWith: ['## ', '### '], split: 'at' }
+ * @example
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ */
+type LineStartsWithPattern = {
+  /** Array of patterns that mark line beginnings (marker included in content) */
+  lineStartsWith: string[];
+};
+/**
+ * Line-start-after pattern rule - matches lines starting with patterns,
+ * but **excludes** the marker from the segment content.
+ *
+ * Behaves like `lineStartsWith` but strips the marker from the output. The
+ * segment content starts after the marker and extends to the next split point
+ * (not just the end of the matching line). Token expansion is applied to each
+ * pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }
+ * @example
+ * // Extract the hadith number to metadata while stripping the prefix
+ * { lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at' }
+ */
+type LineStartsAfterPattern = {
+  /** Array of patterns that mark line beginnings (marker excluded from content) */
+  lineStartsAfter: string[];
+};
+/**
+ * Line-end pattern rule - matches lines ending with any of the given patterns.
+ *
+ * Syntactic sugar for `(?:pattern1|pattern2|...)$`. Token expansion is applied
+ * to each pattern; use `fuzzy: true` for diacritic-insensitive Arabic matching.
+ *
+ * @example
+ * { lineEndsWith: ['۔', '؟', '!'], split: 'after' }
+ */
+type LineEndsWithPattern = {
+  /** Array of patterns that mark line endings */
+  lineEndsWith: string[];
+};
+/**
+ * Union of all pattern types for split rules. Each rule must have exactly ONE
+ * pattern type: `regex`, `template`, `lineStartsWith`, `lineStartsAfter`, or
+ * `lineEndsWith`.
+ */
+type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
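A hedged sketch showing one rule of each pattern shape in the union above; token names such as `{{raqms}}` and `{{dash}}` are taken from the documented examples, and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    const rules: SplitRule[] = [
        { regex: '^[٠-٩]+ - (.*)', split: 'at' }, // literal regex; the capture becomes segment content
        { template: '^{{raqms:hadithNum}} {{dash}}', split: 'at' }, // token template with named capture
        { lineStartsWith: ['## ', '### '], split: 'at' }, // marker kept in content
        { lineStartsAfter: ['{{raqms}} {{dash}} '], split: 'at' }, // marker stripped from content
        { lineEndsWith: ['۔', '؟', '!'], split: 'after' }, // split after matching line endings
    ];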
-/** The type of marker to look for */
-type: MarkerType;
-/** For numbered markers, the digit style */
-numbering?: NumberingStyle;
-/** The separator that follows the marker */
-separator?: SeparatorStyle | string;
+/**
+ * Configuration for how and where to split content when a pattern matches.
+ *
+ * Controls the split position relative to matches, which occurrences to
+ * split on, page span limits, and fuzzy matching for Arabic text.
+ */
+type SplitBehavior = {
+  /**
+   * Where to split relative to the match.
+   * - `'at'`: New segment starts at the match position
+   * - `'after'`: New segment starts after the match ends
+   */
+  split: 'at' | 'after';
+  /**
+   * Which occurrence(s) to split on: `'all'` (every match, the default),
+   * `'first'`, or `'last'`.
+   *
+   * When `maxSpan` is set, occurrence filtering is applied per sliding window
+   * rather than globally. With `'last'`, the algorithm prefers longer segments
+   * by looking as far ahead as allowed before selecting the last match in the window.
+   *
+   * @default 'all'
+   */
+  occurrence?: 'first' | 'last' | 'all';
+  /**
+   * Maximum page ID difference allowed when looking ahead for split points.
+   *
+   * Uses a sliding window that prefers longer segments:
+   * 1. Start from the first page of the current segment
+   * 2. Look for matches within pages where `pageId - startPageId <= maxSpan`
+   * 3. Apply the occurrence filter (e.g., 'last') to select a match
+   * 4. The next window starts from the page after the match
+   *
+   * `maxSpan: 1` looks 1 page ahead (segments span at most 2 pages);
+   * `undefined` means no limit (the entire content is treated as one group).
+   * With non-consecutive page IDs, the actual ID difference is used, not the
+   * array index: pages 1 and 5 have a difference of 4.
+   *
+   * @example
+   * // Split at the last period, looking up to 1 page ahead
+   * { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 }
+   */
+  maxSpan?: number;
+  /**
+   * Enable diacritic-insensitive matching for Arabic text.
+   *
+   * When `true`, patterns in `lineStartsWith`, `lineEndsWith`, and
+   * `lineStartsAfter` match regardless of diacritics (harakat/tashkeel) and
+   * character equivalences (ا/آ/أ/إ, ة/ه, ى/ي).
+   *
+   * **Note**: Does NOT apply to `regex` or `template` patterns. For templates,
+   * apply fuzzy matching manually using `makeDiacriticInsensitive()`.
+   *
+   * @default false
+   */
+  fuzzy?: boolean;
+};
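A short sketch combining the behavior fields above (occurrence, maxSpan, fuzzy); the values mirror the documented examples, and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    // Prefer longer segments: take the last period within a one-page look-ahead window
    const bySentence: SplitRule = { lineEndsWith: ['.'], split: 'after', occurrence: 'last', maxSpan: 1 };

    // Diacritic-insensitive chapter marker
    const byBab: SplitRule = { lineStartsWith: ['باب'], split: 'at', fuzzy: true };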
+/**
+ * A single page ID or a range of page IDs.
+ * - `number`: A single page ID
+ * - `[number, number]`: A range from first to second (inclusive)
+ *
+ * @example
+ * 5         // Single page 5
+ * [10, 20]  // Pages 10 through 20 (inclusive)
+ */
+type PageRange = number | [number, number];
+/**
+ * Optional constraints and metadata for a split rule.
+ *
+ * Use constraints to limit which pages a rule applies to, and metadata to
+ * attach arbitrary data to resulting segments.
+ */
+type RuleConstraints = {
+  /**
+   * Minimum page ID for this rule to apply. Matches on pages with `id < min` are ignored.
+   *
+   * @example
+   * // Only apply the rule starting from page 10
+   * { min: 10, lineStartsWith: ['##'], split: 'at' }
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this rule to apply. Matches on pages with `id > max` are ignored.
+   *
+   * @example
+   * // Only apply the rule up to page 100
+   * { max: 100, lineStartsWith: ['##'], split: 'at' }
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this rule, so the rule does
+   * not have to be repeated with different min/max values.
+   *
+   * @example
+   * { exclude: [1, 2, 5] }            // specific pages
+   * @example
+   * { exclude: [[1, 10], [50, 100]] } // page ranges
+   * @example
+   * { exclude: [1, [5, 10], 50] }     // mix of single pages and ranges
+   */
+  exclude?: PageRange[];
+  /**
+   * Arbitrary metadata attached to segments matching this rule.
+   *
+   * This metadata is merged with any named captures from the pattern. Named
+   * captures (e.g., `{{raqms:num}}`) take precedence over static metadata with
+   * the same key.
+   *
+   * @example
+   * { lineStartsWith: ['{{bab}}'], split: 'at', meta: { type: 'chapter' } }
+   */
+  meta?: Record<string, unknown>;
+  /**
+   * Fallback behavior when no matches are found within a maxSpan boundary.
+   * - `'page'`: Create split points at page boundaries
+   * - `undefined`: No fallback (the default behavior)
+   */
+  fallback?: 'page';
+};
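A sketch of a rule restricted by the constraints above; the page numbers are illustrative and the import specifier is assumed from the package name.

    import type { SplitRule } from 'flappa-doormal';

    // Apply the heading rule only on pages 10-100, skipping a front-matter range
    const constrained: SplitRule = {
        lineStartsWith: ['## '],
        split: 'at',
        min: 10,
        max: 100,
        exclude: [1, [2, 5]],
        meta: { type: 'section' },
    };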
-//#endregion
-//#region src/markers/defaults.d.ts
-declare const DEFAULT_NUMBERING: NumberingStyle;
-/** Default separator style for markers */
-declare const DEFAULT_SEPARATOR: SeparatorStyle;
-/** Default separator pattern (used when separator is a custom string) */
-declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";
-/** Numbering patterns mapped by style */
-declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;
-/** Separator patterns mapped by style */
-declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
-//#endregion
-//#region src/markers/generator.d.ts
-/**
- * Generates a regex pattern from a marker configuration.
- * Always returns a regex with three named capture groups:
- * - full: Complete match including marker
- * - marker: Just the marker part (for metadata/indexing)
- * - content: Clean content without marker (for LLM processing)
-/**
- * Default phrase lists for preset marker types.
- * Export these so users can extend them.
- */
-/**
- * Common hadith narrator phrases (diacritic-insensitive)
- * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
+/**
+ * A complete split rule combining pattern, behavior, and constraints.
+ *
+ * Each rule must specify:
+ * - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
+ *   `lineStartsAfter`, or `lineEndsWith`
+ * - **Split behavior**: `split` (required), `occurrence`, `maxSpan`, `fuzzy`
+ * - **Constraints** (optional): `min`, `max`, `meta`
+ *
+ * @example
+ * // Basic rule: split at markdown headers
+ * const rule: SplitRule = {
+ *   lineStartsWith: ['## ', '### '],
+ *   split: 'at',
+ *   meta: { type: 'section' }
+ * };
+ *
+ * @example
+ * // Advanced rule: extract hadith numbers with fuzzy Arabic matching
+ * const rule: SplitRule = {
+ *   lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *   split: 'at',
+ *   fuzzy: true,
+ *   min: 5,
+ *   max: 500,
+ *   meta: { type: 'hadith' }
+ * };
+ */
+type SplitRule = PatternType & SplitBehavior & RuleConstraints;
+/**
+ * Input page structure for segmentation.
+ *
+ * Each page represents a logical unit of content (e.g., a book page, a
+ * document section) that can be tracked across segment boundaries.
+ *
+ * @example
+ * const pages: Page[] = [
+ *   { id: 1, content: '## Chapter 1\nFirst paragraph...' },
+ *   { id: 2, content: 'Continued text...\n## Chapter 2' },
+ * ];
+ */
-  readonly content: "(.*)";
-  readonly dash: "[-–—ـ]";
-  readonly dot: "\\.";
-  readonly latin: "\\d+";
-  readonly letter: "[أ-ي]";
-  readonly num: "[\\u0660-\\u0669]+";
-  readonly paren: "\\)";
-  readonly s: "\\s?";
-  readonly slash: "/";
-  readonly space: "\\s+";
+type Page = {
+  /**
+   * Unique page/entry ID used for:
+   * - `maxSpan` grouping (segments spanning multiple pages)
+   * - `min`/`max` constraint filtering
+   * - `from`/`to` tracking in output segments
+   */
+  id: number;
+  /**
+   * Raw page content (may contain HTML).
+   *
+   * Line endings are normalized internally (`\r\n` and `\r` → `\n`).
+   * Use an HTML-to-Markdown converter or `stripHtmlTags()` to preprocess HTML.
+   */
+  content: string;
+};
-type TokenMap = Record<string, string>;
-//#endregion
-//#region src/markers/template-parser.d.ts
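A small sketch of the Page input shape; ids need not be consecutive, and the id difference (not the array index) is what maxSpan and min/max compare against. The import specifier is assumed from the package name.

    import type { Page } from 'flappa-doormal';

    const pages: Page[] = [
        { id: 1, content: '## Chapter 1\nFirst paragraph...' },
        { id: 2, content: 'Continued text...\n## Chapter 2' },
        { id: 5, content: 'A later page; ids 2 and 5 differ by 3 for maxSpan purposes' },
    ];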
+/**
+ * A breakpoint pattern with optional page constraints.
+ *
+ * Use this to control which pages a breakpoint pattern applies to. Patterns
+ * outside the specified range are skipped, allowing the next breakpoint
+ * pattern (or fallback) to be tried.
+ *
+ * @example
+ * // Only apply punctuation-based breaking from page 10 onwards
+ * { pattern: '{{tarqim}}\\s*', min: 10 }
+ * @example
+ * // Apply to a specific page range (pages 10-50)
+ * { pattern: '{{tarqim}}\\s*', min: 10, max: 50 }
+ */
+type BreakpointRule = {
+  /**
+   * Regex pattern for breaking (supports token expansion).
+   * Empty string `''` means fall back to the page boundary.
+   */
+  pattern: string;
+  /**
+   * Minimum page ID for this breakpoint to apply.
+   * Segments starting before this page skip this pattern.
+   */
+  min?: number;
+  /**
+   * Maximum page ID for this breakpoint to apply.
+   * Segments starting after this page skip this pattern.
+   */
+  max?: number;
+  /**
+   * Specific pages or page ranges to exclude from this breakpoint, so the
+   * breakpoint does not have to be repeated with different min/max values.
+   *
+   * @example
+   * { pattern: '\\.\\s*', exclude: [1, 2, 5] }        // specific pages
+   * @example
+   * { pattern: '{{tarqim}}\\s*', exclude: [[1, 10]] } // front matter, pages 1-10
+   * @example
+   * { pattern: '\\.\\s*', exclude: [1, [5, 10], 50] } // mix of single pages and ranges
+   */
+  exclude?: PageRange[];
+  /**
+   * Skip this breakpoint if the segment content matches this pattern.
+   *
+   * Supports token expansion (e.g., `{{kitab}}`). When the segment's remaining
+   * content matches this regex, the breakpoint pattern is skipped and the next
+   * breakpoint in the array is tried. Useful for excluding title pages or
+   * front matter without specifying explicit page ranges.
+   *
+   * @example
+   * // Skip the punctuation breakpoint for short content (likely titles)
+   * { pattern: '{{tarqim}}\\s*', skipWhen: '^.{1,20}$' }
+   * @example
+   * // Skip for content containing a "kitab" (book) marker
+   * { pattern: '\\.\\s*', skipWhen: '{{kitab}}' }
+   */
+  skipWhen?: string;
+};
-  /** Custom token map to use instead of default TOKENS */
-  tokens?: TokenMap;
-}
+/**
+ * A breakpoint can be a simple string pattern or an object with constraints.
+ *
+ * String breakpoints apply to all pages. Object breakpoints can specify
+ * `min`/`max` to limit which pages they apply to.
+ *
+ * @example
+ * '{{tarqim}}\\s*'                       // string (applies everywhere)
+ * @example
+ * { pattern: '{{tarqim}}\\s*', min: 10 } // object with constraints (page 10+)
+ */
+type Breakpoint = string | BreakpointRule;
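A hedged sketch of a breakpoint list mixing string and object forms; `{{tarqim}}` and the skipWhen/exclude values come from the documented examples, and the import specifier is assumed from the package name.

    import type { Breakpoint } from 'flappa-doormal';

    // Tried in order for oversized segments; '' falls back to the page boundary
    const breakpoints: Breakpoint[] = [
        { pattern: '{{tarqim}}\\s*', min: 10, skipWhen: '^.{1,20}$' }, // punctuation, but not on short title-like content
        { pattern: '\\n', exclude: [[1, 10]] }, // line breaks, skipping front matter
        '', // page boundary as the last resort
    ];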
- * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
- * @param options - Optional configuration
- * @returns Regex pattern string with named groups
+/**
+ * Logger interface for custom logging implementations.
+ *
+ * All methods are optional - only implement the verbosity levels you need.
+ * When no logger is provided, no logging overhead is incurred.
+ *
+ * Compatible with the Logger interface from ffmpeg-simplified and similar libraries.
+ *
+ * @example
+ * // Simple console logger
+ * const logger: Logger = { debug: console.debug, info: console.info, warn: console.warn, error: console.error };
+ *
+ * @example
+ * // Production logger (only warnings and errors)
+ * const prodLogger: Logger = {
+ *   warn: (msg, ...args) => myLoggingService.warn(msg, args),
+ *   error: (msg, ...args) => myLoggingService.error(msg, args),
+ * };
+ */
+interface Logger {
+  /** Log a debug message (verbose debugging output) */
+  debug?: (message: string, ...args: unknown[]) => void;
+  /** Log an error message (critical failures) */
+  error?: (message: string, ...args: unknown[]) => void;
+  /** Log an informational message (key progress points) */
+  info?: (message: string, ...args: unknown[]) => void;
+  /** Log a trace message (extremely verbose, per-iteration details) */
+  trace?: (message: string, ...args: unknown[]) => void;
+  /** Log a warning message (potential issues) */
+  warn?: (message: string, ...args: unknown[]) => void;
+}
+/**
+ * Segmentation options controlling how pages are split.
+ *
+ * @example
+ * // Basic structural rules only
+ * const options: SegmentationOptions = {
+ *   rules: [
+ *     { lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } },
+ *     { lineStartsWith: ['### '], split: 'at', meta: { type: 'section' } },
+ *   ]
+ * };
+ *
+ * @example
+ * // With breakpoints for oversized segments
+ * const options: SegmentationOptions = {
+ *   rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
+ *   maxPages: 2,
+ *   breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
+ *   prefer: 'longer'
+ * };
+ */
+type SegmentationOptions = {
+  /**
+   * Rules applied in order to find split points.
+   *
+   * All rules are evaluated against the content, and their matches are
+   * combined to determine the final split points. The first matching rule's
+   * metadata is used for each segment.
+   */
+  rules?: SplitRule[];
+  /**
+   * Maximum pages per segment before breakpoints are applied.
+   *
+   * When a segment spans more pages than this limit, the `breakpoints`
+   * patterns are tried (in order) to find a suitable break point within the
+   * allowed window. Structural markers (from rules) always take precedence -
+   * segments are only broken within their rule-defined boundaries, never across them.
+   *
+   * @example
+   * { maxPages: 2, breakpoints: ['{{tarqim}}', ''] }
+   */
+  maxPages?: number;
+  /**
+   * Patterns tried in order to break oversized segments.
+   *
+   * Each pattern is tried until one matches within the allowed page window.
+   * Supports token expansion (e.g., `{{tarqim}}`). An empty string `''`
+   * matches the page boundary (always succeeds as the ultimate fallback).
+   * Patterns can be simple strings (apply everywhere) or objects with
+   * `min`/`max` constraints. Put preferred break styles first:
+   * - `{{tarqim}}\\s*` - break at sentence-ending punctuation
+   * - `\\n` - break at line breaks (useful for OCR content)
+   * - `''` - break at the page boundary (always works)
+   *
+   * Only applied to segments exceeding `maxPages`.
+   *
+   * @example
+   * breakpoints: ['{{tarqim}}\\s*', '\\n', '']
+   * @example
+   * breakpoints: [
+   *   { pattern: '{{tarqim}}\\s*', min: 10 }, // only from page 10+
+   *   ''                                      // fallback for pages 1-9
+   * ]
+   */
+  breakpoints?: Breakpoint[];
+  /**
+   * When multiple matches exist for a breakpoint pattern, select:
+   * - `'longer'` - last match in the window (prefers longer segments)
+   * - `'shorter'` - first match in the window (prefers shorter segments)
+   *
+   * @default 'longer'
+   */
+  prefer?: 'longer' | 'shorter';
+  /**
+   * Optional logger for debugging segmentation.
+   *
+   * Provide a logger to receive detailed information about pattern matching,
+   * page tracking, and breakpoint processing. When not provided, no logging
+   * overhead is incurred (the methods are never called). Verbosity levels:
+   * `trace` (per-iteration details), `debug`, `info`, `warn`, `error`.
+   *
+   * @example
+   * logger: { debug: console.debug, info: console.info, warn: console.warn }
+   * @example
+   * logger: {
+   *   debug: (msg, ...args) => winston.debug(msg, { meta: args }),
+   *   error: (msg, ...args) => winston.error(msg, { meta: args }),
+   * }
+   */
+  logger?: Logger;
+};
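A combined options sketch assembled from the documented examples above; the import specifier is assumed from the package name.

    import type { SegmentationOptions } from 'flappa-doormal';

    const options: SegmentationOptions = {
        rules: [{ lineStartsWith: ['{{fasl}}'], split: 'at' }],
        maxPages: 2,
        breakpoints: ['{{tarqim}}\\s*', '\\n', ''],
        prefer: 'longer',
        logger: { warn: console.warn, error: console.error },
    };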
- * @returns Validation result with errors if invalid
+/**
+ * Output segment produced by `segmentPages()`.
+ *
+ * Each segment contains extracted content, page references, and optional
+ * metadata from the matched rule and captured groups.
+ *
+ * @example
+ * // Simple segment on a single page
+ * { content: '## Chapter 1\nIntroduction...', from: 1, meta: { type: 'chapter' } }
+ *
+ * @example
+ * // Segment spanning pages 5-7 with a captured hadith number
+ * { content: 'Hadith text...', from: 5, to: 7, meta: { type: 'hadith', hadithNum: '٤٢' } }
+ */
+type Segment = {
+  /**
+   * Segment content with:
+   * - Leading/trailing whitespace trimmed
+   * - Page breaks converted to spaces (for multi-page segments)
+   * - Markers stripped (for `lineStartsAfter` patterns)
+   */
+  content: string;
+  /** Starting page ID (from `Page.id`). */
+  from: number;
+  /**
+   * Ending page ID if the segment spans multiple pages. When `undefined`,
+   * the segment is contained within a single page.
+   */
+  to?: number;
+  /**
+   * Combined metadata from:
+   * 1. The rule's `meta` property (static metadata)
+   * 2. Named captures from patterns (e.g., `{{raqms:num}}` → `{ num: '٤٢' }`)
+   *
+   * Named captures override static metadata with the same key.
+   */
+  meta?: Record<string, unknown>;
+};
+//#endregion
-//#region src/
+//#region src/segmentation/segmenter.d.ts
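A tiny sketch of consuming Segment values; note that `to` is only set when a segment crosses a page boundary. The import specifier is assumed from the package name.

    import type { Segment } from 'flappa-doormal';

    const label = (s: Segment): string =>
        `${String(s.meta?.type ?? 'segment')} (pages ${s.from}${s.to !== undefined ? `-${s.to}` : ''})`;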
+/**
+ * Segments pages of content based on pattern-matching rules.
+ *
+ * This is the main entry point for the segmentation engine. It takes an array
+ * of pages and applies the provided rules to identify split points, producing
+ * an array of segments with content, page references, and metadata.
+ *
+ * @param pages - Array of pages with id and content
+ * @param options - Segmentation options including splitting rules
+ * @returns Array of segments with content, from/to page references, and optional metadata
+ *
+ * @example
+ * // Split markdown by headers
+ * const segments = segmentPages(pages, {
+ *   rules: [{ lineStartsWith: ['## '], split: 'at', meta: { type: 'chapter' } }]
+ * });
+ *
+ * @example
+ * // Split Arabic hadith text with number extraction
+ * const segments = segmentPages(pages, {
+ *   rules: [{
+ *     lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '],
+ *     split: 'at',
+ *     fuzzy: true,
+ *     meta: { type: 'hadith' }
+ *   }]
+ * });
+ *
+ * @example
+ * // Multiple rules with page constraints
+ * const segments = segmentPages(pages, {
+ *   rules: [
+ *     { lineStartsWith: ['{{kitab}}'], split: 'at', meta: { type: 'book' } },
+ *     { lineStartsWith: ['{{bab}}'], split: 'at', min: 10, meta: { type: 'chapter' } },
+ *     { regex: '^[٠-٩]+ - ', split: 'at', meta: { type: 'hadith' } }
+ *   ]
+ * });
+ */
+declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
+//#endregion
+//#region src/segmentation/textUtils.d.ts
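An end-to-end sketch of the main entry point, assembled from the documented examples; the Arabic sample text and the import specifier are illustrative assumptions.

    import { segmentPages, type Page } from 'flappa-doormal';

    const pages: Page[] = [
        { id: 1, content: '١ - حدثنا فلان...\n٢ - حدثنا آخر...' },
        { id: 2, content: '٣ - حدثنا ثالث...' },
    ];

    const segments = segmentPages(pages, {
        rules: [{ lineStartsAfter: ['{{raqms:num}} {{dash}} '], split: 'at', fuzzy: true, meta: { type: 'hadith' } }],
    });
    // Each segment carries content, from/to page ids, and meta including the captured `num`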
+/**
+ * Strip all HTML tags from content, keeping only text.
+ *
+ * @param html - HTML content
+ * @returns Plain text content
+ */
+declare const stripHtmlTags: (html: string) => string;
+/**
+ * Normalizes line endings to Unix-style (`\n`).
+ *
+ * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
+ * for consistent pattern matching across platforms.
+ *
- * const match = regex.exec('باب الصلاة');
- * // match.groups.marker -> 'باب'
- * // match.groups.content -> ' الصلاة'
+ * @param content - Raw content with potentially mixed line endings
+ * @returns Content with all line endings normalized to `\n`
+ */
+declare const normalizeLineEndings: (content: string) => string;
+//#endregion
+//#region src/segmentation/tokens.d.ts
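A small preprocessing sketch using the text utilities above before segmenting; the raw HTML sample is illustrative and the import specifier is assumed from the package name.

    import { normalizeLineEndings, segmentPages, stripHtmlTags, type Page } from 'flappa-doormal';

    const raw = [{ id: 1, content: '<p>## Intro</p>\r\nBody text' }];
    const pages: Page[] = raw.map((p) => ({ id: p.id, content: normalizeLineEndings(stripHtmlTags(p.content)) }));
    const segments = segmentPages(pages, { rules: [{ lineStartsWith: ['## '], split: 'at' }] });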
+/**
+ * Token-based template system for Arabic text pattern matching.
+ *
+ * This module provides a human-readable way to define regex patterns using
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
+ * named capture groups for extracting matched values into metadata.
+ *
+ * @module tokens
+ *
+ * @example
+ * expandTokens('{{raqms}} {{dash}}')
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+ */
-declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
-declare function generateSquareBracketRegex(): RegExp;
- * Generates a regular expression for number-letter-separator markers.
+/**
+ * Token definitions mapping human-readable token names to regex patterns.
+ *
+ * Tokens are used in template strings with double-brace syntax:
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
+ *
+ * @remarks
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
+ *
+ * @example
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
+ * @example
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
+ * @example
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
+ */
+declare const TOKEN_PATTERNS: Record<string, string>;
+/**
+ * Checks if a query string contains template tokens.
+ *
+ * Performs a quick test for `{{token}}` patterns without actually expanding
+ * them. Useful for determining whether to apply token expansion to a string.
+ *
+ * @param query - String to check for tokens
+ * @returns `true` if the string contains at least one `{{token}}` pattern
+ *
+ * @example
+ * containsTokens('{{raqms}} {{dash}}') // → true
+ * containsTokens('plain text')         // → false
+ * containsTokens('[٠-٩]+ - ')          // → false (raw regex, no tokens)
+ */
+declare const containsTokens: (query: string) => boolean;
+/**
+ * Result from expanding tokens with capture information.
+ */
+type ExpandResult = {
+  /** The expanded regex pattern string with all tokens replaced. Named captures use `(?<name>pattern)` syntax. */
+  pattern: string;
+  /** Names of capture groups extracted from `{{token:name}}` syntax. Empty array if none. */
+  captureNames: string[];
+  /** Whether the pattern has any named capturing groups. Equivalent to `captureNames.length > 0`. */
+  hasCaptures: boolean;
+};
+/**
+ * Expands template tokens with support for named captures.
+ *
+ * This is the primary token expansion function and handles all token syntax:
+ * - `{{token}}` → the token's pattern (no capture group)
+ * - `{{token:name}}` → `(?<name>pattern)` (named capture)
+ * - `{{:name}}` → `(?<name>.+)` (capture anything)
+ *
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
+ *
+ * @param query - The template string containing tokens
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
+ *   Applied to both token patterns and plain Arabic text between tokens.
+ *   Typically `makeDiacriticInsensitive` from the fuzzy module.
+ * @returns Object with the expanded pattern, capture names, and capture flag
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
+ *
+ * @example
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
+ * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
+ *
+ * @example
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
+ */
+declare const expandTokensWithCaptures: (query: string, fuzzyTransform?: (pattern: string) => string) => ExpandResult;
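A sketch of manual token expansion with captures, mirroring the documented examples; the sample text and import specifier are assumptions.

    import { containsTokens, expandTokensWithCaptures } from 'flappa-doormal';

    const template = '{{raqms:num}} {{dash}} {{:content}}';
    if (containsTokens(template)) {
        const { pattern, captureNames } = expandTokensWithCaptures(template);
        const match = new RegExp(pattern, 'u').exec('٤٢ - حدثنا يحيى');
        // captureNames → ['num', 'content']; match?.groups?.num → '٤٢'
    }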
+/**
+ * Expands template tokens in a query string to their regex equivalents.
+ *
+ * This is the simple version without capture support. It returns only the
+ * expanded pattern string, not capture metadata. Unknown tokens are left
+ * as-is, allowing for partial templates.
+ *
+ * @param query - Template string containing `{{token}}` placeholders
+ * @returns Expanded regex pattern string
+ *
+ * @example
+ * expandTokens('، {{raqms}}')      // → '، [\\u0660-\\u0669]+'
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
+ * expandTokens('{{unknown}}')      // → '{{unknown}}' (left as-is)
+ *
+ * @see expandTokensWithCaptures for full capture group support
+ */
+declare const expandTokens: (query: string) => string;
- * - ٥ - (single number, separator)
- * - 5 (٦) - (number with parenthetical number)
- * - Separator 'none' generates pattern without separator
- * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
- * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
- * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
+/**
+ * Converts a template string to a compiled RegExp.
+ *
+ * Expands all tokens and attempts to compile the result as a RegExp with the
+ * Unicode flag. Returns `null` if the resulting pattern is invalid.
+ *
+ * @remarks
+ * This function dynamically compiles regular expressions from template strings.
+ * If templates may come from untrusted sources, be aware of potential ReDoS
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
+ * Consider validating pattern complexity or applying execution timeouts when
+ * running user-submitted patterns.
+ *
+ * @param template - Template string containing `{{token}}` placeholders
+ * @returns Compiled RegExp with the 'u' flag, or `null` if invalid
+ *
+ * @example
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
+ * templateToRegex('{{raqms}}+')  // → /[٠-٩]++/u if the engine accepts it, otherwise null
+ * templateToRegex('(((')         // → null (invalid regex)
+ */
+declare const templateToRegex: (template: string) => RegExp | null;
+/**
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
+ *
+ * Useful for documentation, validation, or building user interfaces that show
+ * available tokens.
+ *
+ * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
+ *
+ * @example
+ * getAvailableTokens()
+ * // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
+ */
+declare const getAvailableTokens: () => string[];
+/**
+ * Gets the regex pattern for a specific token name.
+ *
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`, without any
+ * expansion or capture group wrapping.
+ *
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
+ * @returns The regex pattern string, or `undefined` if the token doesn't exist
+ *
+ * @example
+ * getTokenPattern('raqms')   // → '[\\u0660-\\u0669]+'
+ * getTokenPattern('dash')    // → '[-–—ـ]'
+ * getTokenPattern('unknown') // → undefined
+ */
+declare const getTokenPattern: (tokenName: string) => string | undefined;
+//#endregion
+//#region src/pattern-detection.d.ts
+/**
+ * Pattern detection utilities for recognizing template tokens in Arabic text.
+ * Used to auto-detect patterns from user-highlighted text in the segmentation dialog.
+ *
+ * @module pattern-detection
+ */
+/**
+ * Result of detecting a token pattern in text
+ */
+type DetectedPattern = {
+  /** Token name from TOKEN_PATTERNS (e.g., 'raqms', 'dash') */
+  token: string;
+  /** The matched text */
+  match: string;
+  /** Start index in the original text */
+  index: number;
+  /** End index (exclusive) */
+  endIndex: number;
+};
+/**
+ * Analyzes text and returns all detected token patterns with their positions.
+ * Patterns are detected in priority order to avoid partial matches.
+ *
+ * @param text - The text to analyze for token patterns
+ * @returns Array of detected patterns sorted by position
+ *
+ * @example
+ * detectTokenPatterns("٣٤ - حدثنا")
+ * // Returns: [
+ * //   { token: 'raqms', match: '٣٤', index: 0, endIndex: 2 },
+ * //   { token: 'dash', match: '-', index: 3, endIndex: 4 },
+ * //   { token: 'naql', match: 'حدثنا', index: 5, endIndex: 10 }
+ * // ]
+ */
+declare const detectTokenPatterns: (text: string) => DetectedPattern[];
- * Matches heading levels using hash symbols:
- * - # Heading 1
- * - ## Heading 2
- * - ### Heading 3
+/**
+ * Generates a template pattern from text using detected tokens.
+ * Replaces matched portions with {{token}} syntax.
+ *
+ * @param text - Original text
+ * @param detected - Array of detected patterns from detectTokenPatterns
+ * @returns Template string with tokens, e.g., "{{raqms}} {{dash}} "
+ *
+ * @example
+ * const detected = detectTokenPatterns("٣٤ - ");
+ * generateTemplateFromText("٣٤ - ", detected);
+ * // Returns: "{{raqms}} {{dash}} "
+ */
+declare const generateTemplateFromText: (text: string, detected: DetectedPattern[]) => string;
+/**
+ * Determines the best pattern type for auto-generated rules based on detected patterns.
+ *
+ * @param detected - Array of detected patterns
+ * @returns Suggested pattern type and whether to use fuzzy matching
+ */
+declare const suggestPatternConfig: (detected: DetectedPattern[]) => {
+  patternType: "lineStartsWith" | "lineStartsAfter";
+  fuzzy: boolean;
+  metaType?: string;
+};
+/**
+ * Analyzes text and generates a complete suggested rule configuration.
+ *
+ * @param text - Highlighted text from the page
+ * @returns Suggested rule configuration or null if no patterns detected
+ */
+declare const analyzeTextForRule: (text: string) => {
+  template: string;
+  patternType: "lineStartsWith" | "lineStartsAfter";
+  fuzzy: boolean;
+  metaType?: string;
+  detected: DetectedPattern[];
+} | null;
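A sketch of the auto-detection flow; the exact suggested template depends on the detection internals, so the commented output is indicative only, and the import specifier is assumed from the package name.

    import { analyzeTextForRule, detectTokenPatterns, generateTemplateFromText } from 'flappa-doormal';

    const text = '٣٤ - حدثنا';
    const detected = detectTokenPatterns(text);
    const template = generateTemplateFromText(text, detected); // e.g. '{{raqms}} {{dash}} {{naql}}'
    const suggestion = analyzeTextForRule(text); // { template, patternType, fuzzy, metaType?, detected } or null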
 //#endregion
-export {
+export { type Breakpoint, type BreakpointRule, type DetectedPattern, type ExpandResult, type Logger, type Page, type PageRange, type Segment, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, analyzeTextForRule, containsTokens, detectTokenPatterns, escapeRegex, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, normalizeLineEndings, segmentPages, stripHtmlTags, suggestPatternConfig, templateToRegex };
 //# sourceMappingURL=index.d.mts.map