npm - flappa-doormal - Versions diffs - 2.19.0 → 2.21.0 - Mend

flappa-doormal 2.19.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/AGENTS.md +63 -11
package/README.md +383 -11
package/dist/index.d.mts +440 -132
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +2 -4445
package/dist/index.mjs.map +1 -1
package/dist/mcp/server.d.mts +1 -0
package/dist/mcp/server.mjs +156 -0
package/dist/mcp/server.mjs.map +1 -0
package/dist/segmentation-advisor-D375TL8-.mjs +6128 -0
package/dist/segmentation-advisor-D375TL8-.mjs.map +1 -0
package/package.json +18 -4

package/dist/index.d.mts CHANGED

@@ -122,6 +122,152 @@ type BreakpointRule = PageRangeConstraintWithExclude & {
  */
 type Breakpoint = string | BreakpointRule;
 //#endregion
+//#region src/types/dictionary.d.ts
+/**
+ * Dictionary v2 profile types for Shamela-style Arabic dictionary segmentation.
+ */
+type DictionaryHeadingClass = 'chapter' | 'entry' | 'marker' | 'cluster';
+type DictionaryHeadingScanClass = DictionaryHeadingClass | 'noise';
+type DictionarySegmentKind = 'chapter' | 'entry' | 'marker';
+type DictionarySegmentMeta = {
+  kind: DictionarySegmentKind;
+  lemma?: string;
+};
+/** Family key used by diagnostics and authoring tools. */
+type DictionaryFamilyUse = DictionaryFamily['use'];
+/** Rejection reason emitted by dictionary-profile diagnostics. */
+type DictionaryDiagnosticReason = 'qualifierTail' | 'structuralLeak' | 'intro' | 'authorityIntro' | 'stopLemma' | 'previousWord' | 'previousChar' | 'pageContinuation';
+type DictionaryGate = {
+  use: 'headingText';
+  match: string;
+  fuzzy?: boolean;
+} | {
+  use: 'headingToken';
+  token: 'bab' | 'fasl' | 'kitab';
+};
+type DictionaryProfileValidationIssueCode = 'invalid_version' | 'missing_zones' | 'duplicate_zone_name' | 'empty_zone_name' | 'empty_zone_families' | 'invalid_zone_page_range' | 'empty_heading_classes' | 'inert_heading_family' | 'empty_inline_prefixes' | 'invalid_gate_match' | 'invalid_gate_fuzzy' | 'duplicate_activate_after_gate' | 'invalid_stop_words' | 'invalid_previous_words' | 'invalid_previous_chars' | 'invalid_previous_word_scope' | 'invalid_authority_intro_precision' | 'invalid_continuation_precision';
+type DictionaryProfileValidationIssue = {
+  code: DictionaryProfileValidationIssueCode;
+  message: string;
+  path: string;
+  zoneName?: string;
+};
+type HeadingFamily = {
+  use: 'heading';
+  classes: DictionaryHeadingClass[];
+  emit: DictionarySegmentKind;
+  allowNextLineColon?: boolean;
+  allowSingleLetter?: boolean;
+};
+type LineEntryFamily = {
+  use: 'lineEntry';
+  wrappers?: 'none' | 'parentheses' | 'brackets' | 'curly' | 'any';
+  allowWhitespaceBeforeColon?: boolean;
+  allowMultiWord?: boolean;
+  emit: 'entry';
+};
+type InlineSubentryFamily = {
+  use: 'inlineSubentry';
+  prefixes?: string[];
+  stripPrefixesFromLemma?: boolean;
+  emit: 'entry';
+};
+type CodeLineFamily = {
+  use: 'codeLine';
+  wrappers?: 'none' | 'paired' | 'mismatched' | 'either';
+  emit: 'marker';
+};
+type PairedFormsFamily = {
+  use: 'pairedForms';
+  separator?: 'comma' | 'space';
+  emit: 'marker' | 'entry';
+  requireStatusTail?: boolean;
+};
+type DictionaryFamily = HeadingFamily | LineEntryFamily | InlineSubentryFamily | CodeLineFamily | PairedFormsFamily;
+type PageContinuationBlocker = {
+  use: 'pageContinuation';
+  appliesTo?: DictionaryFamily['use'][];
+  authorityPrecision?: 'high' | 'aggressive';
+};
+type IntroBlocker = {
+  use: 'intro';
+  appliesTo?: DictionaryFamily['use'][];
+};
+type AuthorityIntroBlocker = {
+  use: 'authorityIntro';
+  appliesTo?: DictionaryFamily['use'][];
+  precision?: 'high' | 'aggressive';
+};
+type StopLemmaBlocker = {
+  use: 'stopLemma';
+  appliesTo?: DictionaryFamily['use'][];
+  words: string[];
+};
+type PreviousWordBlocker = {
+  use: 'previousWord';
+  appliesTo?: DictionaryFamily['use'][];
+  words: string[];
+  scope?: 'samePage' | 'pageStart' | 'any';
+};
+type PreviousCharBlocker = {
+  use: 'previousChar';
+  appliesTo?: DictionaryFamily['use'][];
+  chars: string[];
+};
+type DictionaryBlocker = PageContinuationBlocker | IntroBlocker | AuthorityIntroBlocker | StopLemmaBlocker | PreviousWordBlocker | PreviousCharBlocker;
+type DictionaryZone = {
+  name: string;
+  when?: {
+    minPageId?: number;
+    maxPageId?: number;
+    activateAfter?: DictionaryGate[];
+  };
+  families: DictionaryFamily[];
+  blockers?: DictionaryBlocker[];
+};
+type ArabicDictionaryProfile = {
+  version: 2;
+  zones: DictionaryZone[];
+};
+/** Sampled accepted or rejected candidate from dictionary-profile diagnostics. */
+type DictionaryDiagnosticSample = {
+  accepted: boolean;
+  absoluteIndex: number;
+  family: DictionaryFamilyUse;
+  kind: DictionarySegmentKind;
+  lemma?: string;
+  line: number;
+  pageId: number;
+  reason?: DictionaryDiagnosticReason;
+  text: string;
+  zone: string;
+};
+/** Options for dictionary-profile diagnostics collection. */
+type DictionaryProfileDiagnosticsOptions = {
+  sampleLimit?: number;
+};
+/** Aggregate diagnostics for tuning a dictionary profile. */
+type DictionaryProfileDiagnostics = {
+  acceptedCount: number;
+  acceptedKinds: Record<DictionarySegmentKind, number>;
+  rejectionReasons: Record<DictionaryDiagnosticReason, number>;
+  familyCounts: Record<DictionaryFamilyUse, {
+    accepted: number;
+    rejected: number;
+  }>;
+  pageCount: number;
+  rejectedCount: number;
+  rejectedLemmas: Array<{
+    count: number;
+    lemma: string;
+  }>;
+  samples: DictionaryDiagnosticSample[];
+  zoneCounts: Record<string, {
+    accepted: number;
+    rejected: number;
+  }>;
+};
+//#endregion
 //#region src/types/rules.d.ts
 /**
  * Literal regex pattern rule - no token expansion or auto-escaping is applied.
@@ -265,7 +411,7 @@ type LineEndsWithPattern = {
  * This captures authoring intent in a serializable shape and is compiled into
  * a regex internally by the rule compiler.
  */
-interface DictionaryEntryPatternOptions {
+type DictionaryEntryPatternOptions = {
   /**
    * Words that should never be treated as lemmas when followed by a colon.
    *
@@ -310,7 +456,7 @@ interface DictionaryEntryPatternOptions {
    * @default 10
    */
   maxLetters?: number;
-}
+};
 /**
  * Arabic dictionary entry pattern rule - serializable headword matcher compiled internally.
  *
@@ -632,18 +778,13 @@ type PreprocessTransform = 'removeZeroWidth' | 'condenseEllipsis' | 'fixTrailing
  *   error: (msg, ...args) => myLoggingService.error(msg, args),
  * };
  */
-interface Logger {
-  /** Log a debug message (verbose debugging output) */
-  debug?: (message: string, ...args: unknown[]) => void;
-  /** Log an error message (critical failures) */
-  error?: (message: string, ...args: unknown[]) => void;
-  /** Log an informational message (key progress points) */
-  info?: (message: string, ...args: unknown[]) => void;
-  /** Log a trace message (extremely verbose, per-iteration details) */
-  trace?: (message: string, ...args: unknown[]) => void;
-  /** Log a warning message (potential issues) */
+type Logger = {
+  /** Log a debug message (verbose debugging output) */debug?: (message: string, ...args: unknown[]) => void; /** Log an error message (critical failures) */
+  error?: (message: string, ...args: unknown[]) => void; /** Log an informational message (key progress points) */
+  info?: (message: string, ...args: unknown[]) => void; /** Log a trace message (extremely verbose, per-iteration details) */
+  trace?: (message: string, ...args: unknown[]) => void; /** Log a warning message (potential issues) */
   warn?: (message: string, ...args: unknown[]) => void;
-}
+};
 /**
  * Segmentation options controlling how pages are split.
  *
@@ -677,6 +818,13 @@ interface Logger {
  * };
  */
 type SegmentationOptions = {
+  /**
+   * Dictionary profile for Shamela-style Arabic dictionaries.
+   *
+   * This authoring contract is compiled into internal matchers and merged
+   * with any regular `rules`.
+   */
+  dictionary?: ArabicDictionaryProfile;
   /**
    * Rules applied in order to find split points.
    *
@@ -1070,6 +1218,147 @@ type RepeatingSequencePattern = {
  */
 declare const analyzeRepeatingSequences: (pages: Page[], options?: RepeatingSequenceOptions) => RepeatingSequencePattern[];
 //#endregion
+//#region src/segmentation/pattern-validator.d.ts
+/**
+ * Types of validation issues that can be detected.
+ */
+type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex' | 'invalid_option';
+/**
+ * A validation issue found in a pattern.
+ */
+type ValidationIssue = {
+  type: ValidationIssueType;
+  message: string;
+  suggestion?: string; /** The token name involved in the issue (for unknown_token / missing_braces) */
+  token?: string; /** The specific pattern involved (for duplicate) */
+  pattern?: string;
+};
+/**
+ * Validation result for a single rule, with issues keyed by pattern type.
+ * Arrays parallel the input pattern arrays - undefined means no issue.
+ */
+type RuleValidationResult = {
+  lineStartsWith?: (ValidationIssue | undefined)[];
+  lineStartsAfter?: (ValidationIssue | undefined)[];
+  lineEndsWith?: (ValidationIssue | undefined)[];
+  template?: ValidationIssue;
+  regex?: ValidationIssue;
+  dictionaryEntry?: Partial<Record<keyof DictionaryEntryPatternOptions, ValidationIssue>>;
+};
+/**
+ * Validates split rules for common pattern issues.
+ *
+ * Checks for:
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
+ * - Duplicate patterns within the same rule
+ *
+ * @param rules - Array of split rules to validate
+ * @returns Array parallel to input with validation results (undefined if no issues)
+ *
+ * @example
+ * const issues = validateRules([
+ *   { lineStartsAfter: ['raqms:num'] },  // Missing braces
+ *   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
+ * ]);
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
+ */
+declare const validateRules: (rules: SplitRule[]) => (RuleValidationResult | undefined)[];
+/**
+ * Formats a validation result array into a list of human-readable error messages.
+ *
+ * Useful for displaying validation errors in UIs.
+ *
+ * @param results - The result array from `validateRules()`
+ * @returns Array of formatted error strings
+ *
+ * @example
+ * const issues = validateRules(rules);
+ * const errors = formatValidationReport(issues);
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
+ */
+declare const formatValidationReport: (results: (RuleValidationResult | undefined)[]) => string[];
+//#endregion
+//#region src/analysis/segmentation-advisor.d.ts
+type SegmentationAdvisorMode = 'structured' | 'continuous' | 'mixed';
+type SegmentationAdvisorOptions = {
+  topLineStarts?: number;
+  topRepeatingSequences?: number;
+  minLineStartCount?: number;
+  minRepeatingCount?: number;
+  maxRules?: number;
+  sampleSegments?: number;
+};
+type PreprocessDetections = {
+  ellipsisCount: number;
+  trailingWawCount: number;
+  zeroWidthCount: number;
+};
+type PreprocessSuggestion = {
+  count: number;
+  reason: string;
+  transform: PreprocessTransform;
+};
+type RuleSuggestionSource = 'line-start' | 'repeating-sequence';
+type RuleSuggestionConfidence = 'high' | 'medium' | 'low';
+type SuggestedRule = {
+  confidence: RuleSuggestionConfidence;
+  count: number;
+  example: {
+    pageId: number;
+    text: string;
+  };
+  pattern: string;
+  reason: string;
+  rule: SplitRule;
+  source: RuleSuggestionSource;
+};
+type BreakpointSuggestion = {
+  breakpoints: Breakpoint[];
+  maxPages: number;
+  prefer: 'longer' | 'shorter';
+  reason: string;
+};
+type SegmentationEvaluation = {
+  averageSegmentLength: number;
+  maxSegmentLength: number;
+  multiPageSegments: number;
+  segmentCount: number;
+  validation: SegmentValidationReport;
+};
+type SegmentationSuggestionReport = {
+  assessment: {
+    mode: SegmentationAdvisorMode;
+    reason: string;
+  };
+  breakpointSuggestions: BreakpointSuggestion[];
+  evaluation?: SegmentationEvaluation;
+  lineStarts: CommonLineStartPattern[];
+  optimization: {
+    mergedCount: number;
+    optimizedRuleCount: number;
+    originalRuleCount: number;
+  };
+  preprocess: {
+    detections: PreprocessDetections;
+    suggestions: PreprocessSuggestion[];
+  };
+  recommendedOptions: SegmentationOptions;
+  repeatingSequences: RepeatingSequencePattern[];
+  ruleSuggestions: SuggestedRule[];
+  ruleValidation: RuleValidationResult[];
+  ruleValidationErrors: string[];
+  segmentSamples: Segment[];
+};
+/**
+ * Generate a machine-readable draft segmentation report for AI agents.
+ *
+ * This helper is intentionally deterministic: it inspects pages, drafts
+ * candidate rules, validates them, and evaluates its own recommendation.
+ */
+declare const suggestSegmentationOptions: (pages: Page[], options?: SegmentationAdvisorOptions) => SegmentationSuggestionReport;
+//#endregion
 //#region src/detection.d.ts
 /**
  * Result of detecting a token pattern in text
@@ -1135,6 +1424,113 @@ declare const analyzeTextForRule: (text: string) => {
   detected: DetectedPattern[];
 } | null;
 //#endregion
+//#region src/dictionary/arabic-dictionary-rule.d.ts
+interface ArabicDictionaryEntryRuleOptions extends DictionaryEntryPatternOptions {
+  /**
+   * Suppress page-start matches when the previous page's last Arabic word
+   * is in this stoplist, unless that page ends with strong sentence punctuation.
+   */
+  pageStartPrevWordStoplist?: string[];
+  /**
+   * Suppress non-page-start matches when the immediately previous Arabic word
+   * on the same page is in this stoplist.
+   */
+  samePagePrevWordStoplist?: string[];
+  /**
+   * Static metadata merged into matching segments.
+   */
+  meta?: Record<string, unknown>;
+}
+/**
+ * Creates a reusable split rule for Arabic dictionary entries.
+ *
+ * The returned rule preserves authoring intent as a serializable
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
+ * regex string.
+ *
+ * @example
+ * createArabicDictionaryEntryRule({
+ *   stopWords: ['وقيل', 'ويقال', 'قال'],
+ *   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
+ * })
+ *
+ * @example
+ * createArabicDictionaryEntryRule({
+ *   allowParenthesized: true,
+ *   allowWhitespaceBeforeColon: true,
+ *   allowCommaSeparated: true,
+ *   stopWords: ['الليث', 'العجاج'],
+ * })
+ */
+/**
+ * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
+ * whole-book dictionary segmentation. Keep this helper for advanced single-rule
+ * composition inside a broader `SplitRule[]` pipeline.
+ */
+declare const createArabicDictionaryEntryRule: ({
+  allowCommaSeparated,
+  allowParenthesized,
+  allowWhitespaceBeforeColon,
+  captureName,
+  maxLetters,
+  meta,
+  midLineSubentries,
+  minLetters,
+  pageStartPrevWordStoplist,
+  samePagePrevWordStoplist,
+  stopWords
+}: ArabicDictionaryEntryRuleOptions) => SplitRule;
+//#endregion
+//#region src/dictionary/heading-classifier.d.ts
+type DictionarySurfaceKind = DictionaryHeadingScanClass | 'lineEntry' | 'inlineSubentry' | 'codeLine' | 'pairedForms';
+type DictionarySurfaceMatch = {
+  kind: DictionarySurfaceKind;
+  pageId: number;
+  text: string;
+  lemma?: string;
+  line: number;
+};
+type DictionaryMarkdownPage = {
+  content: string;
+  id: number;
+};
+type DictionarySurfaceReport = {
+  counts: Record<DictionarySurfaceKind, number>;
+  matches: DictionarySurfaceMatch[];
+};
+/**
+ * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
+ */
+declare const classifyDictionaryHeading: (line: string) => DictionaryHeadingScanClass;
+/**
+ * Extracts dictionary surface matches from a markdown page.
+ */
+declare const scanDictionaryMarkdownPage: (page: DictionaryMarkdownPage) => DictionarySurfaceMatch[];
+/**
+ * Aggregates dictionary surface counts across markdown pages.
+ */
+declare const analyzeDictionaryMarkdownPages: (pages: DictionaryMarkdownPage[]) => DictionarySurfaceReport;
+//#endregion
+//#region src/dictionary/profile.d.ts
+declare class DictionaryProfileValidationError extends Error {
+  readonly issues: DictionaryProfileValidationIssue[];
+  constructor(issues: DictionaryProfileValidationIssue[]);
+}
+/**
+ * Validates a dictionary profile without normalizing it.
+ */
+declare const validateDictionaryProfile: (profile: ArabicDictionaryProfile) => DictionaryProfileValidationIssue[];
+//#endregion
+//#region src/dictionary/dictionary-diagnostics.d.ts
+/**
+ * Collects tuning-oriented diagnostics for a dictionary profile without creating
+ * segments. This output is intended for profile authoring workflows rather than
+ * long-term compatibility guarantees.
+ *
+ * This is useful when tuning blockers and family choices for a new dictionary.
+ */
+declare const diagnoseDictionaryProfile: (pages: Page[], profile: ArabicDictionaryProfile, options?: DictionaryProfileDiagnosticsOptions) => DictionaryProfileDiagnostics;
+//#endregion
 //#region src/optimization/optimize-rules.d.ts
 /**
  * Result from optimizing rules.
@@ -1189,58 +1585,6 @@ declare const fixTrailingWaw: (text: string) => string;
  */
 declare const applyPreprocessToPage: (content: string, pageId: number, transforms: PreprocessTransform[]) => string;
 //#endregion
-//#region src/segmentation/arabic-dictionary-rule.d.ts
-interface ArabicDictionaryEntryRuleOptions extends DictionaryEntryPatternOptions {
-  /**
-   * Suppress page-start matches when the previous page's last Arabic word
-   * is in this stoplist, unless that page ends with strong sentence punctuation.
-   */
-  pageStartPrevWordStoplist?: string[];
-  /**
-   * Suppress non-page-start matches when the immediately previous Arabic word
-   * on the same page is in this stoplist.
-   */
-  samePagePrevWordStoplist?: string[];
-  /**
-   * Static metadata merged into matching segments.
-   */
-  meta?: Record<string, unknown>;
-}
-/**
- * Creates a reusable split rule for Arabic dictionary entries.
- *
- * The returned rule preserves authoring intent as a serializable
- * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
- * regex string.
- *
- * @example
- * createArabicDictionaryEntryRule({
- *   stopWords: ['وقيل', 'ويقال', 'قال'],
- *   pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
- * })
- *
- * @example
- * createArabicDictionaryEntryRule({
- *   allowParenthesized: true,
- *   allowWhitespaceBeforeColon: true,
- *   allowCommaSeparated: true,
- *   stopWords: ['الليث', 'العجاج'],
- * })
- */
-declare const createArabicDictionaryEntryRule: ({
-  allowCommaSeparated,
-  allowParenthesized,
-  allowWhitespaceBeforeColon,
-  captureName,
-  maxLetters,
-  meta,
-  midLineSubentries,
-  minLetters,
-  pageStartPrevWordStoplist,
-  samePagePrevWordStoplist,
-  stopWords
-}: ArabicDictionaryEntryRuleOptions) => SplitRule;
-//#endregion
 //#region src/segmentation/breakpoint-utils.d.ts
 /**
  * Escapes regex metacharacters outside of `{{token}}` delimiters.
@@ -1292,68 +1636,6 @@ declare const getDebugReason: (meta: Record<string, any> | undefined, options?:
  */
 declare const getSegmentDebugReason: (segment: Segment, options?: DebugReasonOptions) => string;
 //#endregion
-//#region src/segmentation/pattern-validator.d.ts
-/**
- * Types of validation issues that can be detected.
- */
-type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex' | 'invalid_option';
-/**
- * A validation issue found in a pattern.
- */
-type ValidationIssue = {
-  type: ValidationIssueType;
-  message: string;
-  suggestion?: string; /** The token name involved in the issue (for unknown_token / missing_braces) */
-  token?: string; /** The specific pattern involved (for duplicate) */
-  pattern?: string;
-};
-/**
- * Validation result for a single rule, with issues keyed by pattern type.
- * Arrays parallel the input pattern arrays - undefined means no issue.
- */
-type RuleValidationResult = {
-  lineStartsWith?: (ValidationIssue | undefined)[];
-  lineStartsAfter?: (ValidationIssue | undefined)[];
-  lineEndsWith?: (ValidationIssue | undefined)[];
-  template?: ValidationIssue;
-  regex?: ValidationIssue;
-  dictionaryEntry?: Partial<Record<keyof DictionaryEntryPatternOptions, ValidationIssue>>;
-};
-/**
- * Validates split rules for common pattern issues.
- *
- * Checks for:
- * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
- * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
- * - Duplicate patterns within the same rule
- *
- * @param rules - Array of split rules to validate
- * @returns Array parallel to input with validation results (undefined if no issues)
- *
- * @example
- * const issues = validateRules([
- *   { lineStartsAfter: ['raqms:num'] },  // Missing braces
- *   { lineStartsWith: ['{{unknown}}'] }, // Unknown token
- * ]);
- * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
- * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
- */
-declare const validateRules: (rules: SplitRule[]) => (RuleValidationResult | undefined)[];
-/**
- * Formats a validation result array into a list of human-readable error messages.
- *
- * Useful for displaying validation errors in UIs.
- *
- * @param results - The result array from `validateRules()`
- * @returns Array of formatted error strings
- *
- * @example
- * const issues = validateRules(rules);
- * const errors = formatValidationReport(issues);
- * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
- */
-declare const formatValidationReport: (results: (RuleValidationResult | undefined)[]) => string[];
-//#endregion
 //#region src/segmentation/segmenter.d.ts
 /**
  * Segments pages of content based on pattern-matching rules.
@@ -1420,6 +1702,25 @@ declare const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = "[\u0621-\u063A\u0641-
  * One or more Arabic letters, where each letter may carry combining marks.
  */
 declare const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = "(?:[\u0621-\u063A\u0641-\u064A][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)+";
+declare const BASE_TOKENS: {
+  /** Chapter marker (باب). */readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
+  readonly basmalah: string; /** Bullet point variants: `•`, `*`, `°`. */
+  readonly bullet: "[•*°]"; /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
+  readonly dash: "[-–—ـ]"; /** Section marker (فصل / مسألة). */
+  readonly fasl: string; /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
+  readonly harf: "[أ-ي]"; /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
+  readonly harfs: "[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*(?:\\s+[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)*"; /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
+  readonly hr: "[-–—ـ_=]{5,}"; /** Book marker (كتاب). */
+  readonly kitab: "كتاب"; /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
+  readonly naql: string; /** Newline character. Useful for breakpoints that split on line boundaries. */
+  readonly newline: "\\n"; /** Single ASCII digit (0-9). */
+  readonly num: "\\d"; /** One or more ASCII digits (0-9)+. */
+  readonly nums: "\\d+"; /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
+  readonly raqm: "[\\u0660-\\u0669]"; /** One or more Arabic-Indic digits (٠-٩)+. */
+  readonly raqms: "[\\u0660-\\u0669]+"; /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
+  readonly rumuz: string; /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
+  readonly tarqim: "[.!?؟؛]";
+};
 /** Pre-defined token constants for use in patterns. */
 declare const Token: {
   /** Chapter marker - باب */readonly BAB: "{{bab}}"; /** Basmala - بسم الله */
@@ -1445,12 +1746,18 @@ declare const Token: {
  * Type representing valid token constant keys.
  */
 type TokenKey = keyof typeof Token;
+/** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
+declare const withCapture: (token: string, name: string) => string;
+/** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
+declare const COMPOSITE_TOKENS: {
+  /** Common hadith numbering format: Arabic-Indic digits + dash + space. */readonly numbered: "{{raqms}} {{dash}} ";
+};
+type BaseTokenName = keyof typeof BASE_TOKENS;
+type CompositeTokenName = keyof typeof COMPOSITE_TOKENS;
 /**
  * Type representing valid token pattern names for `getTokenPattern()`.
  */
-type TokenPatternName = keyof typeof TOKEN_PATTERNS;
-/** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
-declare const withCapture: (token: string, name: string) => string;
+type TokenPatternName = BaseTokenName | CompositeTokenName;
 /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
 declare const expandCompositeTokensInTemplate: (template: string) => string;
 /**
@@ -1479,7 +1786,8 @@ declare const expandCompositeTokensInTemplate: (template: string) => string;
  * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
  */
 declare const TOKEN_PATTERNS: {
-  /** Chapter marker (باب). */readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
+  readonly numbered: string; /** Chapter marker (باب). */
+  readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
   readonly basmalah: string; /** Bullet point variants: `•`, `*`, `°`. */
   readonly bullet: "[•*°]"; /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
   readonly dash: "[-–—ـ]"; /** Section marker (فصل / مسألة). */
@@ -1784,5 +2092,5 @@ type ValidationOptions = {
  */
 declare const validateSegments: (pages: Page[], options: SegmentationOptions, segments: Segment[], validationOptions?: ValidationOptions) => SegmentValidationReport;
 //#endregion
-export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type DictionaryEntryPatternOptions, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
+export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type ArabicDictionaryProfile, type Breakpoint, type BreakpointRule, type BreakpointSuggestion, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type DictionaryBlocker, type DictionaryDiagnosticReason, type DictionaryDiagnosticSample, type DictionaryEntryPatternOptions, type DictionaryFamily, type DictionaryFamilyUse, type DictionaryGate, type DictionaryHeadingClass, type DictionaryHeadingScanClass, type DictionaryMarkdownPage, type DictionaryProfileDiagnostics, type DictionaryProfileDiagnosticsOptions, DictionaryProfileValidationError, type DictionaryProfileValidationIssue, type DictionaryProfileValidationIssueCode, type DictionarySegmentKind, type DictionarySegmentMeta, type DictionarySurfaceKind, type DictionarySurfaceMatch, type DictionarySurfaceReport, type DictionaryZone, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessDetections, type PreprocessSuggestion, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleSuggestionConfidence, type RuleSuggestionSource, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationAdvisorMode, type SegmentationAdvisorOptions, type SegmentationEvaluation, type SegmentationOptions, type SegmentationSuggestionReport, type SplitRule, type SuggestedRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
 //# sourceMappingURL=index.d.mts.map