flappa-doormal 2.19.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -122,6 +122,152 @@ type BreakpointRule = PageRangeConstraintWithExclude & {
122
122
  */
123
123
  type Breakpoint = string | BreakpointRule;
124
124
  //#endregion
125
+ //#region src/types/dictionary.d.ts
126
+ /**
127
+ * Dictionary v2 profile types for Shamela-style Arabic dictionary segmentation.
128
+ */
129
+ type DictionaryHeadingClass = 'chapter' | 'entry' | 'marker' | 'cluster';
130
+ type DictionaryHeadingScanClass = DictionaryHeadingClass | 'noise';
131
+ type DictionarySegmentKind = 'chapter' | 'entry' | 'marker';
132
+ type DictionarySegmentMeta = {
133
+ kind: DictionarySegmentKind;
134
+ lemma?: string;
135
+ };
136
+ /** Family key used by diagnostics and authoring tools. */
137
+ type DictionaryFamilyUse = DictionaryFamily['use'];
138
+ /** Rejection reason emitted by dictionary-profile diagnostics. */
139
+ type DictionaryDiagnosticReason = 'qualifierTail' | 'structuralLeak' | 'intro' | 'authorityIntro' | 'stopLemma' | 'previousWord' | 'previousChar' | 'pageContinuation';
140
+ type DictionaryGate = {
141
+ use: 'headingText';
142
+ match: string;
143
+ fuzzy?: boolean;
144
+ } | {
145
+ use: 'headingToken';
146
+ token: 'bab' | 'fasl' | 'kitab';
147
+ };
148
+ type DictionaryProfileValidationIssueCode = 'invalid_version' | 'missing_zones' | 'duplicate_zone_name' | 'empty_zone_name' | 'empty_zone_families' | 'invalid_zone_page_range' | 'empty_heading_classes' | 'inert_heading_family' | 'empty_inline_prefixes' | 'invalid_gate_match' | 'invalid_gate_fuzzy' | 'duplicate_activate_after_gate' | 'invalid_stop_words' | 'invalid_previous_words' | 'invalid_previous_chars' | 'invalid_previous_word_scope' | 'invalid_authority_intro_precision' | 'invalid_continuation_precision';
149
+ type DictionaryProfileValidationIssue = {
150
+ code: DictionaryProfileValidationIssueCode;
151
+ message: string;
152
+ path: string;
153
+ zoneName?: string;
154
+ };
155
+ type HeadingFamily = {
156
+ use: 'heading';
157
+ classes: DictionaryHeadingClass[];
158
+ emit: DictionarySegmentKind;
159
+ allowNextLineColon?: boolean;
160
+ allowSingleLetter?: boolean;
161
+ };
162
+ type LineEntryFamily = {
163
+ use: 'lineEntry';
164
+ wrappers?: 'none' | 'parentheses' | 'brackets' | 'curly' | 'any';
165
+ allowWhitespaceBeforeColon?: boolean;
166
+ allowMultiWord?: boolean;
167
+ emit: 'entry';
168
+ };
169
+ type InlineSubentryFamily = {
170
+ use: 'inlineSubentry';
171
+ prefixes?: string[];
172
+ stripPrefixesFromLemma?: boolean;
173
+ emit: 'entry';
174
+ };
175
+ type CodeLineFamily = {
176
+ use: 'codeLine';
177
+ wrappers?: 'none' | 'paired' | 'mismatched' | 'either';
178
+ emit: 'marker';
179
+ };
180
+ type PairedFormsFamily = {
181
+ use: 'pairedForms';
182
+ separator?: 'comma' | 'space';
183
+ emit: 'marker' | 'entry';
184
+ requireStatusTail?: boolean;
185
+ };
186
+ type DictionaryFamily = HeadingFamily | LineEntryFamily | InlineSubentryFamily | CodeLineFamily | PairedFormsFamily;
187
+ type PageContinuationBlocker = {
188
+ use: 'pageContinuation';
189
+ appliesTo?: DictionaryFamily['use'][];
190
+ authorityPrecision?: 'high' | 'aggressive';
191
+ };
192
+ type IntroBlocker = {
193
+ use: 'intro';
194
+ appliesTo?: DictionaryFamily['use'][];
195
+ };
196
+ type AuthorityIntroBlocker = {
197
+ use: 'authorityIntro';
198
+ appliesTo?: DictionaryFamily['use'][];
199
+ precision?: 'high' | 'aggressive';
200
+ };
201
+ type StopLemmaBlocker = {
202
+ use: 'stopLemma';
203
+ appliesTo?: DictionaryFamily['use'][];
204
+ words: string[];
205
+ };
206
+ type PreviousWordBlocker = {
207
+ use: 'previousWord';
208
+ appliesTo?: DictionaryFamily['use'][];
209
+ words: string[];
210
+ scope?: 'samePage' | 'pageStart' | 'any';
211
+ };
212
+ type PreviousCharBlocker = {
213
+ use: 'previousChar';
214
+ appliesTo?: DictionaryFamily['use'][];
215
+ chars: string[];
216
+ };
217
+ type DictionaryBlocker = PageContinuationBlocker | IntroBlocker | AuthorityIntroBlocker | StopLemmaBlocker | PreviousWordBlocker | PreviousCharBlocker;
218
+ type DictionaryZone = {
219
+ name: string;
220
+ when?: {
221
+ minPageId?: number;
222
+ maxPageId?: number;
223
+ activateAfter?: DictionaryGate[];
224
+ };
225
+ families: DictionaryFamily[];
226
+ blockers?: DictionaryBlocker[];
227
+ };
228
+ type ArabicDictionaryProfile = {
229
+ version: 2;
230
+ zones: DictionaryZone[];
231
+ };
232
+ /** Sampled accepted or rejected candidate from dictionary-profile diagnostics. */
233
+ type DictionaryDiagnosticSample = {
234
+ accepted: boolean;
235
+ absoluteIndex: number;
236
+ family: DictionaryFamilyUse;
237
+ kind: DictionarySegmentKind;
238
+ lemma?: string;
239
+ line: number;
240
+ pageId: number;
241
+ reason?: DictionaryDiagnosticReason;
242
+ text: string;
243
+ zone: string;
244
+ };
245
+ /** Options for dictionary-profile diagnostics collection. */
246
+ type DictionaryProfileDiagnosticsOptions = {
247
+ sampleLimit?: number;
248
+ };
249
+ /** Aggregate diagnostics for tuning a dictionary profile. */
250
+ type DictionaryProfileDiagnostics = {
251
+ acceptedCount: number;
252
+ acceptedKinds: Record<DictionarySegmentKind, number>;
253
+ rejectionReasons: Record<DictionaryDiagnosticReason, number>;
254
+ familyCounts: Record<DictionaryFamilyUse, {
255
+ accepted: number;
256
+ rejected: number;
257
+ }>;
258
+ pageCount: number;
259
+ rejectedCount: number;
260
+ rejectedLemmas: Array<{
261
+ count: number;
262
+ lemma: string;
263
+ }>;
264
+ samples: DictionaryDiagnosticSample[];
265
+ zoneCounts: Record<string, {
266
+ accepted: number;
267
+ rejected: number;
268
+ }>;
269
+ };
270
+ //#endregion
125
271
  //#region src/types/rules.d.ts
126
272
  /**
127
273
  * Literal regex pattern rule - no token expansion or auto-escaping is applied.
@@ -265,7 +411,7 @@ type LineEndsWithPattern = {
265
411
  * This captures authoring intent in a serializable shape and is compiled into
266
412
  * a regex internally by the rule compiler.
267
413
  */
268
- interface DictionaryEntryPatternOptions {
414
+ type DictionaryEntryPatternOptions = {
269
415
  /**
270
416
  * Words that should never be treated as lemmas when followed by a colon.
271
417
  *
@@ -310,7 +456,7 @@ interface DictionaryEntryPatternOptions {
310
456
  * @default 10
311
457
  */
312
458
  maxLetters?: number;
313
- }
459
+ };
314
460
  /**
315
461
  * Arabic dictionary entry pattern rule - serializable headword matcher compiled internally.
316
462
  *
@@ -632,18 +778,13 @@ type PreprocessTransform = 'removeZeroWidth' | 'condenseEllipsis' | 'fixTrailing
632
778
  * error: (msg, ...args) => myLoggingService.error(msg, args),
633
779
  * };
634
780
  */
635
- interface Logger {
636
- /** Log a debug message (verbose debugging output) */
637
- debug?: (message: string, ...args: unknown[]) => void;
638
- /** Log an error message (critical failures) */
639
- error?: (message: string, ...args: unknown[]) => void;
640
- /** Log an informational message (key progress points) */
641
- info?: (message: string, ...args: unknown[]) => void;
642
- /** Log a trace message (extremely verbose, per-iteration details) */
643
- trace?: (message: string, ...args: unknown[]) => void;
644
- /** Log a warning message (potential issues) */
781
+ type Logger = {
782
+ /** Log a debug message (verbose debugging output) */debug?: (message: string, ...args: unknown[]) => void; /** Log an error message (critical failures) */
783
+ error?: (message: string, ...args: unknown[]) => void; /** Log an informational message (key progress points) */
784
+ info?: (message: string, ...args: unknown[]) => void; /** Log a trace message (extremely verbose, per-iteration details) */
785
+ trace?: (message: string, ...args: unknown[]) => void; /** Log a warning message (potential issues) */
645
786
  warn?: (message: string, ...args: unknown[]) => void;
646
- }
787
+ };
647
788
  /**
648
789
  * Segmentation options controlling how pages are split.
649
790
  *
@@ -677,6 +818,13 @@ interface Logger {
677
818
  * };
678
819
  */
679
820
  type SegmentationOptions = {
821
+ /**
822
+ * Dictionary profile for Shamela-style Arabic dictionaries.
823
+ *
824
+ * This authoring contract is compiled into internal matchers and merged
825
+ * with any regular `rules`.
826
+ */
827
+ dictionary?: ArabicDictionaryProfile;
680
828
  /**
681
829
  * Rules applied in order to find split points.
682
830
  *
@@ -1070,6 +1218,147 @@ type RepeatingSequencePattern = {
1070
1218
  */
1071
1219
  declare const analyzeRepeatingSequences: (pages: Page[], options?: RepeatingSequenceOptions) => RepeatingSequencePattern[];
1072
1220
  //#endregion
1221
+ //#region src/segmentation/pattern-validator.d.ts
1222
+ /**
1223
+ * Types of validation issues that can be detected.
1224
+ */
1225
+ type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex' | 'invalid_option';
1226
+ /**
1227
+ * A validation issue found in a pattern.
1228
+ */
1229
+ type ValidationIssue = {
1230
+ type: ValidationIssueType;
1231
+ message: string;
1232
+ suggestion?: string; /** The token name involved in the issue (for unknown_token / missing_braces) */
1233
+ token?: string; /** The specific pattern involved (for duplicate) */
1234
+ pattern?: string;
1235
+ };
1236
+ /**
1237
+ * Validation result for a single rule, with issues keyed by pattern type.
1238
+ * Arrays parallel the input pattern arrays - undefined means no issue.
1239
+ */
1240
+ type RuleValidationResult = {
1241
+ lineStartsWith?: (ValidationIssue | undefined)[];
1242
+ lineStartsAfter?: (ValidationIssue | undefined)[];
1243
+ lineEndsWith?: (ValidationIssue | undefined)[];
1244
+ template?: ValidationIssue;
1245
+ regex?: ValidationIssue;
1246
+ dictionaryEntry?: Partial<Record<keyof DictionaryEntryPatternOptions, ValidationIssue>>;
1247
+ };
1248
+ /**
1249
+ * Validates split rules for common pattern issues.
1250
+ *
1251
+ * Checks for:
1252
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
1253
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
1254
+ * - Duplicate patterns within the same rule
1255
+ *
1256
+ * @param rules - Array of split rules to validate
1257
+ * @returns Array parallel to input with validation results (undefined if no issues)
1258
+ *
1259
+ * @example
1260
+ * const issues = validateRules([
1261
+ * { lineStartsAfter: ['raqms:num'] }, // Missing braces
1262
+ * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
1263
+ * ]);
1264
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
1265
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
1266
+ */
1267
+ declare const validateRules: (rules: SplitRule[]) => (RuleValidationResult | undefined)[];
1268
+ /**
1269
+ * Formats a validation result array into a list of human-readable error messages.
1270
+ *
1271
+ * Useful for displaying validation errors in UIs.
1272
+ *
1273
+ * @param results - The result array from `validateRules()`
1274
+ * @returns Array of formatted error strings
1275
+ *
1276
+ * @example
1277
+ * const issues = validateRules(rules);
1278
+ * const errors = formatValidationReport(issues);
1279
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
1280
+ */
1281
+ declare const formatValidationReport: (results: (RuleValidationResult | undefined)[]) => string[];
1282
+ //#endregion
1283
+ //#region src/analysis/segmentation-advisor.d.ts
1284
+ type SegmentationAdvisorMode = 'structured' | 'continuous' | 'mixed';
1285
+ type SegmentationAdvisorOptions = {
1286
+ topLineStarts?: number;
1287
+ topRepeatingSequences?: number;
1288
+ minLineStartCount?: number;
1289
+ minRepeatingCount?: number;
1290
+ maxRules?: number;
1291
+ sampleSegments?: number;
1292
+ };
1293
+ type PreprocessDetections = {
1294
+ ellipsisCount: number;
1295
+ trailingWawCount: number;
1296
+ zeroWidthCount: number;
1297
+ };
1298
+ type PreprocessSuggestion = {
1299
+ count: number;
1300
+ reason: string;
1301
+ transform: PreprocessTransform;
1302
+ };
1303
+ type RuleSuggestionSource = 'line-start' | 'repeating-sequence';
1304
+ type RuleSuggestionConfidence = 'high' | 'medium' | 'low';
1305
+ type SuggestedRule = {
1306
+ confidence: RuleSuggestionConfidence;
1307
+ count: number;
1308
+ example: {
1309
+ pageId: number;
1310
+ text: string;
1311
+ };
1312
+ pattern: string;
1313
+ reason: string;
1314
+ rule: SplitRule;
1315
+ source: RuleSuggestionSource;
1316
+ };
1317
+ type BreakpointSuggestion = {
1318
+ breakpoints: Breakpoint[];
1319
+ maxPages: number;
1320
+ prefer: 'longer' | 'shorter';
1321
+ reason: string;
1322
+ };
1323
+ type SegmentationEvaluation = {
1324
+ averageSegmentLength: number;
1325
+ maxSegmentLength: number;
1326
+ multiPageSegments: number;
1327
+ segmentCount: number;
1328
+ validation: SegmentValidationReport;
1329
+ };
1330
+ type SegmentationSuggestionReport = {
1331
+ assessment: {
1332
+ mode: SegmentationAdvisorMode;
1333
+ reason: string;
1334
+ };
1335
+ breakpointSuggestions: BreakpointSuggestion[];
1336
+ evaluation?: SegmentationEvaluation;
1337
+ lineStarts: CommonLineStartPattern[];
1338
+ optimization: {
1339
+ mergedCount: number;
1340
+ optimizedRuleCount: number;
1341
+ originalRuleCount: number;
1342
+ };
1343
+ preprocess: {
1344
+ detections: PreprocessDetections;
1345
+ suggestions: PreprocessSuggestion[];
1346
+ };
1347
+ recommendedOptions: SegmentationOptions;
1348
+ repeatingSequences: RepeatingSequencePattern[];
1349
+ ruleSuggestions: SuggestedRule[];
1350
+ ruleValidation: RuleValidationResult[];
1351
+ ruleValidationErrors: string[];
1352
+ segmentSamples: Segment[];
1353
+ };
1354
+ /**
1355
+ * Generate a machine-readable draft segmentation report for AI agents.
1356
+ *
1357
+ * This helper is intentionally deterministic: it inspects pages, drafts
1358
+ * candidate rules, validates them, and evaluates its own recommendation.
1359
+ */
1360
+ declare const suggestSegmentationOptions: (pages: Page[], options?: SegmentationAdvisorOptions) => SegmentationSuggestionReport;
1361
+ //#endregion
1073
1362
  //#region src/detection.d.ts
1074
1363
  /**
1075
1364
  * Result of detecting a token pattern in text
@@ -1135,6 +1424,113 @@ declare const analyzeTextForRule: (text: string) => {
1135
1424
  detected: DetectedPattern[];
1136
1425
  } | null;
1137
1426
  //#endregion
1427
+ //#region src/dictionary/arabic-dictionary-rule.d.ts
1428
+ interface ArabicDictionaryEntryRuleOptions extends DictionaryEntryPatternOptions {
1429
+ /**
1430
+ * Suppress page-start matches when the previous page's last Arabic word
1431
+ * is in this stoplist, unless that page ends with strong sentence punctuation.
1432
+ */
1433
+ pageStartPrevWordStoplist?: string[];
1434
+ /**
1435
+ * Suppress non-page-start matches when the immediately previous Arabic word
1436
+ * on the same page is in this stoplist.
1437
+ */
1438
+ samePagePrevWordStoplist?: string[];
1439
+ /**
1440
+ * Static metadata merged into matching segments.
1441
+ */
1442
+ meta?: Record<string, unknown>;
1443
+ }
1444
+ /**
1445
+ * Creates a reusable split rule for Arabic dictionary entries.
1446
+ *
1447
+ * The returned rule preserves authoring intent as a serializable
1448
+ * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
1449
+ * regex string.
1450
+ *
1451
+ * @example
1452
+ * createArabicDictionaryEntryRule({
1453
+ * stopWords: ['وقيل', 'ويقال', 'قال'],
1454
+ * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1455
+ * })
1456
+ *
1457
+ * @example
1458
+ * createArabicDictionaryEntryRule({
1459
+ * allowParenthesized: true,
1460
+ * allowWhitespaceBeforeColon: true,
1461
+ * allowCommaSeparated: true,
1462
+ * stopWords: ['الليث', 'العجاج'],
1463
+ * })
1464
+ */
1465
+ /**
1466
+ * @deprecated Prefer the top-level `SegmentationOptions.dictionary` profile for
1467
+ * whole-book dictionary segmentation. Keep this helper for advanced single-rule
1468
+ * composition inside a broader `SplitRule[]` pipeline.
1469
+ */
1470
+ declare const createArabicDictionaryEntryRule: ({
1471
+ allowCommaSeparated,
1472
+ allowParenthesized,
1473
+ allowWhitespaceBeforeColon,
1474
+ captureName,
1475
+ maxLetters,
1476
+ meta,
1477
+ midLineSubentries,
1478
+ minLetters,
1479
+ pageStartPrevWordStoplist,
1480
+ samePagePrevWordStoplist,
1481
+ stopWords
1482
+ }: ArabicDictionaryEntryRuleOptions) => SplitRule;
1483
+ //#endregion
1484
+ //#region src/dictionary/heading-classifier.d.ts
1485
+ type DictionarySurfaceKind = DictionaryHeadingScanClass | 'lineEntry' | 'inlineSubentry' | 'codeLine' | 'pairedForms';
1486
+ type DictionarySurfaceMatch = {
1487
+ kind: DictionarySurfaceKind;
1488
+ pageId: number;
1489
+ text: string;
1490
+ lemma?: string;
1491
+ line: number;
1492
+ };
1493
+ type DictionaryMarkdownPage = {
1494
+ content: string;
1495
+ id: number;
1496
+ };
1497
+ type DictionarySurfaceReport = {
1498
+ counts: Record<DictionarySurfaceKind, number>;
1499
+ matches: DictionarySurfaceMatch[];
1500
+ };
1501
+ /**
1502
+ * Classifies a markdown heading line produced by `convertContentToMarkdown()`.
1503
+ */
1504
+ declare const classifyDictionaryHeading: (line: string) => DictionaryHeadingScanClass;
1505
+ /**
1506
+ * Extracts dictionary surface matches from a markdown page.
1507
+ */
1508
+ declare const scanDictionaryMarkdownPage: (page: DictionaryMarkdownPage) => DictionarySurfaceMatch[];
1509
+ /**
1510
+ * Aggregates dictionary surface counts across markdown pages.
1511
+ */
1512
+ declare const analyzeDictionaryMarkdownPages: (pages: DictionaryMarkdownPage[]) => DictionarySurfaceReport;
1513
+ //#endregion
1514
+ //#region src/dictionary/profile.d.ts
1515
+ declare class DictionaryProfileValidationError extends Error {
1516
+ readonly issues: DictionaryProfileValidationIssue[];
1517
+ constructor(issues: DictionaryProfileValidationIssue[]);
1518
+ }
1519
+ /**
1520
+ * Validates a dictionary profile without normalizing it.
1521
+ */
1522
+ declare const validateDictionaryProfile: (profile: ArabicDictionaryProfile) => DictionaryProfileValidationIssue[];
1523
+ //#endregion
1524
+ //#region src/dictionary/dictionary-diagnostics.d.ts
1525
+ /**
1526
+ * Collects tuning-oriented diagnostics for a dictionary profile without creating
1527
+ * segments. This output is intended for profile authoring workflows rather than
1528
+ * long-term compatibility guarantees.
1529
+ *
1530
+ * This is useful when tuning blockers and family choices for a new dictionary.
1531
+ */
1532
+ declare const diagnoseDictionaryProfile: (pages: Page[], profile: ArabicDictionaryProfile, options?: DictionaryProfileDiagnosticsOptions) => DictionaryProfileDiagnostics;
1533
+ //#endregion
1138
1534
  //#region src/optimization/optimize-rules.d.ts
1139
1535
  /**
1140
1536
  * Result from optimizing rules.
@@ -1189,58 +1585,6 @@ declare const fixTrailingWaw: (text: string) => string;
1189
1585
  */
1190
1586
  declare const applyPreprocessToPage: (content: string, pageId: number, transforms: PreprocessTransform[]) => string;
1191
1587
  //#endregion
1192
- //#region src/segmentation/arabic-dictionary-rule.d.ts
1193
- interface ArabicDictionaryEntryRuleOptions extends DictionaryEntryPatternOptions {
1194
- /**
1195
- * Suppress page-start matches when the previous page's last Arabic word
1196
- * is in this stoplist, unless that page ends with strong sentence punctuation.
1197
- */
1198
- pageStartPrevWordStoplist?: string[];
1199
- /**
1200
- * Suppress non-page-start matches when the immediately previous Arabic word
1201
- * on the same page is in this stoplist.
1202
- */
1203
- samePagePrevWordStoplist?: string[];
1204
- /**
1205
- * Static metadata merged into matching segments.
1206
- */
1207
- meta?: Record<string, unknown>;
1208
- }
1209
- /**
1210
- * Creates a reusable split rule for Arabic dictionary entries.
1211
- *
1212
- * The returned rule preserves authoring intent as a serializable
1213
- * `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
1214
- * regex string.
1215
- *
1216
- * @example
1217
- * createArabicDictionaryEntryRule({
1218
- * stopWords: ['وقيل', 'ويقال', 'قال'],
1219
- * pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
1220
- * })
1221
- *
1222
- * @example
1223
- * createArabicDictionaryEntryRule({
1224
- * allowParenthesized: true,
1225
- * allowWhitespaceBeforeColon: true,
1226
- * allowCommaSeparated: true,
1227
- * stopWords: ['الليث', 'العجاج'],
1228
- * })
1229
- */
1230
- declare const createArabicDictionaryEntryRule: ({
1231
- allowCommaSeparated,
1232
- allowParenthesized,
1233
- allowWhitespaceBeforeColon,
1234
- captureName,
1235
- maxLetters,
1236
- meta,
1237
- midLineSubentries,
1238
- minLetters,
1239
- pageStartPrevWordStoplist,
1240
- samePagePrevWordStoplist,
1241
- stopWords
1242
- }: ArabicDictionaryEntryRuleOptions) => SplitRule;
1243
- //#endregion
1244
1588
  //#region src/segmentation/breakpoint-utils.d.ts
1245
1589
  /**
1246
1590
  * Escapes regex metacharacters outside of `{{token}}` delimiters.
@@ -1292,68 +1636,6 @@ declare const getDebugReason: (meta: Record<string, any> | undefined, options?:
1292
1636
  */
1293
1637
  declare const getSegmentDebugReason: (segment: Segment, options?: DebugReasonOptions) => string;
1294
1638
  //#endregion
1295
- //#region src/segmentation/pattern-validator.d.ts
1296
- /**
1297
- * Types of validation issues that can be detected.
1298
- */
1299
- type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex' | 'invalid_option';
1300
- /**
1301
- * A validation issue found in a pattern.
1302
- */
1303
- type ValidationIssue = {
1304
- type: ValidationIssueType;
1305
- message: string;
1306
- suggestion?: string; /** The token name involved in the issue (for unknown_token / missing_braces) */
1307
- token?: string; /** The specific pattern involved (for duplicate) */
1308
- pattern?: string;
1309
- };
1310
- /**
1311
- * Validation result for a single rule, with issues keyed by pattern type.
1312
- * Arrays parallel the input pattern arrays - undefined means no issue.
1313
- */
1314
- type RuleValidationResult = {
1315
- lineStartsWith?: (ValidationIssue | undefined)[];
1316
- lineStartsAfter?: (ValidationIssue | undefined)[];
1317
- lineEndsWith?: (ValidationIssue | undefined)[];
1318
- template?: ValidationIssue;
1319
- regex?: ValidationIssue;
1320
- dictionaryEntry?: Partial<Record<keyof DictionaryEntryPatternOptions, ValidationIssue>>;
1321
- };
1322
- /**
1323
- * Validates split rules for common pattern issues.
1324
- *
1325
- * Checks for:
1326
- * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
1327
- * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
1328
- * - Duplicate patterns within the same rule
1329
- *
1330
- * @param rules - Array of split rules to validate
1331
- * @returns Array parallel to input with validation results (undefined if no issues)
1332
- *
1333
- * @example
1334
- * const issues = validateRules([
1335
- * { lineStartsAfter: ['raqms:num'] }, // Missing braces
1336
- * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
1337
- * ]);
1338
- * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
1339
- * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
1340
- */
1341
- declare const validateRules: (rules: SplitRule[]) => (RuleValidationResult | undefined)[];
1342
- /**
1343
- * Formats a validation result array into a list of human-readable error messages.
1344
- *
1345
- * Useful for displaying validation errors in UIs.
1346
- *
1347
- * @param results - The result array from `validateRules()`
1348
- * @returns Array of formatted error strings
1349
- *
1350
- * @example
1351
- * const issues = validateRules(rules);
1352
- * const errors = formatValidationReport(issues);
1353
- * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
1354
- */
1355
- declare const formatValidationReport: (results: (RuleValidationResult | undefined)[]) => string[];
1356
- //#endregion
1357
1639
  //#region src/segmentation/segmenter.d.ts
1358
1640
  /**
1359
1641
  * Segments pages of content based on pattern-matching rules.
@@ -1420,6 +1702,25 @@ declare const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = "[\u0621-\u063A\u0641-
1420
1702
  * One or more Arabic letters, where each letter may carry combining marks.
1421
1703
  */
1422
1704
  declare const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = "(?:[\u0621-\u063A\u0641-\u064A][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)+";
1705
+ declare const BASE_TOKENS: {
1706
+ /** Chapter marker (باب). */readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
1707
+ readonly basmalah: string; /** Bullet point variants: `•`, `*`, `°`. */
1708
+ readonly bullet: "[•*°]"; /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
1709
+ readonly dash: "[-–—ـ]"; /** Section marker (فصل / مسألة). */
1710
+ readonly fasl: string; /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
1711
+ readonly harf: "[أ-ي]"; /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
1712
+ readonly harfs: "[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*(?:\\s+[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)*"; /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
1713
+ readonly hr: "[-–—ـ_=]{5,}"; /** Book marker (كتاب). */
1714
+ readonly kitab: "كتاب"; /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
1715
+ readonly naql: string; /** Newline character. Useful for breakpoints that split on line boundaries. */
1716
+ readonly newline: "\\n"; /** Single ASCII digit (0-9). */
1717
+ readonly num: "\\d"; /** One or more ASCII digits (0-9)+. */
1718
+ readonly nums: "\\d+"; /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
1719
+ readonly raqm: "[\\u0660-\\u0669]"; /** One or more Arabic-Indic digits (٠-٩)+. */
1720
+ readonly raqms: "[\\u0660-\\u0669]+"; /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
1721
+ readonly rumuz: string; /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
1722
+ readonly tarqim: "[.!?؟؛]";
1723
+ };
1423
1724
  /** Pre-defined token constants for use in patterns. */
1424
1725
  declare const Token: {
1425
1726
  /** Chapter marker - باب */readonly BAB: "{{bab}}"; /** Basmala - بسم الله */
@@ -1445,12 +1746,18 @@ declare const Token: {
1445
1746
  * Type representing valid token constant keys.
1446
1747
  */
1447
1748
  type TokenKey = keyof typeof Token;
1749
+ /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
1750
+ declare const withCapture: (token: string, name: string) => string;
1751
+ /** Composite tokens that reference base tokens. Pre-expanded at load time. @internal */
1752
+ declare const COMPOSITE_TOKENS: {
1753
+ /** Common hadith numbering format: Arabic-Indic digits + dash + space. */readonly numbered: "{{raqms}} {{dash}} ";
1754
+ };
1755
+ type BaseTokenName = keyof typeof BASE_TOKENS;
1756
+ type CompositeTokenName = keyof typeof COMPOSITE_TOKENS;
1448
1757
  /**
1449
1758
  * Type representing valid token pattern names for `getTokenPattern()`.
1450
1759
  */
1451
- type TokenPatternName = keyof typeof TOKEN_PATTERNS;
1452
- /** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
1453
- declare const withCapture: (token: string, name: string) => string;
1760
+ type TokenPatternName = BaseTokenName | CompositeTokenName;
1454
1761
  /** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
1455
1762
  declare const expandCompositeTokensInTemplate: (template: string) => string;
1456
1763
  /**
@@ -1479,7 +1786,8 @@ declare const expandCompositeTokensInTemplate: (template: string) => string;
1479
1786
  * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
1480
1787
  */
1481
1788
  declare const TOKEN_PATTERNS: {
1482
- /** Chapter marker (باب). */readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
1789
+ readonly numbered: string; /** Chapter marker (باب). */
1790
+ readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
1483
1791
  readonly basmalah: string; /** Bullet point variants: `•`, `*`, `°`. */
1484
1792
  readonly bullet: "[•*°]"; /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
1485
1793
  readonly dash: "[-–—ـ]"; /** Section marker (فصل / مسألة). */
@@ -1784,5 +2092,5 @@ type ValidationOptions = {
1784
2092
  */
1785
2093
  declare const validateSegments: (pages: Page[], options: SegmentationOptions, segments: Segment[], validationOptions?: ValidationOptions) => SegmentValidationReport;
1786
2094
  //#endregion
1787
- export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type DictionaryEntryPatternOptions, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
2095
+ export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type ArabicDictionaryProfile, type Breakpoint, type BreakpointRule, type BreakpointSuggestion, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type DictionaryBlocker, type DictionaryDiagnosticReason, type DictionaryDiagnosticSample, type DictionaryEntryPatternOptions, type DictionaryFamily, type DictionaryFamilyUse, type DictionaryGate, type DictionaryHeadingClass, type DictionaryHeadingScanClass, type DictionaryMarkdownPage, type DictionaryProfileDiagnostics, type DictionaryProfileDiagnosticsOptions, DictionaryProfileValidationError, type DictionaryProfileValidationIssue, type DictionaryProfileValidationIssueCode, type DictionarySegmentKind, type DictionarySegmentMeta, type DictionarySurfaceKind, type DictionarySurfaceMatch, type DictionarySurfaceReport, type DictionaryZone, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessDetections, type PreprocessSuggestion, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleSuggestionConfidence, type RuleSuggestionSource, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationAdvisorMode, type SegmentationAdvisorOptions, type SegmentationEvaluation, type SegmentationOptions, type SegmentationSuggestionReport, type SplitRule, type SuggestedRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeDictionaryMarkdownPages, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, classifyDictionaryHeading, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, diagnoseDictionaryProfile, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, scanDictionaryMarkdownPage, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, suggestSegmentationOptions, templateToRegex, validateDictionaryProfile, validateRules, validateSegments, withCapture };
1788
2096
  //# sourceMappingURL=index.d.mts.map