flappa-doormal 2.17.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +16 -39
- package/README.md +100 -62
- package/dist/index.d.mts +210 -75
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1002 -1158
- package/dist/index.mjs.map +1 -1
- package/package.json +9 -9
package/dist/index.d.mts
CHANGED
|
@@ -372,6 +372,45 @@ type RuleConstraints = PageRangeConstraintWithExclude & {
|
|
|
372
372
|
* { lineStartsWith: ['{{naql}}'], fuzzy: true, pageStartGuard: '{{tarqim}}' }
|
|
373
373
|
*/
|
|
374
374
|
pageStartGuard?: string;
|
|
375
|
+
/**
|
|
376
|
+
* Suppress page-start matches when the previous page's last Arabic word
|
|
377
|
+
* is in this stoplist, unless that page ends with strong sentence punctuation.
|
|
378
|
+
*
|
|
379
|
+
* This is useful for dictionary-like content where a page break can split
|
|
380
|
+
* a phrase such as `قال` / `العجاج:` across two pages, causing a false entry
|
|
381
|
+
* start at the top of the next page.
|
|
382
|
+
*
|
|
383
|
+
* Notes:
|
|
384
|
+
* - Applies ONLY at page starts, not to mid-page matches.
|
|
385
|
+
* - Matching is exact after Arabic normalization:
|
|
386
|
+
* diacritics are ignored and common variants like ا/أ/إ/آ are tolerated.
|
|
387
|
+
*
|
|
388
|
+
* @example
|
|
389
|
+
* {
|
|
390
|
+
* regex: '^(?<lemma>[ء-غف-ي]+):',
|
|
391
|
+
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال']
|
|
392
|
+
* }
|
|
393
|
+
*/
|
|
394
|
+
pageStartPrevWordStoplist?: string[];
|
|
395
|
+
/**
|
|
396
|
+
* Suppress matches when the immediately previous Arabic word on the SAME page
|
|
397
|
+
* is in this stoplist.
|
|
398
|
+
*
|
|
399
|
+
* This is useful for dictionary-like content where phrases such as
|
|
400
|
+
* `جلّ وعزّ:` should not be treated as a new entry starting at `وعزّ:`.
|
|
401
|
+
*
|
|
402
|
+
* Notes:
|
|
403
|
+
* - Applies only to non-page-start matches.
|
|
404
|
+
* - Matching is exact after Arabic normalization:
|
|
405
|
+
* diacritics are ignored and common variants like ا/أ/إ/آ are tolerated.
|
|
406
|
+
*
|
|
407
|
+
* @example
|
|
408
|
+
* {
|
|
409
|
+
* regex: '(?<lemma>وعزّ):',
|
|
410
|
+
* samePagePrevWordStoplist: ['جل']
|
|
411
|
+
* }
|
|
412
|
+
*/
|
|
413
|
+
samePagePrevWordStoplist?: string[];
|
|
375
414
|
};
|
|
376
415
|
/**
|
|
377
416
|
* A complete split rule combining pattern, behavior, and constraints.
|
|
@@ -633,7 +672,7 @@ type SegmentationOptions = {
|
|
|
633
672
|
* - `\\n` - Break at line breaks (useful for OCR content)
|
|
634
673
|
* - `''` - Break at page boundary (always works)
|
|
635
674
|
*
|
|
636
|
-
*
|
|
675
|
+
* Applied to segments that exceed `maxPages` or `maxContentLength`.
|
|
637
676
|
*
|
|
638
677
|
* @example
|
|
639
678
|
* // Simple patterns (backward compatible)
|
|
@@ -1082,67 +1121,98 @@ declare const fixTrailingWaw: (text: string) => string;
|
|
|
1082
1121
|
*/
|
|
1083
1122
|
declare const applyPreprocessToPage: (content: string, pageId: number, transforms: PreprocessTransform[]) => string;
|
|
1084
1123
|
//#endregion
|
|
1085
|
-
//#region src/
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1124
|
+
//#region src/segmentation/arabic-dictionary-rule.d.ts
|
|
1125
|
+
interface ArabicDictionaryEntryRuleOptions {
|
|
1126
|
+
/**
|
|
1127
|
+
* Words that should never be treated as lemmas when followed by a colon.
|
|
1128
|
+
*
|
|
1129
|
+
* Matching is Arabic-normalized, diacritic-insensitive, and exact. Callers
|
|
1130
|
+
* should provide canonical forms only; vocalized variants do not need to be
|
|
1131
|
+
* listed separately.
|
|
1132
|
+
*/
|
|
1133
|
+
stopWords: string[];
|
|
1134
|
+
/**
|
|
1135
|
+
* Allow balanced parenthesized headwords like `(عنبر):` or `(عنبر) :`.
|
|
1136
|
+
* @default false
|
|
1137
|
+
*/
|
|
1138
|
+
allowParenthesized?: boolean;
|
|
1139
|
+
/**
|
|
1140
|
+
* Allow optional whitespace before the trailing colon.
|
|
1141
|
+
* @default false
|
|
1142
|
+
*/
|
|
1143
|
+
allowWhitespaceBeforeColon?: boolean;
|
|
1144
|
+
/**
|
|
1145
|
+
* Allow comma-separated headword lists like `سبد، دبس:`.
|
|
1146
|
+
* @default false
|
|
1147
|
+
*/
|
|
1148
|
+
allowCommaSeparated?: boolean;
|
|
1149
|
+
/**
|
|
1150
|
+
* Suppress page-start matches when the previous page's last Arabic word
|
|
1151
|
+
* is in this stoplist, unless that page ends with strong sentence punctuation.
|
|
1152
|
+
*/
|
|
1153
|
+
pageStartPrevWordStoplist?: string[];
|
|
1154
|
+
/**
|
|
1155
|
+
* Suppress non-page-start matches when the immediately previous Arabic word
|
|
1156
|
+
* on the same page is in this stoplist.
|
|
1157
|
+
*/
|
|
1158
|
+
samePagePrevWordStoplist?: string[];
|
|
1159
|
+
/**
|
|
1160
|
+
* Named capture key for the matched lemma.
|
|
1161
|
+
* @default 'lemma'
|
|
1162
|
+
*/
|
|
1163
|
+
captureName?: string;
|
|
1164
|
+
/**
|
|
1165
|
+
* Minimum number of Arabic base letters in a lemma.
|
|
1166
|
+
* @default 2
|
|
1167
|
+
*/
|
|
1168
|
+
minLetters?: number;
|
|
1169
|
+
/**
|
|
1170
|
+
* Maximum number of Arabic base letters in a lemma.
|
|
1171
|
+
* @default 10
|
|
1172
|
+
*/
|
|
1173
|
+
maxLetters?: number;
|
|
1174
|
+
/**
|
|
1175
|
+
* Static metadata merged into matching segments.
|
|
1176
|
+
*/
|
|
1177
|
+
meta?: Record<string, unknown>;
|
|
1178
|
+
}
|
|
1179
|
+
/**
|
|
1180
|
+
* Creates a reusable split rule for Arabic dictionary entries.
|
|
1181
|
+
*
|
|
1182
|
+
* The generated rule:
|
|
1183
|
+
* - keeps the lemma marker in `segment.content`
|
|
1184
|
+
* - stores the lemma in `segment.meta[captureName]`
|
|
1185
|
+
* - matches root entries at true line/page starts
|
|
1186
|
+
* - matches mid-line subentries conservatively when they begin with `و`
|
|
1187
|
+
* - can optionally support parenthesized headwords like `(عنبر) :`
|
|
1188
|
+
* - can optionally support comma-separated headword lists like `سبد، دبس:`
|
|
1189
|
+
*
|
|
1190
|
+
* @example
|
|
1191
|
+
* createArabicDictionaryEntryRule({
|
|
1192
|
+
* stopWords: ['وقيل', 'ويقال', 'قال'],
|
|
1193
|
+
* pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
1194
|
+
* })
|
|
1195
|
+
*
|
|
1196
|
+
* @example
|
|
1197
|
+
* createArabicDictionaryEntryRule({
|
|
1198
|
+
* allowParenthesized: true,
|
|
1199
|
+
* allowWhitespaceBeforeColon: true,
|
|
1200
|
+
* allowCommaSeparated: true,
|
|
1201
|
+
* stopWords: ['الليث', 'العجاج'],
|
|
1202
|
+
* })
|
|
1203
|
+
*/
|
|
1204
|
+
declare const createArabicDictionaryEntryRule: ({
|
|
1205
|
+
allowCommaSeparated,
|
|
1206
|
+
allowParenthesized,
|
|
1207
|
+
allowWhitespaceBeforeColon,
|
|
1208
|
+
captureName,
|
|
1209
|
+
maxLetters,
|
|
1210
|
+
meta,
|
|
1211
|
+
minLetters,
|
|
1212
|
+
pageStartPrevWordStoplist,
|
|
1213
|
+
samePagePrevWordStoplist,
|
|
1214
|
+
stopWords
|
|
1215
|
+
}: ArabicDictionaryEntryRuleOptions) => SplitRule;
|
|
1146
1216
|
//#endregion
|
|
1147
1217
|
//#region src/segmentation/breakpoint-utils.d.ts
|
|
1148
1218
|
/**
|
|
@@ -1172,22 +1242,34 @@ declare const escapeWordsOutsideTokens: (word: string) => string;
|
|
|
1172
1242
|
type PatternProcessor = (pattern: string) => string;
|
|
1173
1243
|
//#endregion
|
|
1174
1244
|
//#region src/segmentation/debug-meta.d.ts
|
|
1245
|
+
/**
|
|
1246
|
+
* Options for formatting the debug reason.
|
|
1247
|
+
*/
|
|
1248
|
+
type DebugReasonOptions = {
|
|
1249
|
+
/**
|
|
1250
|
+
* If true, returns a concise string representation.
|
|
1251
|
+
* e.g. 'Rule: "Chapter"' instead of 'Rule #1 (lineStartsWith) [idx:0] (Matched: "Chapter")'
|
|
1252
|
+
*/
|
|
1253
|
+
concise?: boolean;
|
|
1254
|
+
};
|
|
1175
1255
|
/**
|
|
1176
1256
|
* Helper to format the debug info into a human-readable string.
|
|
1177
1257
|
* @param meta - The segment metadata object
|
|
1258
|
+
* @param options - Formatting options
|
|
1178
1259
|
*/
|
|
1179
|
-
declare const getDebugReason: (meta: Record<string, any> | undefined) => string;
|
|
1260
|
+
declare const getDebugReason: (meta: Record<string, any> | undefined, options?: DebugReasonOptions) => string;
|
|
1180
1261
|
/**
|
|
1181
1262
|
* Convenience helper to get the formatted debug reason directly from a segment.
|
|
1182
1263
|
* @param segment - The segment object
|
|
1264
|
+
* @param options - Formatting options
|
|
1183
1265
|
*/
|
|
1184
|
-
declare const getSegmentDebugReason: (segment: Segment) => string;
|
|
1266
|
+
declare const getSegmentDebugReason: (segment: Segment, options?: DebugReasonOptions) => string;
|
|
1185
1267
|
//#endregion
|
|
1186
1268
|
//#region src/segmentation/pattern-validator.d.ts
|
|
1187
1269
|
/**
|
|
1188
1270
|
* Types of validation issues that can be detected.
|
|
1189
1271
|
*/
|
|
1190
|
-
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern';
|
|
1272
|
+
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex';
|
|
1191
1273
|
/**
|
|
1192
1274
|
* A validation issue found in a pattern.
|
|
1193
1275
|
*/
|
|
@@ -1207,6 +1289,7 @@ type RuleValidationResult = {
|
|
|
1207
1289
|
lineStartsAfter?: (ValidationIssue | undefined)[];
|
|
1208
1290
|
lineEndsWith?: (ValidationIssue | undefined)[];
|
|
1209
1291
|
template?: ValidationIssue;
|
|
1292
|
+
regex?: ValidationIssue;
|
|
1210
1293
|
};
|
|
1211
1294
|
/**
|
|
1212
1295
|
* Validates split rules for common pattern issues.
|
|
@@ -1289,6 +1372,26 @@ declare const formatValidationReport: (results: (RuleValidationResult | undefine
|
|
|
1289
1372
|
declare const segmentPages: (pages: Page[], options: SegmentationOptions) => Segment[];
|
|
1290
1373
|
//#endregion
|
|
1291
1374
|
//#region src/segmentation/tokens.d.ts
|
|
1375
|
+
/**
|
|
1376
|
+
* Arabic base letters used by low-level dictionary-style regex helpers.
|
|
1377
|
+
*
|
|
1378
|
+
* This is intentionally broader than `{{harf}}`:
|
|
1379
|
+
* - includes standalone hamza `ء`
|
|
1380
|
+
* - stays as a raw regex fragment rather than a template token
|
|
1381
|
+
*/
|
|
1382
|
+
declare const ARABIC_BASE_LETTER_CLASS = "[\u0621-\u063A\u0641-\u064A]";
|
|
1383
|
+
/**
|
|
1384
|
+
* Arabic combining marks / annotation signs used by low-level regex helpers.
|
|
1385
|
+
*/
|
|
1386
|
+
declare const ARABIC_MARKS_CLASS = "[\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]";
|
|
1387
|
+
/**
|
|
1388
|
+
* A single Arabic base letter followed by zero or more combining marks.
|
|
1389
|
+
*/
|
|
1390
|
+
declare const ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN = "[\u0621-\u063A\u0641-\u064A][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*";
|
|
1391
|
+
/**
|
|
1392
|
+
* One or more Arabic letters, where each letter may carry combining marks.
|
|
1393
|
+
*/
|
|
1394
|
+
declare const ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN = "(?:[\u0621-\u063A\u0641-\u064A][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)+";
|
|
1292
1395
|
/** Pre-defined token constants for use in patterns. */
|
|
1293
1396
|
declare const Token: {
|
|
1294
1397
|
/** Chapter marker - باب */readonly BAB: "{{bab}}"; /** Basmala - بسم الله */
|
|
@@ -1296,7 +1399,7 @@ declare const Token: {
|
|
|
1296
1399
|
readonly BULLET: "{{bullet}}"; /** Dash variants (hyphen, en-dash, em-dash, tatweel) */
|
|
1297
1400
|
readonly DASH: "{{dash}}"; /** Section marker - فصل / مسألة */
|
|
1298
1401
|
readonly FASL: "{{fasl}}"; /** Single Arabic letter */
|
|
1299
|
-
readonly HARF: "{{harf}}"; /** Multiple Arabic letters separated by spaces */
|
|
1402
|
+
readonly HARF: "{{harf}}"; /** Multiple Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter */
|
|
1300
1403
|
readonly HARFS: "{{harfs}}"; /** Horizontal rule / separator (repeated dashes) */
|
|
1301
1404
|
readonly HR: "{{hr}}"; /** Book marker - كتاب */
|
|
1302
1405
|
readonly KITAB: "{{kitab}}"; /** Hadith transmission phrases */
|
|
@@ -1314,6 +1417,10 @@ declare const Token: {
|
|
|
1314
1417
|
* Type representing valid token constant keys.
|
|
1315
1418
|
*/
|
|
1316
1419
|
type TokenKey = keyof typeof Token;
|
|
1420
|
+
/**
|
|
1421
|
+
* Type representing valid token pattern names for `getTokenPattern()`.
|
|
1422
|
+
*/
|
|
1423
|
+
type TokenPatternName = keyof typeof TOKEN_PATTERNS;
|
|
1317
1424
|
/** Wraps a token constant with a named capture: `{{token}}` → `{{token:name}}`. */
|
|
1318
1425
|
declare const withCapture: (token: string, name: string) => string;
|
|
1319
1426
|
/** Expands composite tokens (e.g. `{{numbered}}`) to their underlying template form. */
|
|
@@ -1343,7 +1450,25 @@ declare const expandCompositeTokensInTemplate: (template: string) => string;
|
|
|
1343
1450
|
* // Using the numbered convenience token
|
|
1344
1451
|
* { lineStartsAfter: ['{{numbered}}'], split: 'at' }
|
|
1345
1452
|
*/
|
|
1346
|
-
declare const TOKEN_PATTERNS:
|
|
1453
|
+
declare const TOKEN_PATTERNS: {
|
|
1454
|
+
/** Chapter marker (باب). */readonly bab: "باب"; /** Basmala (بسم الله). Also matches ﷽. */
|
|
1455
|
+
readonly basmalah: string; /** Bullet point variants: `•`, `*`, `°`. */
|
|
1456
|
+
readonly bullet: "[•*°]"; /** Dash variants: `-` (U+002D), `–` (U+2013), `—` (U+2014), `ـ` (tatweel U+0640). */
|
|
1457
|
+
readonly dash: "[-–—ـ]"; /** Section marker (فصل / مسألة). */
|
|
1458
|
+
readonly fasl: string; /** Single Arabic letter (أ-ي). Does NOT include diacritics. */
|
|
1459
|
+
readonly harf: "[أ-ي]"; /** One or more single Arabic letters separated by spaces, allowing marks/tatweel on each isolated letter (e.g. `د ت س`, `هـ ث`). For multi-letter codes use `{{rumuz}}`. */
|
|
1460
|
+
readonly harfs: "[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*(?:\\s+[أ-غف-ي][\\u0610-\\u061A\\u0640\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*)*"; /** Horizontal rule / separator: 5+ repeated dashes, underscores, equals, or tatweels. Mixed allowed. */
|
|
1461
|
+
readonly hr: "[-–—ـ_=]{5,}"; /** Book marker (كتاب). */
|
|
1462
|
+
readonly kitab: "كتاب"; /** Hadith transmission phrases (حدثنا, أخبرنا, حدثني, etc.). */
|
|
1463
|
+
readonly naql: string; /** Newline character. Useful for breakpoints that split on line boundaries. */
|
|
1464
|
+
readonly newline: "\\n"; /** Single ASCII digit (0-9). */
|
|
1465
|
+
readonly num: "\\d"; /** One or more ASCII digits (0-9)+. */
|
|
1466
|
+
readonly nums: "\\d+"; /** Single Arabic-Indic digit (٠-٩, U+0660-U+0669). */
|
|
1467
|
+
readonly raqm: "[\\u0660-\\u0669]"; /** One or more Arabic-Indic digits (٠-٩)+. */
|
|
1468
|
+
readonly raqms: "[\\u0660-\\u0669]+"; /** Rijāl/takhrīj source abbreviations. Matches one or more codes separated by whitespace. */
|
|
1469
|
+
readonly rumuz: string; /** Arabic/common punctuation: `.`, `!`, `?`, `؟`, `؛`. */
|
|
1470
|
+
readonly tarqim: "[.!?؟؛]";
|
|
1471
|
+
};
|
|
1347
1472
|
/**
|
|
1348
1473
|
* Checks if a query string contains template tokens.
|
|
1349
1474
|
*
|
|
@@ -1475,28 +1600,28 @@ declare const templateToRegex: (template: string) => RegExp | null;
|
|
|
1475
1600
|
* Useful for documentation, validation, or building user interfaces
|
|
1476
1601
|
* that show available tokens.
|
|
1477
1602
|
*
|
|
1478
|
-
* @returns Array of token names (e.g., `['bab', '
|
|
1603
|
+
* @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
|
|
1479
1604
|
*
|
|
1480
1605
|
* @example
|
|
1481
1606
|
* getAvailableTokens()
|
|
1482
|
-
* // → ['bab', '
|
|
1607
|
+
* // → ['bab', 'basmalah', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
|
|
1483
1608
|
*/
|
|
1484
|
-
declare const getAvailableTokens: () =>
|
|
1609
|
+
declare const getAvailableTokens: () => TokenPatternName[];
|
|
1485
1610
|
/**
|
|
1486
1611
|
* Gets the regex pattern for a specific token name.
|
|
1487
1612
|
*
|
|
1488
1613
|
* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
|
|
1489
1614
|
* without any expansion or capture group wrapping.
|
|
1490
1615
|
*
|
|
1491
|
-
* @param tokenName - The token name to look up (e.g., 'raqms'
|
|
1492
|
-
* @returns The regex pattern string
|
|
1616
|
+
* @param tokenName - The token name to look up (e.g., `'raqms'`, `'dash'`, `'harfs'`)
|
|
1617
|
+
* @returns The regex pattern string for that known token
|
|
1493
1618
|
*
|
|
1494
1619
|
* @example
|
|
1495
1620
|
* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
|
|
1496
1621
|
* getTokenPattern('dash') // → '[-–—ـ]'
|
|
1497
|
-
* getTokenPattern('
|
|
1622
|
+
* getTokenPattern('harfs') // → pattern for spaced isolated Arabic letter codes
|
|
1498
1623
|
*/
|
|
1499
|
-
declare const getTokenPattern: (tokenName:
|
|
1624
|
+
declare const getTokenPattern: (tokenName: TokenPatternName) => string;
|
|
1500
1625
|
/**
|
|
1501
1626
|
* Checks if a pattern (or array of patterns) contains tokens that should
|
|
1502
1627
|
* default to fuzzy matching.
|
|
@@ -1594,6 +1719,16 @@ declare const escapeTemplateBrackets: (pattern: string) => string;
|
|
|
1594
1719
|
* escapeRegex('a+b*c?') // → 'a\\+b\\*c\\?'
|
|
1595
1720
|
*/
|
|
1596
1721
|
declare const escapeRegex: (s: string) => string;
|
|
1722
|
+
/**
|
|
1723
|
+
* Normalizes Arabic text for exact comparisons while tolerating common variants.
|
|
1724
|
+
*
|
|
1725
|
+
* This removes Arabic diacritics, collapses whitespace, removes joiners, and
|
|
1726
|
+
* maps common equivalent letters to a shared canonical form:
|
|
1727
|
+
* - ا/آ/أ/إ -> ا
|
|
1728
|
+
* - ة/ه -> ه
|
|
1729
|
+
* - ى/ي -> ي
|
|
1730
|
+
*/
|
|
1731
|
+
declare const normalizeArabicForComparison: (text: string) => string;
|
|
1597
1732
|
declare const makeDiacriticInsensitive: (text: string) => string;
|
|
1598
1733
|
//#endregion
|
|
1599
1734
|
//#region src/validation/validate-segments.d.ts
|
|
@@ -1621,5 +1756,5 @@ type ValidationOptions = {
|
|
|
1621
1756
|
*/
|
|
1622
1757
|
declare const validateSegments: (pages: Page[], options: SegmentationOptions, segments: Segment[], validationOptions?: ValidationOptions) => SegmentValidationReport;
|
|
1623
1758
|
//#endregion
|
|
1624
|
-
export { type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type
|
|
1759
|
+
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
|
|
1625
1760
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/types/breakpoints.ts","../src/types/rules.ts","../src/types/options.ts","../src/types/validation.ts","../src/types/index.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/optimization/optimize-rules.ts","../src/preprocessing/transforms.ts","../src/
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/types/breakpoints.ts","../src/types/rules.ts","../src/types/options.ts","../src/types/validation.ts","../src/types/index.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/optimization/optimize-rules.ts","../src/preprocessing/transforms.ts","../src/segmentation/arabic-dictionary-rule.ts","../src/segmentation/breakpoint-utils.ts","../src/segmentation/debug-meta.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/utils/textUtils.ts","../src/validation/validate-segments.ts"],"mappings":";AAqBA;;;;;;;;;;;;AA2GA;;;;;;;AA3GA,KAAY,cAAA,GAAiB,8BAAA;ECOxB;;;;;AAEI;;;;;AA4BG;EDzBR,OAAA;;;;ACuDc;;;;;AAiCC;;;;;AAwBH;EDhGZ,KAAA;;;;;;;;;;;;;;;;ACmIJ;;;;;AAOA;;;;;AAAgE;;;ED5G5D,KAAA;EC6HA;;;;;AAyBK;;;;;ED1IL,KAAA;ECiKO;;;;;;AA8FX;;;;;;;;;;;;ED3OI,QAAA;AAAA;;;AErFJ;;;;;;;;;AAgCA;;;KFsEY,UAAA,YAAsB,cAAA;;;AA3GlC;;;;;;;;;;;;AA2GA;;;;;;;;AChIiE;;;;;AA8BxD;ADTT,KCOK,YAAA;wEAED,KAAA;AAAA;AA4BQ;;;;;AA8BM;;;;;AAiCC;;;;;AAwBH;;;;;;;;AAvFJ,KAFP,eAAA;EA2GoB,2GAzGrB,QAAA;AAAA;;;;;;AA0HJ;;;;;AAOA;;;;;AAAgE;;;;;;;;;AA0CvD;KA/IJ,qBAAA;oHAED,cAAA;AAAA;;;;;;;;AAkQJ;;;;;;;;;;;;;;;;AChUA;;;;;KD6FK,sBAAA;ECpFD,oHDsFA,eAAA;AAAA;AC/DJ;;;;;AAyBA;;;;;AAsBA;;;;;;;;;AA/CA,KDqFK,mBAAA;EChCC,kFDkCF,YAAA;AAAA;ACRJ;;;;;;;;;;AAAA,KDqBK,WAAA,GACC,YAAA,GACA,eAAA,GACA,qBAAA,GACA,sBAAA,GACA,mBAAA;;;;;;;;;;;ACmBN;;;;;cDFa,iBAAA;;;;;;KAOD,cAAA,WAAyB,iBAAA;;;;;;;KAUhC,aAAA;ECsEa;;;;;;ED/Dd,KAAA;ECiJgC;;;;;AC7VpC;;;EFsNI,UAAA;EEtNsC;AAE1C;;;;;AAMA;;;;;;;EF6NI,KAAA;AAAA;;;;;;;;;KAWC,eAAA,GAAkB,8BAAA;EEzNf;;;;;;;;;AAWR;;EF0NI,IAAA,GAAO,MAAA;EEjNuB;;;;;;;;;;;;;;;AC7BlC;;;;;EHoQI,cAAA;EGhPA;;;;;AAwBJ;;;;;AA2BA;;;;;AAgBA;;;;EHkMI,yBAAA;EG1KQ;;;;;;;;;;;;AC1GZ;;;;;;EJwSI,wBAAA;AAAA;;;;;;;;;;;;;AI1RJ;;;;;AAEA;;;;;;;;;KJuTY,SAAA,GAAY,WAAA,GAAc,aAAA,GAAgB,eAAA;;;;;;;;;;;;;AD1NtD;;;;;;;;AChIiE;;;;KC0BrD,mBAAA,GAAsB,mBAAA;EAC9B,IAAA;ED6BgB;;;;AAER;;;ECvBR,IAAA;AAAA;ADqDc;;;;;AAiCC;;;;;AAwBH;;;;;;;;;;AAzDE,KC9BN,oBAAA,GAAuB,mBAAA;EAC/B,IAAA;AAAA;;;;;ADyHJ;;;;;AAOA;;;;;AAAgE;;;;;;;KCxGpD,kBAAA,GAAqB,mBAAA;EAC7B,IAAA;AAAA;;;;;;;;;;;;ADsQJ;;;;;;;KCjPY,mBAAA,+DAIN,mBAAA,GACA,oBAAA,GACA,kBAAA;;;;;;;;;AArFN;;;;;;;;;AAgCA;;;;;AAyBA;;UAsDiB,MAAA;EAtDgB;EAwD7B,KAAA,IAAS,OAAA,aAAoB,IAAA;EAlCrB;EAoCR,KAAA,IAAS,OAAA,aAAoB,IAAA;;EAE7B,IAAA,IAAQ,OAAA,aAAoB,IAAA;EAjC1B;EAmCF,KAAA,IAAS,OAAA,aAAoB,IAAA;EAlCT;EAoCpB,IAAA,IAAQ,OAAA,aAAoB,IAAA;AAAA;;;;;AAVhC;;;;;;;;;;;;;;;;;;;;;AA6CA;;;;;;;KAAY,mBAAA;EAuKwB;;;;;;;EA/JhC,KAAA,GAAQ,SAAA;EAkCR;;;;;;;;;EAvBA,KAAA;IAoJgC,4DAhJtB,OAAA;IAEA,OAAA,GAAU,KAAA;EAAA;EC/MZ;;;;;AAEZ;;;;;AAMA;;;;EDwNI,QAAA;ECvNM;;;;;;;;;;;EDoON,gBAAA;ECvNI;;;;;;;;;;AAYR;;;;;;;;;;;;;;;;;;EDyOI,WAAA,GAAc,UAAA;EE7PC;;;;;;;EFsQf,MAAA;EEzOa;;AAejB;;;;;AA2BA;;;;;EF6MI,UAAA;EE7L2B;;;;AAwB/B;;;;;;;;;;;;AC1GA;;;;;;;;;;;;;;;EHgTI,MAAA,GAAS,MAAA;EGtSQ;;;;AAIrB;;;;;AAEA;;;;;;;;;;AA0QA;;;;;EHgDI,UAAA,GAAa,mBAAA;AAAA;;;KC7VL,8BAAA;AAAA,KAEA,0BAAA;AAAA,KAMA,sBAAA;EACR,IAAA,EAAM,0BAAA;EACN,QAAA,EAAU,8BAAA;EACV,YAAA;EACA,OAAA;IACI,IAAA;IACA,EAAA;IACA,cAAA;EAAA;EAEJ,QAAA;IACI,IAAA;IACA,EAAA;EAAA;EAEJ,MAAA;IACI,IAAA;IACA,EAAA;EAAA;EAEJ,WAAA;IACI,MAAA;IACA,WAAA;IACA,UAAA;EAAA;EAEJ,QAAA;EACA,IAAA;AAAA;AAAA,KAGQ,uBAAA;EACR,EAAA;EACA,OAAA;IACI,YAAA;IACA,SAAA;IACA,MAAA;IACA,MAAA;IACA,QAAA;EAAA;EAEJ,MAAA,EAAQ,sBAAA;AAAA;;;;AHtBZ;;;;;;;;;;;;AA2GA;KIlHY,OAAA;;;;;;;EAOR,OAAA;EHOa;;;EGFb,IAAA;EH8BC;;;;;AAEO;EGxBR,EAAA;;;;AHsDc;;;;EG7Cd,IAAA,GAAO,MAAA;AAAA;;;;;AHsGK;;;;;;;;KGvFJ,IAAA;EHyGa;;;;;EGnGrB,EAAA;EHmGqB;;AAiBzB;;;;EG5GI,OAAA;AAAA;;;;;AHmH4D;;;;;;KGtGpD,SAAA;;;AHgJH;;;;;;;;;;;;KGhIG,mBAAA;EHqPS;;;;EGhPjB,GAAA;EHgPiE;;;;EG1OjE,GAAA;AAAA;;;;;AFtFJ;;;;;;KEmGY,8BAAA,GAAiC,mBAAA;EF1FrC;;AAuBR;;;;;AAyBA;;;;;AAsBA;;;EEoCI,OAAA,GAAU,SAAA;AAAA;;;KC1HF,wBAAA;EACR,IAAA;EACA,WAAA;EACA,aAAA;EACA,QAAA;EACA,WAAA;EACA,wBAAA;EACA,yBAAA;EACA,MAAA;EACA,UAAA,IAAc,IAAA,UAAc,MAAA;EAC5B,cAAA,GAAiB,MAAA;EACjB,UAAA;AAAA;AAAA,KAGQ,uBAAA;EAA4B,IAAA;EAAc,MAAA;AAAA;AAAA,KAE1C,sBAAA;EACR,OAAA;EACA,KAAA;EACA,QAAA,EAAU,uBAAA;AAAA;;;;cAuQD,uBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,GAAS,wBAAA,KACV,sBAAA;;;KCvRS,wBAAA;EACR,WAAA;EACA,WAAA;EACA,QAAA;EACA,IAAA;EACA,yBAAA;EACA,YAAA;EACA,UAAA;EACA,WAAA;EACA,YAAA;EACA,iBAAA;AAAA;AAAA,KAGQ,wBAAA;EACR,IAAA;EACA,OAAA;EACA,MAAA;EACA,YAAA;AAAA;AAAA,KAGQ,wBAAA;EACR,OAAA;EACA,KAAA;EACA,QAAA,EAAU,wBAAA;AAAA;;ALwCI;;;;;cK6LL,yBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,GAAU,wBAAA,KACX,wBAAA;;;;ANnQH;;KOhBY,eAAA;EPgB+C,6DOdvD,KAAA,UP0BA;EOxBA,KAAA,UPsEA;EOpEA,KAAA,UPoGA;EOlGA,QAAA;AAAA;APmHJ;;;;;;;;AChIiE;;;;;AA8BxD;;ADkGT,cO5Ca,mBAAA,GAAuB,IAAA,aAAY,eAAA;;;AN1BpC;;;;;AA8BM;;;;;AAiCC;cM2BN,wBAAA,GAA4B,IAAA,UAAc,QAAA,EAAU,eAAA;;;;ANHjD;;;cM0BH,oBAAA,GACT,QAAA,EAAU,eAAA;EACT,WAAA;EAAmD,KAAA;EAAgB,QAAA;AAAA;;;;;;;cA+B3D,kBAAA,GACT,IAAA;EAEA,QAAA;EACA,WAAA;EACA,KAAA;EACA,QAAA;EACA,QAAA,EAAU,eAAA;AAAA;;;AP9Ld;;;AAAA,KQbY,cAAA;ERaiB,yDQXzB,KAAA,EAAO,SAAA,IRuCP;EQrCA,WAAA;AAAA;AAAA,cAkCS,aAAA,GAAiB,KAAA,EAAO,SAAA;;SAAA,SAAA;AAAA;;;;;ARkFrC;;;;;cShGa,eAAA,GAAmB,IAAA,UAAc,IAAA;;;ARhCmB;;;;;AA8BxD;;cQkCI,gBAAA,GAAoB,IAAA;;;ARNrB;;;;;AA8BM;cQdL,cAAA,GAAkB,IAAA;;;;AR+CZ;;;;;AAwBH;;;cQ5BH,qBAAA,GAAyB,OAAA,UAAiB,MAAA,UAAgB,UAAA,EAAY,mBAAA;;;UCjHlE,gCAAA;EViBS;;;;;;;EUTtB,SAAA;EVmGA;;;AAiBJ;EU9GI,kBAAA;;;;;EAMA,0BAAA;;ATxB6D;;;ES8B7D,mBAAA;ETAK;AAAA;;;ESML,yBAAA;ETsBQ;AAAA;;;EShBR,wBAAA;ET8Cc;AAAA;;;ESxCd,WAAA;ETyEe;AAAA;;;ESnEf,UAAA;ET2FY;AAAA;;;ESrFZ,UAAA;EToGE;;;ES/FF,IAAA,GAAO,MAAA;AAAA;;;;;;;;;ATmHX;;;;;AAOA;;;;;AAAgE;;;;;;;cS1BnD,+BAAA;EAAmC,mBAAA;EAAA,kBAAA;EAAA,0BAAA;EAAA,WAAA;EAAA,UAAA;EAAA,IAAA;EAAA,UAAA;EAAA,yBAAA;EAAA,wBAAA;EAAA;AAAA,GAW7C,gCAAA,KAAmC,SAAA;;;AV5CtC;;;;;;;;AChIiE;;;;;AA8BxD;;;;;AA4BG;;;;ADsEZ,cWnFa,wBAAA,GAA4B,IAAA;;KA0K7B,gBAAA,IAAoB,OAAA;;;;;;KCnIpB,kBAAA;EX0EP;;;;EWrED,OAAA;AAAA;;;;;;cAyDS,cAAA,GAAkB,IAAA,EAAM,MAAA,2BAAiC,OAAA,GAAU,kBAAA;;;;;;cA4BnE,qBAAA,GAAyB,OAAA,EAAS,OAAA,EAAS,OAAA,GAAU,kBAAA;;;;;;KCjKtD,mBAAA;;;AbmHZ;Ka9GY,eAAA;EACR,IAAA,EAAM,mBAAA;EACN,OAAA;EACA,UAAA;EAEA,KAAA;EAEA,OAAA;AAAA;;;;AZKK;KYEG,oBAAA;EACR,cAAA,IAAkB,eAAA;EAClB,eAAA,IAAmB,eAAA;EACnB,YAAA,IAAgB,eAAA;EAChB,QAAA,GAAW,eAAA;EACX,KAAA,GAAQ,eAAA;AAAA;;;AZmDM;;;;;AAiCC;;;;;AAwBH;;;;;;;cYmDH,aAAA,GAAiB,KAAA,EAAO,SAAA,QAAW,oBAAA;;;;;;;;;;AZhBhD;;;;cYyCa,sBAAA,GAA0B,OAAA,GAAU,oBAAA;;;;;;;;AZ7NgB;;;;;AA8BxD;;;;;AA4BG;;;;;AA8BM;;;;;AAiCC;;;;;AAwBH;;;;;;;;;;;;cauLH,YAAA,GAAgB,KAAA,EAAO,IAAA,IAAQ,OAAA,EAAS,mBAAA,KAAmB,OAAA;;;;AdnTxE;;;;;;ceda,wBAAA;;;;cAKA,kBAAA;;AfoHb;;ce/Ga,yCAAA;;;;cAKA,uCAAA;;cAiHA,KAAA;Ed3GI,oDAEb;EAAA,mCA0BC;EAAA;6BAEO;EAAA,2BA4Bc;EAAA,2BAEtB;EAAA,6BA+BC;EAAA;+BAEc;EAAA,2BAsBK;EAAA,iCAEpB;EAAA,yBAaC;EAAA;6BAEC;EAAA,2BAEA;EAAA,6BACmB;EAAA,6BAJnB;EAAA;;;;;KckBM,QAAA,gBAAwB,KAAA;AdGpC;;;AAAA,KcEY,gBAAA,gBAAgC,cAAA;;cAG/B,WAAA,GAAe,KAAA,UAAe,IAAA;;cAiB9B,+BAAA,GAAmC,QAAA;;;AdfgB;;;;;;;;;AA0CvD;;;;;;;;;;;;AAqHT;;cchGa,cAAA;EdgGW,iDAA8B;EAAA,2BAAe;EAAA,0BAA/B;EAAA,yBAA+B;EAAA;;2KChUtC;EAAA,6BAAsB;EAAA,wBACjD;EAAA,uBAQI;EAAA,yBAuBI;EAAA;yBACJ;EAAA,oCAwBsB;EAAA,sCAAG;EAAA,wBAsBrB;EAAA;;;;;;;;;;;;AAgCZ;;;;;ca4Ja,cAAA,GAAkB,KAAA;;;;;;;KAWnB,YAAA;Eb/JR;;;;;EaqKA,OAAA;EbnKgC;;AAmCpC;;;EauII,YAAA;Eb9GoB;;;;;EaqHpB,WAAA;AAAA;;;;;;;;;;;;;;;;;;;;;AZpUJ;;;;;AAEA;;;;;AAMA;;;;;;cY0ba,wBAAA,GACT,KAAA,UACA,cAAA,IAAkB,OAAA,qBAClB,aAAA;;;;;;;;;;;;;;;;;;;;;AZnaJ;;;cY8ca,YAAA,GAAgB,KAAA;;;;;;;;;;;;;;;AXle7B;;;;;;;cWyfa,eAAA,GAAmB,QAAA,aAAgB,MAAA;;;;AX7chD;;;;;AA2BA;;;;cWuca,kBAAA,QAAyB,gBAAA;AXvbtC;;;;;AAwBA;;;;;;;;;AAxBA,cWuca,eAAA,GAAmB,SAAA,EAAW,gBAAA;;;AVzhB3C;;;;;;;;;;;;cUujBa,oBAAA,GAAwB,QAAA;;;;KAWzB,YAAA;EAAiB,KAAA;EAAe,IAAA;AAAA;AVpjB5C;;;;;AAEA;;;;;;;;;;AA0QA;;;AA5QA,cUwkBa,kBAAA,GAAsB,QAAA,UAAkB,QAAA,EAAU,YAAA;;;;;;;;;;;;;;;AThlB/D;cS6mBa,kBAAA,GAAsB,QAAA;;;;;;AftgBnC;;;;;;;;AChIiE;;;;;AA8BxD;;;;;AA4BG;;cenBC,sBAAA,GAA0B,OAAA;;;AfiDrB;;;;;AAiCC;;;;;AAwBH;cejDH,WAAA,GAAe,CAAA;;;;;;;;;;cAwBf,4BAAA,GAAgC,IAAA;AAAA,cAiBhC,wBAAA,GAA4B,IAAA;;;KCzC7B,iBAAA;EjB3E+C;;;;;EiBiFvD,mBAAA;AAAA;;;AjB0BJ;;;;;;;;AChIiE;;;;cgB0iBpD,gBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,EAAS,mBAAA,EACT,QAAA,EAAU,OAAA,IACV,iBAAA,GAAoB,iBAAA,KACrB,uBAAA"}
|