flappa-doormal 2.18.0 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -2
- package/dist/index.d.mts +79 -51
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +98 -39
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -285,6 +285,7 @@ const rules = [{
|
|
|
285
285
|
| `lineEndsWith` | ✅ Included | Match patterns at end of line |
|
|
286
286
|
| `template` | Depends | Custom pattern with full control |
|
|
287
287
|
| `regex` | Depends | Raw regex for complex cases |
|
|
288
|
+
| `dictionaryEntry` | ✅ Included | Serializable Arabic dictionary headword rule |
|
|
288
289
|
|
|
289
290
|
#### Building UIs with Pattern Type Keys
|
|
290
291
|
|
|
@@ -293,7 +294,7 @@ The library exports `PATTERN_TYPE_KEYS` (a const array) and `PatternTypeKey` (a
|
|
|
293
294
|
```typescript
|
|
294
295
|
import { PATTERN_TYPE_KEYS, type PatternTypeKey } from 'flappa-doormal';
|
|
295
296
|
|
|
296
|
-
// PATTERN_TYPE_KEYS = ['lineStartsWith', 'lineStartsAfter', 'lineEndsWith', 'template', 'regex']
|
|
297
|
+
// PATTERN_TYPE_KEYS = ['lineStartsWith', 'lineStartsAfter', 'lineEndsWith', 'template', 'regex', 'dictionaryEntry']
|
|
297
298
|
|
|
298
299
|
// Build a dropdown/select
|
|
299
300
|
PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
|
|
@@ -351,7 +352,9 @@ the stoplist guard is skipped and the page-start match is allowed.
|
|
|
351
352
|
#### Arabic Dictionary Helper
|
|
352
353
|
|
|
353
354
|
Use `createArabicDictionaryEntryRule()` to build a conservative rule for Arabic
|
|
354
|
-
dictionaries with lemma capture, stopword filtering, and page-wrap protection
|
|
355
|
+
dictionaries with lemma capture, stopword filtering, and page-wrap protection.
|
|
356
|
+
The helper now returns a serializable native `dictionaryEntry` rule rather than
|
|
357
|
+
an eagerly-compiled regex blob:
|
|
355
358
|
|
|
356
359
|
```typescript
|
|
357
360
|
import { createArabicDictionaryEntryRule, segmentPages } from 'flappa-doormal';
|
|
@@ -364,16 +367,35 @@ const rule = createArabicDictionaryEntryRule({
|
|
|
364
367
|
allowParenthesized: true, // e.g. (عنبر) :
|
|
365
368
|
allowWhitespaceBeforeColon: true, // e.g. عنبر :
|
|
366
369
|
allowCommaSeparated: true, // e.g. سبد، دبس:
|
|
370
|
+
midLineSubentries: false, // line/page starts only
|
|
367
371
|
});
|
|
368
372
|
|
|
369
373
|
const segments = segmentPages(pages, { rules: [rule] });
|
|
370
374
|
```
|
|
371
375
|
|
|
376
|
+
Equivalent direct JSON-authored rule:
|
|
377
|
+
|
|
378
|
+
```typescript
|
|
379
|
+
const rule = {
|
|
380
|
+
dictionaryEntry: {
|
|
381
|
+
stopWords: ['وقيل', 'ويقال', 'قال', 'العجاج', 'أخاك'],
|
|
382
|
+
allowParenthesized: true,
|
|
383
|
+
allowWhitespaceBeforeColon: true,
|
|
384
|
+
allowCommaSeparated: true,
|
|
385
|
+
midLineSubentries: false,
|
|
386
|
+
},
|
|
387
|
+
pageStartPrevWordStoplist: ['قال', 'وقيل', 'ويقال'],
|
|
388
|
+
samePagePrevWordStoplist: ['جل'],
|
|
389
|
+
meta: { type: 'entry' },
|
|
390
|
+
};
|
|
391
|
+
```
|
|
392
|
+
|
|
372
393
|
Behavior:
|
|
373
394
|
- Keeps the lemma marker in `segment.content`
|
|
374
395
|
- Stores the matched lemma in `segment.meta.lemma`
|
|
375
396
|
- Matches root entries at true line/page starts like `عز:` and `لع:`
|
|
376
397
|
- Matches mid-line subentries conservatively when they begin with `و`
|
|
398
|
+
- Supports disabling mid-line subentries entirely with `midLineSubentries: false`
|
|
377
399
|
- Can match parenthesized headwords like `(عنبر) :` when enabled
|
|
378
400
|
- Can match comma-separated headword lists like `سبد، دبس:` when enabled
|
|
379
401
|
- Can suppress same-page false positives like `جلّ وعزّ:` with `samePagePrevWordStoplist`
|
package/dist/index.d.mts
CHANGED
|
@@ -259,6 +259,73 @@ type LineStartsAfterPattern = {
|
|
|
259
259
|
type LineEndsWithPattern = {
|
|
260
260
|
/** Array of patterns that mark line endings. Brackets `()[]` are auto-escaped. */lineEndsWith: string[];
|
|
261
261
|
};
|
|
262
|
+
/**
|
|
263
|
+
* Dictionary entry pattern options for Arabic lexicon-style headword matching.
|
|
264
|
+
*
|
|
265
|
+
* This captures authoring intent in a serializable shape and is compiled into
|
|
266
|
+
* a regex internally by the rule compiler.
|
|
267
|
+
*/
|
|
268
|
+
interface DictionaryEntryPatternOptions {
|
|
269
|
+
/**
|
|
270
|
+
* Words that should never be treated as lemmas when followed by a colon.
|
|
271
|
+
*
|
|
272
|
+
* Matching is Arabic-normalized, diacritic-insensitive, and exact. Callers
|
|
273
|
+
* should provide canonical forms only; vocalized variants do not need to be
|
|
274
|
+
* listed separately.
|
|
275
|
+
*/
|
|
276
|
+
stopWords: string[];
|
|
277
|
+
/**
|
|
278
|
+
* Allow balanced parenthesized headwords like `(عنبر):` or `(عنبر) :`.
|
|
279
|
+
* @default false
|
|
280
|
+
*/
|
|
281
|
+
allowParenthesized?: boolean;
|
|
282
|
+
/**
|
|
283
|
+
* Allow optional whitespace before the trailing colon.
|
|
284
|
+
* @default false
|
|
285
|
+
*/
|
|
286
|
+
allowWhitespaceBeforeColon?: boolean;
|
|
287
|
+
/**
|
|
288
|
+
* Allow comma-separated headword lists like `سبد، دبس:`.
|
|
289
|
+
* @default false
|
|
290
|
+
*/
|
|
291
|
+
allowCommaSeparated?: boolean;
|
|
292
|
+
/**
|
|
293
|
+
* Allow conservative mid-line subentries that begin with `و`.
|
|
294
|
+
* Disable this when the rule should only split true line/page starts.
|
|
295
|
+
* @default true
|
|
296
|
+
*/
|
|
297
|
+
midLineSubentries?: boolean;
|
|
298
|
+
/**
|
|
299
|
+
* Named capture key for the matched lemma metadata.
|
|
300
|
+
* @default 'lemma'
|
|
301
|
+
*/
|
|
302
|
+
captureName?: string;
|
|
303
|
+
/**
|
|
304
|
+
* Minimum number of Arabic base letters in a lemma.
|
|
305
|
+
* @default 2
|
|
306
|
+
*/
|
|
307
|
+
minLetters?: number;
|
|
308
|
+
/**
|
|
309
|
+
* Maximum number of Arabic base letters in a lemma.
|
|
310
|
+
* @default 10
|
|
311
|
+
*/
|
|
312
|
+
maxLetters?: number;
|
|
313
|
+
}
|
|
314
|
+
/**
|
|
315
|
+
* Arabic dictionary entry pattern rule - serializable headword matcher compiled internally.
|
|
316
|
+
*
|
|
317
|
+
* @example
|
|
318
|
+
* {
|
|
319
|
+
* dictionaryEntry: {
|
|
320
|
+
* stopWords: ['قال', 'وقيل'],
|
|
321
|
+
* allowCommaSeparated: true,
|
|
322
|
+
* },
|
|
323
|
+
* meta: { type: 'entry' }
|
|
324
|
+
* }
|
|
325
|
+
*/
|
|
326
|
+
type DictionaryEntryPattern = {
|
|
327
|
+
dictionaryEntry: DictionaryEntryPatternOptions;
|
|
328
|
+
};
|
|
262
329
|
/**
|
|
263
330
|
* Union of all pattern types for split rules.
|
|
264
331
|
*
|
|
@@ -268,8 +335,9 @@ type LineEndsWithPattern = {
|
|
|
268
335
|
* - `lineStartsWith` - Match line beginnings (marker included)
|
|
269
336
|
* - `lineStartsAfter` - Match line beginnings (marker excluded)
|
|
270
337
|
* - `lineEndsWith` - Match line endings
|
|
338
|
+
* - `dictionaryEntry` - Arabic dictionary headword matching
|
|
271
339
|
*/
|
|
272
|
-
type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern;
|
|
340
|
+
type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | LineStartsAfterPattern | LineEndsWithPattern | DictionaryEntryPattern;
|
|
273
341
|
/**
|
|
274
342
|
* Pattern type key names for split rules.
|
|
275
343
|
*
|
|
@@ -285,7 +353,7 @@ type PatternType = RegexPattern | TemplatePattern | LineStartsWithPattern | Line
|
|
|
285
353
|
* const validateKey = (k: string): k is PatternTypeKey =>
|
|
286
354
|
* (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
|
|
287
355
|
*/
|
|
288
|
-
declare const PATTERN_TYPE_KEYS: readonly ["lineStartsWith", "lineStartsAfter", "lineEndsWith", "template", "regex"];
|
|
356
|
+
declare const PATTERN_TYPE_KEYS: readonly ["lineStartsWith", "lineStartsAfter", "lineEndsWith", "template", "regex", "dictionaryEntry"];
|
|
289
357
|
/**
|
|
290
358
|
* String union of pattern type key names.
|
|
291
359
|
*
|
|
@@ -417,7 +485,7 @@ type RuleConstraints = PageRangeConstraintWithExclude & {
|
|
|
417
485
|
*
|
|
418
486
|
* Each rule must specify:
|
|
419
487
|
* - **Pattern** (exactly one): `regex`, `template`, `lineStartsWith`,
|
|
420
|
-
* `lineStartsAfter`, or `
|
|
488
|
+
* `lineStartsAfter`, `lineEndsWith`, or `dictionaryEntry`
|
|
421
489
|
* - **Split behavior**: `split` (optional, defaults to `'at'`), `occurrence`, `fuzzy`
|
|
422
490
|
* - **Constraints** (optional): `min`, `max`, `meta`
|
|
423
491
|
*
|
|
@@ -1122,30 +1190,7 @@ declare const fixTrailingWaw: (text: string) => string;
|
|
|
1122
1190
|
declare const applyPreprocessToPage: (content: string, pageId: number, transforms: PreprocessTransform[]) => string;
|
|
1123
1191
|
//#endregion
|
|
1124
1192
|
//#region src/segmentation/arabic-dictionary-rule.d.ts
|
|
1125
|
-
interface ArabicDictionaryEntryRuleOptions {
|
|
1126
|
-
/**
|
|
1127
|
-
* Words that should never be treated as lemmas when followed by a colon.
|
|
1128
|
-
*
|
|
1129
|
-
* Matching is Arabic-normalized, diacritic-insensitive, and exact. Callers
|
|
1130
|
-
* should provide canonical forms only; vocalized variants do not need to be
|
|
1131
|
-
* listed separately.
|
|
1132
|
-
*/
|
|
1133
|
-
stopWords: string[];
|
|
1134
|
-
/**
|
|
1135
|
-
* Allow balanced parenthesized headwords like `(عنبر):` or `(عنبر) :`.
|
|
1136
|
-
* @default false
|
|
1137
|
-
*/
|
|
1138
|
-
allowParenthesized?: boolean;
|
|
1139
|
-
/**
|
|
1140
|
-
* Allow optional whitespace before the trailing colon.
|
|
1141
|
-
* @default false
|
|
1142
|
-
*/
|
|
1143
|
-
allowWhitespaceBeforeColon?: boolean;
|
|
1144
|
-
/**
|
|
1145
|
-
* Allow comma-separated headword lists like `سبد، دبس:`.
|
|
1146
|
-
* @default false
|
|
1147
|
-
*/
|
|
1148
|
-
allowCommaSeparated?: boolean;
|
|
1193
|
+
interface ArabicDictionaryEntryRuleOptions extends DictionaryEntryPatternOptions {
|
|
1149
1194
|
/**
|
|
1150
1195
|
* Suppress page-start matches when the previous page's last Arabic word
|
|
1151
1196
|
* is in this stoplist, unless that page ends with strong sentence punctuation.
|
|
@@ -1156,21 +1201,6 @@ interface ArabicDictionaryEntryRuleOptions {
|
|
|
1156
1201
|
* on the same page is in this stoplist.
|
|
1157
1202
|
*/
|
|
1158
1203
|
samePagePrevWordStoplist?: string[];
|
|
1159
|
-
/**
|
|
1160
|
-
* Named capture key for the matched lemma.
|
|
1161
|
-
* @default 'lemma'
|
|
1162
|
-
*/
|
|
1163
|
-
captureName?: string;
|
|
1164
|
-
/**
|
|
1165
|
-
* Minimum number of Arabic base letters in a lemma.
|
|
1166
|
-
* @default 2
|
|
1167
|
-
*/
|
|
1168
|
-
minLetters?: number;
|
|
1169
|
-
/**
|
|
1170
|
-
* Maximum number of Arabic base letters in a lemma.
|
|
1171
|
-
* @default 10
|
|
1172
|
-
*/
|
|
1173
|
-
maxLetters?: number;
|
|
1174
1204
|
/**
|
|
1175
1205
|
* Static metadata merged into matching segments.
|
|
1176
1206
|
*/
|
|
@@ -1179,13 +1209,9 @@ interface ArabicDictionaryEntryRuleOptions {
|
|
|
1179
1209
|
/**
|
|
1180
1210
|
* Creates a reusable split rule for Arabic dictionary entries.
|
|
1181
1211
|
*
|
|
1182
|
-
* The
|
|
1183
|
-
*
|
|
1184
|
-
*
|
|
1185
|
-
* - matches root entries at true line/page starts
|
|
1186
|
-
* - matches mid-line subentries conservatively when they begin with `و`
|
|
1187
|
-
* - can optionally support parenthesized headwords like `(عنبر) :`
|
|
1188
|
-
* - can optionally support comma-separated headword lists like `سبد، دبس:`
|
|
1212
|
+
* The returned rule preserves authoring intent as a serializable
|
|
1213
|
+
* `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
|
|
1214
|
+
* regex string.
|
|
1189
1215
|
*
|
|
1190
1216
|
* @example
|
|
1191
1217
|
* createArabicDictionaryEntryRule({
|
|
@@ -1208,6 +1234,7 @@ declare const createArabicDictionaryEntryRule: ({
|
|
|
1208
1234
|
captureName,
|
|
1209
1235
|
maxLetters,
|
|
1210
1236
|
meta,
|
|
1237
|
+
midLineSubentries,
|
|
1211
1238
|
minLetters,
|
|
1212
1239
|
pageStartPrevWordStoplist,
|
|
1213
1240
|
samePagePrevWordStoplist,
|
|
@@ -1269,7 +1296,7 @@ declare const getSegmentDebugReason: (segment: Segment, options?: DebugReasonOpt
|
|
|
1269
1296
|
/**
|
|
1270
1297
|
* Types of validation issues that can be detected.
|
|
1271
1298
|
*/
|
|
1272
|
-
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex';
|
|
1299
|
+
type ValidationIssueType = 'missing_braces' | 'unknown_token' | 'duplicate' | 'empty_pattern' | 'invalid_regex' | 'invalid_option';
|
|
1273
1300
|
/**
|
|
1274
1301
|
* A validation issue found in a pattern.
|
|
1275
1302
|
*/
|
|
@@ -1290,6 +1317,7 @@ type RuleValidationResult = {
|
|
|
1290
1317
|
lineEndsWith?: (ValidationIssue | undefined)[];
|
|
1291
1318
|
template?: ValidationIssue;
|
|
1292
1319
|
regex?: ValidationIssue;
|
|
1320
|
+
dictionaryEntry?: Partial<Record<keyof DictionaryEntryPatternOptions, ValidationIssue>>;
|
|
1293
1321
|
};
|
|
1294
1322
|
/**
|
|
1295
1323
|
* Validates split rules for common pattern issues.
|
|
@@ -1756,5 +1784,5 @@ type ValidationOptions = {
|
|
|
1756
1784
|
*/
|
|
1757
1785
|
declare const validateSegments: (pages: Page[], options: SegmentationOptions, segments: Segment[], validationOptions?: ValidationOptions) => SegmentValidationReport;
|
|
1758
1786
|
//#endregion
|
|
1759
|
-
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
|
|
1787
|
+
export { ARABIC_BASE_LETTER_CLASS, ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN, ARABIC_MARKS_CLASS, ARABIC_WORD_WITH_OPTIONAL_MARKS_PATTERN, type ArabicDictionaryEntryRuleOptions, type Breakpoint, type BreakpointRule, type CommonLineStartPattern, type CondenseEllipsisRule, type DetectedPattern, type DictionaryEntryPatternOptions, type ExpandResult, type FixTrailingWawRule, type LineStartAnalysisOptions, type LineStartPatternExample, type Logger, type OptimizeResult, PATTERN_TYPE_KEYS, type Page, type PageRange, type PageRangeConstraint, type PageRangeConstraintWithExclude, type PatternProcessor, type PatternTypeKey, type PreprocessTransform, type RemoveZeroWidthRule, type RepeatingSequenceExample, type RepeatingSequenceOptions, type RepeatingSequencePattern, type RuleValidationResult, type Segment, type SegmentValidationIssue, type SegmentValidationIssueSeverity, type SegmentValidationIssueType, type SegmentValidationReport, type SegmentationOptions, type SplitRule, TOKEN_PATTERNS, Token, type TokenKey, type TokenMapping, type TokenPatternName, type ValidationIssue, type ValidationIssueType, type ValidationOptions, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyPreprocessToPage, applyTokenMappings, condenseEllipsis, containsTokens, createArabicDictionaryEntryRule, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, escapeWordsOutsideTokens, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, fixTrailingWaw, formatValidationReport, generateTemplateFromText, getAvailableTokens, getDebugReason, getSegmentDebugReason, getTokenPattern, makeDiacriticInsensitive, normalizeArabicForComparison, optimizeRules, removeZeroWidth, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules, validateSegments, withCapture };
|
|
1760
1788
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/types/breakpoints.ts","../src/types/rules.ts","../src/types/options.ts","../src/types/validation.ts","../src/types/index.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/optimization/optimize-rules.ts","../src/preprocessing/transforms.ts","../src/segmentation/arabic-dictionary-rule.ts","../src/segmentation/breakpoint-utils.ts","../src/segmentation/debug-meta.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/utils/textUtils.ts","../src/validation/validate-segments.ts"],"mappings":";AAqBA;;;;;;;;;;;;AA2GA;;;;;;;AA3GA,KAAY,cAAA,GAAiB,8BAAA;ECOxB;;;;;AAEI;;;;;AA4BG;EDzBR,OAAA;;;;ACuDc;;;;;AAiCC;;;;;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/types/breakpoints.ts","../src/types/rules.ts","../src/types/options.ts","../src/types/validation.ts","../src/types/index.ts","../src/analysis/line-starts.ts","../src/analysis/repeating-sequences.ts","../src/detection.ts","../src/optimization/optimize-rules.ts","../src/preprocessing/transforms.ts","../src/segmentation/arabic-dictionary-rule.ts","../src/segmentation/breakpoint-utils.ts","../src/segmentation/debug-meta.ts","../src/segmentation/pattern-validator.ts","../src/segmentation/segmenter.ts","../src/segmentation/tokens.ts","../src/utils/textUtils.ts","../src/validation/validate-segments.ts"],"mappings":";AAqBA;;;;;;;;;;;;AA2GA;;;;;;;AA3GA,KAAY,cAAA,GAAiB,8BAAA;ECOxB;;;;;AAEI;;;;;AA4BG;EDzBR,OAAA;;;;ACuDc;;;;;AAiCC;;;;;AAiCnB;EDzGI,KAAA;;;;;;;;;;;;;AC6JH;;;;;AAeiD;;;;;;;;;;;ED9I9C,KAAA;EC8JE;;;;;;;AAqBN;;;EDvKI,KAAA;EC8KM;AAOV;;;;;AAAgE;;;;;;;;;AA0CvD;;;ED3ML,QAAA;AAAA;;;;;;;;ACgUJ;;;;;;;KD/SY,UAAA,YAAsB,cAAA;;;AA3GlC;;;;;;;;;;;;AA2GA;;;;;;;;AChIiE;;;;;AA8BxD;ADTT,KCOK,YAAA;wEAED,KAAA;AAAA;AA4BQ;;;;;AA8BM;;;;;AAiCC;;;;;AAiCnB;;;;;;;;AAhGY,KAFP,eAAA;EAyID,2GAvIA,QAAA;AAAA;;;AAoJH;;;;;AAeiD;;;;;;;;;;;;;;;;;;KAvI7C,qBAAA;EA4KQ,kHA1KT,cAAA;AAAA;;;AAwLJ;;;;;AAAgE;;;;;;;;;AA0CvD;;;;;;;;;;;;KAnMJ,sBAAA;EAwTgB,oHAtTjB,eAAA;AAAA;;;;;;;;;;;;;AC/FJ;;;;;;;KDqHK,mBAAA;EC5GG,kFD8GJ,YAAA;AAAA;;;;AC9DJ;;;UDuEiB,6BAAA;ECtET;AAqBR;;;;;;EDyDI,SAAA;ECnDoB;;;;EDyDpB,kBAAA;ECzDoB;AA0BxB;;;EDqCI,0BAAA;ECnCA;;;;EDyCA,mBAAA;ECvC6B;;;;;ED8C7B,iBAAA;EC1C6B;;;;EDgD7B,WAAA;EC9CgC;AAmCpC;;;EDiBI,UAAA;ECQoB;;;;EDFpB,UAAA;AAAA;;;;;;;;;;;;;KAeC,sBAAA;EACD,eAAA,EAAiB,6BAAA;AAAA;;;;;;;AE7NrB;;;;;KF2OK,WAAA,GACC,YAAA,GACA,eAAA,GACA,qBAAA,GACA,sBAAA,GACA,mBAAA,GACA,sBAAA;;;;;AEzON;;;;;;;;;;;cF0Pa,iBAAA;;;;;;KAcD,cAAA,WAAyB,iBAAA;;;;;;;KAUhC,aAAA;EE3PG;;AAGR;;;;EF+PI,KAAA;EE7PA;;;;;;;;EFuQA,UAAA;EEhQ8B;;;;AC7BlC;;;;;;;;;EH4SI,KAAA;AAAA;AGhQJ;;;;;AA2BA;;;AA3BA,KH2QK,eAAA,GAAkB,8BAAA;EGhPF;AAgBrB;;;;;AAwBA;;;;;EHoNI,IAAA,GAAO,MAAA;EGpMG;;
;;;;AC1Hd;;;;;;;;;;;;;;EJoVI,cAAA;EI1UA;;;;;AAIJ;;;;;AAEA;;;;;;;;;EJyVI,yBAAA;EI/ES;;;;;;;;;;;;;;;;;;EJmGT,wBAAA;AAAA;;;;;;;;;;;;;;;AK1WJ;;;;;;;;;;AAOA;;KLkYY,SAAA,GAAY,WAAA,GAAc,aAAA,GAAgB,eAAA;;;;;;;;;;;;;AD/StD;;;;;;;;AChIiE;;;;KC0BrD,mBAAA,GAAsB,mBAAA;EAC9B,IAAA;ED6BgB;;;;AAER;;;ECvBR,IAAA;AAAA;ADqDc;;;;;AAiCC;;;;;AAiCnB;;;;;;;;;;AAlEkB,KC9BN,oBAAA,GAAuB,mBAAA;EAC/B,IAAA;AAAA;;ADmJH;;;;;AAeiD;;;;;;;;;;;;;;;KC1ItC,kBAAA,GAAqB,mBAAA;EAC7B,IAAA;AAAA;;AD8KJ;;;;;AAcA;;;;;AAAgE;;;;;;;KCvKpD,mBAAA,+DAIN,mBAAA,GACA,oBAAA,GACA,kBAAA;;AD2MG;;;;;;;;;;;;AAqHT;;;;;;;;;;;UCtSiB,MAAA;EDsSoD;ECpSjE,KAAA,IAAS,OAAA,aAAoB,IAAA;;EAE7B,KAAA,IAAS,OAAA,aAAoB,IAAA;EAnHrB;EAqHR,IAAA,IAAQ,OAAA,aAAoB,IAAA;;EAE5B,KAAA,IAAS,OAAA,aAAoB,IAAA;EAvHC;EAyH9B,IAAA,IAAQ,OAAA,aAAoB,IAAA;AAAA;;;AAzFhC;;;;;AAyBA;;;;;AAsBA;;;;;;;;;;;;;AAgCA;;;;;;;KA6CY,mBAAA;EAzCC;;;;;;;EAiDT,KAAA,GAAQ,SAAA;EA3CR;;;;;AAmCJ;;;;EAmBI,KAAA;IAkEc,4DA9DJ,OAAA,WAgJG;IA9IH,OAAA,GAAU,KAAA;EAAA;EAjBpB;;;;;;;;;;;;;;EAkCA,QAAA;EA6Ha;;;;;;AC7VjB;;;;;ED6OI,gBAAA;EC3OkC;;;;AAMtC;;;;;;;;;;;;;;;;;;;;;;;;EDmQI,WAAA,GAAc,UAAA;EC5OV;;AAGR;;;;;EDkPI,MAAA;EC/OI;;;;;;;;;;;;ED6PJ,UAAA;EEpRe;;;;;;;;;;AA4CnB;;;;;AA2BA;;;;;AAgBA;;;;;AAwBA;;;;;;EFsMI,MAAA,GAAS,MAAA;EEtLU;;;;;AC1HvB;;;;;;;;;;;;;;;;;;;EH0UI,UAAA,GAAa,mBAAA;AAAA;;;KC7VL,8BAAA;AAAA,KAEA,0BAAA;AAAA,KAMA,sBAAA;EACR,IAAA,EAAM,0BAAA;EACN,QAAA,EAAU,8BAAA;EACV,YAAA;EACA,OAAA;IACI,IAAA;IACA,EAAA;IACA,cAAA;EAAA;EAEJ,QAAA;IACI,IAAA;IACA,EAAA;EAAA;EAEJ,MAAA;IACI,IAAA;IACA,EAAA;EAAA;EAEJ,WAAA;IACI,MAAA;IACA,WAAA;IACA,UAAA;EAAA;EAEJ,QAAA;EACA,IAAA;AAAA;AAAA,KAGQ,uBAAA;EACR,EAAA;EACA,OAAA;IACI,YAAA;IACA,SAAA;IACA,MAAA;IACA,MAAA;IACA,QAAA;EAAA;EAEJ,MAAA,EAAQ,sBAAA;AAAA;;;;AHtBZ;;;;;;;;;;;;AA2GA;KIlHY,OAAA;;;;;;;EAOR,OAAA;EHOa;;;EGFb,IAAA;EH8BC;;;;;AAEO;EGxBR,EAAA;;;;AHsDc;;;;EG7Cd,IAAA,GAAO,MAAA;AAAA;;;;;AH+GX;;;;;;;;KGhGY,IAAA;EHuIR;;;;;EGjIA,EAAA;EH4JuB;;;;AACuB;;EGrJ9C,OAAA;AAAA;;;;;;;;;;;KAaQ,SAAA;;;;;AH6KZ;;;;;AAcA;;;;;KG3KY,mBAAA;EHqLM;;;;EGhLd,GAAA;EHgNA;;;AAAK;EG1ML,GAAA;AAAA;;;;;;;;;;AH+
TJ;KGlTY,8BAAA,GAAiC,mBAAA;;;;;;;;;;;;;;;AFnG7C;EEmHI,OAAA,GAAU,SAAA;AAAA;;;KC1HF,wBAAA;EACR,IAAA;EACA,WAAA;EACA,aAAA;EACA,QAAA;EACA,WAAA;EACA,wBAAA;EACA,yBAAA;EACA,MAAA;EACA,UAAA,IAAc,IAAA,UAAc,MAAA;EAC5B,cAAA,GAAiB,MAAA;EACjB,UAAA;AAAA;AAAA,KAGQ,uBAAA;EAA4B,IAAA;EAAc,MAAA;AAAA;AAAA,KAE1C,sBAAA;EACR,OAAA;EACA,KAAA;EACA,QAAA,EAAU,uBAAA;AAAA;;;;cAuQD,uBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,GAAS,wBAAA,KACV,sBAAA;;;KCvRS,wBAAA;EACR,WAAA;EACA,WAAA;EACA,QAAA;EACA,IAAA;EACA,yBAAA;EACA,YAAA;EACA,UAAA;EACA,WAAA;EACA,YAAA;EACA,iBAAA;AAAA;AAAA,KAGQ,wBAAA;EACR,IAAA;EACA,OAAA;EACA,MAAA;EACA,YAAA;AAAA;AAAA,KAGQ,wBAAA;EACR,OAAA;EACA,KAAA;EACA,QAAA,EAAU,wBAAA;AAAA;;ALwCI;;;;;cK6LL,yBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,GAAU,wBAAA,KACX,wBAAA;;;;ANnQH;;KOhBY,eAAA;EPgB+C,6DOdvD,KAAA,UP0BA;EOxBA,KAAA,UPsEA;EOpEA,KAAA,UPoGA;EOlGA,QAAA;AAAA;APmHJ;;;;;;;;AChIiE;;;;;AA8BxD;;ADkGT,cO5Ca,mBAAA,GAAuB,IAAA,aAAY,eAAA;;;AN1BpC;;;;;AA8BM;;;;;AAiCC;cM2BN,wBAAA,GAA4B,IAAA,UAAc,QAAA,EAAU,eAAA;;;;ANMjE;;;cMiBa,oBAAA,GACT,QAAA,EAAU,eAAA;EACT,WAAA;EAAmD,KAAA;EAAgB,QAAA;AAAA;;;;;;;cA+B3D,kBAAA,GACT,IAAA;EAEA,QAAA;EACA,WAAA;EACA,KAAA;EACA,QAAA;EACA,QAAA,EAAU,eAAA;AAAA;;;AP9Ld;;;AAAA,KQbY,cAAA;ERaiB,yDQXzB,KAAA,EAAO,SAAA,IRuCP;EQrCA,WAAA;AAAA;AAAA,cAqES,aAAA,GAAiB,KAAA,EAAO,SAAA;;SAAA,SAAA;AAAA;;;;;AR+CrC;;;;;cShGa,eAAA,GAAmB,IAAA,UAAc,IAAA;;;ARhCmB;;;;;AA8BxD;;cQkCI,gBAAA,GAAoB,IAAA;;;ARNrB;;;;;AA8BM;cQdL,cAAA,GAAkB,IAAA;;;;AR+CZ;;;;;AAiCnB;;;cQrCa,qBAAA,GAAyB,OAAA,UAAiB,MAAA,UAAgB,UAAA,EAAY,mBAAA;;;UCjHlE,gCAAA,SAAyC,6BAAA;EViBhC;;;;EUZtB,yBAAA;EVwCA;;;;EUlCA,wBAAA;EVgGQ;AAiBZ;;EU5GI,IAAA,GAAO,MAAA;AAAA;;;;;ATUF;;;;;AA4BG;;;;;AA8BM;;;;;AAiCC;;cSgEN,+BAAA;EAAmC,mBAAA;EAAA,kBAAA;EAAA,0BAAA;EAAA,WAAA;EAAA,UAAA;EAAA,IAAA;EAAA,iBAAA;EAAA,UAAA;EAAA,yBAAA;EAAA,wBAAA;EAAA;AAAA,GAY7C,gCAAA,KAAmC,SAAA;;;AVrEtC;;;;;;;;AChIiE;;;;;AA8BxD;;;;;AA4BG;;;;ADsEZ,cWnFa,wBAAA,GAA4B,IAAA;ATazC;AAAA,KS6JY,gBAAA,IAAoB,OAAA;;;;;;KCnIpB,kBAAA;EXsEK;;;;EWjEb,OAAA;AAAA;;;;;;cAyDS,cAAA,GAAkB,IAAA,EAAM,MAAA,2BAAiC,OAAA,GAAU,kBAAA;;;AX4D/E
;;;cWhCY,qBAAA,GAAyB,OAAA,EAAS,OAAA,EAAS,OAAA,GAAU,kBAAA;;;;;;KCjKtD,mBAAA;;;AbmHZ;KaxGY,eAAA;EACR,IAAA,EAAM,mBAAA;EACN,OAAA;EACA,UAAA;EAEA,KAAA;EAEA,OAAA;AAAA;;;;AZDK;KYQG,oBAAA;EACR,cAAA,IAAkB,eAAA;EAClB,eAAA,IAAmB,eAAA;EACnB,YAAA,IAAgB,eAAA;EAChB,QAAA,GAAW,eAAA;EACX,KAAA,GAAQ,eAAA;EACR,eAAA,GAAkB,OAAA,CAAQ,MAAA,OAAa,6BAAA,EAA+B,eAAA;AAAA;;AZ4CxD;;;;;AAiCC;;;;;AAiCnB;;;;;;;;cY0Ga,aAAA,GAAiB,KAAA,EAAO,SAAA,QAAW,oBAAA;;;;;;AZtD/C;;;;;AAeiD;;;cYuErC,sBAAA,GAA0B,OAAA,GAAU,oBAAA;;;;;;;;AZpSgB;;;;;AA8BxD;;;;;AA4BG;;;;;AA8BM;;;;;AAiCC;;;;;AAiCnB;;;;;;;;;;;;ca8Ka,YAAA,GAAgB,KAAA,EAAO,IAAA,IAAQ,OAAA,EAAS,mBAAA,KAAmB,OAAA;;;;AdnTxE;;;;;;ceda,wBAAA;;;;cAKA,kBAAA;;AfoHb;;ce/Ga,yCAAA;;;;cAKA,uCAAA;;cAiHA,KAAA;Ed3GI,oDAEb;EAAA,mCA0BC;EAAA;6BAEO;EAAA,2BA4Bc;EAAA,2BAEtB;EAAA,6BA+BC;EAAA;+BAEc;EAAA,2BAsBK;EAAA,iCAEpB;EAAA,yBASa;EAAA;6BAQb;EAAA,2BAYA;EAAA,6BAaA;EAAA,6BAYA;EAAA;;;AAOH;;Kc7BW,QAAA,gBAAwB,KAAA;;;Ad4Cc;KcvCtC,gBAAA,gBAAgC,cAAA;;cAG/B,WAAA,GAAe,KAAA,UAAe,IAAA;;cAiB9B,+BAAA,GAAmC,QAAA;;;;;;;;;;;;;;AdwDhD;;;;;AAcA;;;;;AAAgE;;cctBnD,cAAA;EdgCK,iDAiBd;EAAA,2BAeK;EAAA,0BAWJ;EAAA;yBAAkB;EAAA,wBAYZ;EAAA,yKA2CP;EAAA,6BAoBwB;EAAA,wBA+BhB;EAAA;2BAA0B;EAAA,qBAA+B;EAAA,uBAA7C;EAAA,oCAA8B;EAAA,sCAAe;EAAA;;;;;;;;;;;ACrXrE;;;;;AAyBA;;cakNa,cAAA,GAAkB,KAAA;;;Ab5L/B;;;;KauMY,YAAA;EbjMN;;;;;EauMF,OAAA;EbvMoB;;AA0BxB;;;EaoLI,YAAA;EblLA;;;;;EayLA,WAAA;AAAA;;;;;;;;;;;Ab9IJ;;;;;;;;;;;;;;;;;;;;;;;;;;ca4Qa,wBAAA,GACT,KAAA,UACA,cAAA,IAAkB,OAAA,qBAClB,aAAA;;;;;AZrcJ;;;;;AAEA;;;;;AAMA;;;;;;;;;AARA,cYgfa,YAAA,GAAgB,KAAA;;;;;;;;;;;;;;;;;;AZ9c7B;;;;cYqea,eAAA,GAAmB,QAAA,aAAgB,MAAA;;;;;;;;;;;;;cAqBnC,kBAAA,QAAyB,gBAAA;AX9gBtC;;;;;;;;;;;AA4CA;;;AA5CA,cW8hBa,eAAA,GAAmB,SAAA,EAAW,gBAAA;;AXvd3C;;;;;AAgBA;;;;;AAwBA;;;cW6ca,oBAAA,GAAwB,QAAA;;;;KAWzB,YAAA;EAAiB,KAAA;EAAe,IAAA;AAAA;;AVlkB5C;;;;;;;;;;;;;;;;;cUslBa,kBAAA,GAAsB,QAAA,UAAkB,QAAA,EAAU,YAAA;;;AVxkB/D;;;;;AAEA;;;;;;;;cUmmBa,kBAAA,GAAsB,QAAA;;;;;;AftgBnC;;;;;;;;AChIiE;;;;;AA8BxD;;;;;AA4BG;;cenBC,sBAAA,GAA0B,OAAA;;;AfiDrB;;;;;AAiCC;;;;;AAiCnB;ce1Da,WA
AA,GAAe,CAAA;;;;;;;;;;cAwBf,4BAAA,GAAgC,IAAA;AAAA,cAiBhC,wBAAA,GAA4B,IAAA;;;KCzC7B,iBAAA;EjB3E+C;;;;;EiBiFvD,mBAAA;AAAA;;;AjB0BJ;;;;;;;;AChIiE;;;;cgB0iBpD,gBAAA,GACT,KAAA,EAAO,IAAA,IACP,OAAA,EAAS,mBAAA,EACT,QAAA,EAAU,OAAA,IACV,iBAAA,GAAoB,iBAAA,KACrB,uBAAA"}
|
package/dist/index.mjs
CHANGED
|
@@ -1300,7 +1300,8 @@ const PATTERN_TYPE_KEYS = [
|
|
|
1300
1300
|
"lineStartsAfter",
|
|
1301
1301
|
"lineEndsWith",
|
|
1302
1302
|
"template",
|
|
1303
|
-
"regex"
|
|
1303
|
+
"regex",
|
|
1304
|
+
"dictionaryEntry"
|
|
1304
1305
|
];
|
|
1305
1306
|
//#endregion
|
|
1306
1307
|
//#region src/optimization/optimize-rules.ts
|
|
@@ -1319,11 +1320,17 @@ const getPatternArray = (rule, key) => {
|
|
|
1319
1320
|
};
|
|
1320
1321
|
const getPatternString = (rule, key) => {
|
|
1321
1322
|
const value = rule[key];
|
|
1322
|
-
return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : "";
|
|
1323
|
+
return typeof value === "string" ? value : Array.isArray(value) ? value.join("\n") : value ? JSON.stringify(value) : "";
|
|
1323
1324
|
};
|
|
1324
1325
|
const normalizePatterns = (patterns) => [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
|
|
1326
|
+
const getDictionaryEntrySpecificityScore = (rule) => {
|
|
1327
|
+
if (!("dictionaryEntry" in rule)) return 0;
|
|
1328
|
+
const { allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords } = rule.dictionaryEntry;
|
|
1329
|
+
return minLetters * 20 + maxLetters + (allowCommaSeparated ? 0 : 120) + (allowParenthesized ? 0 : 60) + (allowWhitespaceBeforeColon ? 0 : 20) + (midLineSubentries ? 0 : 160) + Math.min(stopWords.length, 25);
|
|
1330
|
+
};
|
|
1325
1331
|
const getSpecificityScore = (rule) => {
|
|
1326
1332
|
const key = getPatternKey(rule);
|
|
1333
|
+
if (key === "dictionaryEntry") return getDictionaryEntrySpecificityScore(rule);
|
|
1327
1334
|
return MERGEABLE_KEYS.has(key) ? getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0) : getPatternString(rule, key).length;
|
|
1328
1335
|
};
|
|
1329
1336
|
const createMergeKey = (rule) => {
|
|
@@ -1470,21 +1477,21 @@ const applyPreprocessToPage = (content, pageId, transforms) => {
|
|
|
1470
1477
|
};
|
|
1471
1478
|
//#endregion
|
|
1472
1479
|
//#region src/segmentation/arabic-dictionary-rule.ts
|
|
1473
|
-
const
|
|
1480
|
+
const uniqueCanonicalWords = (words) => {
|
|
1474
1481
|
const seen = /* @__PURE__ */ new Set();
|
|
1475
1482
|
const result = [];
|
|
1476
1483
|
for (const word of words) {
|
|
1477
1484
|
const normalized = normalizeArabicForComparison(word);
|
|
1478
1485
|
if (!normalized || seen.has(normalized)) continue;
|
|
1479
1486
|
seen.add(normalized);
|
|
1480
|
-
result.push(
|
|
1487
|
+
result.push(word);
|
|
1481
1488
|
}
|
|
1482
1489
|
return result;
|
|
1483
1490
|
};
|
|
1484
1491
|
const buildStopAlternation = (stopWords) => {
|
|
1485
|
-
const unique =
|
|
1492
|
+
const unique = uniqueCanonicalWords(stopWords);
|
|
1486
1493
|
if (unique.length === 0) return "";
|
|
1487
|
-
return unique.map((word) => makeDiacriticInsensitive(word)).join("|");
|
|
1494
|
+
return unique.map((word) => makeDiacriticInsensitive(normalizeArabicForComparison(word))).join("|");
|
|
1488
1495
|
};
|
|
1489
1496
|
const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation, stopwordBody, unit }) => {
|
|
1490
1497
|
if (!stopAlternation) return allowCommaSeparated ? `${unit}(?:\\s*[،,]\\s*${unit})*` : unit;
|
|
@@ -1493,20 +1500,53 @@ const buildHeadwordBody = ({ allowCommaSeparated, colonPattern, stopAlternation,
|
|
|
1493
1500
|
};
|
|
1494
1501
|
const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, captureName, headwordBody }) => {
|
|
1495
1502
|
const colon = allowWhitespaceBeforeColon ? "\\s*:" : ":";
|
|
1496
|
-
const withCapture =
|
|
1503
|
+
const withCapture = `(?<${captureName}>${headwordBody})`;
|
|
1497
1504
|
if (!allowParenthesized) return `${withCapture}${colon}`;
|
|
1498
1505
|
return `(?:\\(\\s*${withCapture}\\s*\\)|${withCapture})${colon}`;
|
|
1499
1506
|
};
|
|
1507
|
+
const validateDictionaryEntryOptions = ({ captureName = "lemma", maxLetters = 10, minLetters = 2 }) => {
|
|
1508
|
+
if (!Number.isInteger(minLetters) || minLetters < 1) throw new Error(`createArabicDictionaryEntryRule: minLetters must be an integer >= 1, got ${minLetters}`);
|
|
1509
|
+
if (!Number.isInteger(maxLetters) || maxLetters < minLetters) throw new Error(`createArabicDictionaryEntryRule: maxLetters must be an integer >= minLetters, got ${maxLetters}`);
|
|
1510
|
+
if (!captureName.match(/^[A-Za-z_]\w*$/)) throw new Error(`createArabicDictionaryEntryRule: invalid captureName "${captureName}"`);
|
|
1511
|
+
};
|
|
1512
|
+
/**
 * Compiles dictionary-entry options into a regex source string.
 *
 * The produced pattern is an anchor group followed by the headword marker:
 * - the anchor always accepts a position right after start-of-input or a
 *   newline, optionally followed by bidi/zero-width control characters;
 * - when `midLineSubentries` is truthy it also accepts a position after
 *   whitespace that is followed by `و` (optionally with `ال`), and by an
 *   optional opening parenthesis first when `allowParenthesized` is set.
 *
 * @param options - dictionary-entry options (validated before use)
 * @param capturePrefix - optional prefix prepended to `captureName`
 * @returns `{ captureNames, regex }` where `captureNames` holds the single
 *   (possibly prefixed) headword capture name
 * @throws {Error} when `minLetters`, `maxLetters` or `captureName` is invalid
 */
const buildArabicDictionaryEntryRegexSource = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, midLineSubentries = true, minLetters = 2, stopWords }, capturePrefix) => {
    validateDictionaryEntryOptions({ captureName, maxLetters, minLetters });

    // Building blocks: optional conjunction/article prefixes and the letter run.
    const marks = `${ARABIC_MARKS_CLASS}*`;
    const waw = `و${marks}`;
    const al = `ا${marks}ل${marks}`;
    const letter = ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN;
    const letterRun = `${letter}(?:${letter}){${minLetters - 1},${maxLetters - 1}}`;
    const lemmaUnit = `(?:${waw})?(?:${al})?${letterRun}`;

    const stopAlternation = buildStopAlternation(stopWords);
    const headwordBody = buildHeadwordBody({
        allowCommaSeparated,
        colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
        stopAlternation,
        stopwordBody: stopAlternation ? `(?:${waw})?(?:${stopAlternation})` : "",
        unit: lemmaUnit
    });

    // Anchor alternatives: line start is always allowed, the mid-line trigger is opt-out.
    const anchors = [`(?:(?<=^)|(?<=\\n))[\\u200E\\u200F\\u061C\\u200B\\u200C\\u200D\\uFEFF]*`];
    if (midLineSubentries) {
        const optionalOpenParen = allowParenthesized ? "(?:\\(\\s*)?" : "";
        anchors.push(`(?<=\\s)(?=${optionalOpenParen}${waw}(?:${al})?)`);
    }

    const resolvedCaptureName = capturePrefix ? `${capturePrefix}${captureName}` : captureName;
    const marker = buildBalancedMarker({
        allowParenthesized,
        allowWhitespaceBeforeColon,
        captureName: resolvedCaptureName,
        headwordBody
    });
    return {
        captureNames: [resolvedCaptureName],
        regex: `(?:${anchors.join("|")})${marker}`
    };
};
|
|
1500
1544
|
/**
|
|
1501
1545
|
* Creates a reusable split rule for Arabic dictionary entries.
|
|
1502
1546
|
*
|
|
1503
|
-
* The
|
|
1504
|
-
*
|
|
1505
|
-
*
|
|
1506
|
-
* - matches root entries at true line/page starts
|
|
1507
|
-
* - matches mid-line subentries conservatively when they begin with `و`
|
|
1508
|
-
* - can optionally support parenthesized headwords like `(عنبر) :`
|
|
1509
|
-
* - can optionally support comma-separated headword lists like `سبد، دبس:`
|
|
1547
|
+
* The returned rule preserves authoring intent as a serializable
|
|
1548
|
+
* `{ dictionaryEntry: ... }` pattern rather than eagerly compiling to a raw
|
|
1549
|
+
* regex string.
|
|
1510
1550
|
*
|
|
1511
1551
|
* @example
|
|
1512
1552
|
* createArabicDictionaryEntryRule({
|
|
@@ -1522,33 +1562,26 @@ const buildBalancedMarker = ({ allowParenthesized, allowWhitespaceBeforeColon, c
|
|
|
1522
1562
|
* stopWords: ['الليث', 'العجاج'],
|
|
1523
1563
|
* })
|
|
1524
1564
|
*/
|
|
1525
|
-
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
const wawWithMarks = `و${ARABIC_MARKS_CLASS}*`;
|
|
1531
|
-
const alWithMarks = `ا${ARABIC_MARKS_CLASS}*ل${ARABIC_MARKS_CLASS}*`;
|
|
1532
|
-
const lemmaUnit = `(?:${wawWithMarks})?(?:${alWithMarks})?${`${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}(?:${ARABIC_LETTER_WITH_OPTIONAL_MARKS_PATTERN}){${minLetters - 1},${maxLetters - 1}}`}`;
|
|
1533
|
-
const stopAlternation = buildStopAlternation(stopWords);
|
|
1534
|
-
const lemmaBody = buildHeadwordBody({
|
|
1535
|
-
allowCommaSeparated,
|
|
1536
|
-
colonPattern: allowWhitespaceBeforeColon ? "\\s*:" : ":",
|
|
1537
|
-
stopAlternation,
|
|
1538
|
-
stopwordBody: stopAlternation ? `(?:${wawWithMarks})?(?:${stopAlternation})` : "",
|
|
1539
|
-
unit: lemmaUnit
|
|
1565
|
+
const createArabicDictionaryEntryRule = ({ allowCommaSeparated = false, allowParenthesized = false, allowWhitespaceBeforeColon = false, captureName = "lemma", maxLetters = 10, meta, midLineSubentries = true, minLetters = 2, pageStartPrevWordStoplist, samePagePrevWordStoplist, stopWords }) => {
    // Reject bad bounds / capture names up front, before any rule object exists.
    validateDictionaryEntryOptions({ captureName, maxLetters, minLetters });

    // Serializable pattern payload: defaults are materialized and the stop
    // words are passed through uniqueCanonicalWords.
    const dictionaryEntry = {
        allowCommaSeparated,
        allowParenthesized,
        allowWhitespaceBeforeColon,
        captureName,
        maxLetters,
        midLineSubentries,
        minLetters,
        stopWords: uniqueCanonicalWords(stopWords)
    };

    // Rule-level fields are forwarded unchanged next to the pattern.
    return {
        dictionaryEntry,
        meta,
        pageStartPrevWordStoplist,
        samePagePrevWordStoplist
    };
};
|
|
1554
1587
|
const WINDOW_PREFIX_LENGTHS = [
|
|
@@ -2636,6 +2669,26 @@ const validateRegexRule = (rule, result) => {
|
|
|
2636
2669
|
return true;
|
|
2637
2670
|
}
|
|
2638
2671
|
};
|
|
2672
|
+
const invalidDictionaryEntryIssue = (message) => ({
|
|
2673
|
+
message,
|
|
2674
|
+
type: "invalid_option"
|
|
2675
|
+
});
|
|
2676
|
+
const validateDictionaryEntryRule = (rule, result) => {
|
|
2677
|
+
if (!("dictionaryEntry" in rule) || !rule.dictionaryEntry) return false;
|
|
2678
|
+
const issues = {};
|
|
2679
|
+
const { allowCommaSeparated, allowParenthesized, allowWhitespaceBeforeColon, captureName, maxLetters, midLineSubentries, minLetters, stopWords } = rule.dictionaryEntry;
|
|
2680
|
+
if (!Array.isArray(stopWords) || stopWords.some((word) => typeof word !== "string" || !word.trim())) issues.stopWords = invalidDictionaryEntryIssue("stopWords must be a string[] with non-empty entries");
|
|
2681
|
+
if (allowCommaSeparated !== void 0 && typeof allowCommaSeparated !== "boolean") issues.allowCommaSeparated = invalidDictionaryEntryIssue("allowCommaSeparated must be a boolean");
|
|
2682
|
+
if (allowParenthesized !== void 0 && typeof allowParenthesized !== "boolean") issues.allowParenthesized = invalidDictionaryEntryIssue("allowParenthesized must be a boolean");
|
|
2683
|
+
if (allowWhitespaceBeforeColon !== void 0 && typeof allowWhitespaceBeforeColon !== "boolean") issues.allowWhitespaceBeforeColon = invalidDictionaryEntryIssue("allowWhitespaceBeforeColon must be a boolean");
|
|
2684
|
+
if (midLineSubentries !== void 0 && typeof midLineSubentries !== "boolean") issues.midLineSubentries = invalidDictionaryEntryIssue("midLineSubentries must be a boolean");
|
|
2685
|
+
if (captureName !== void 0 && !captureName.match(/^[A-Za-z_]\w*$/)) issues.captureName = invalidDictionaryEntryIssue(`captureName must match /^[A-Za-z_]\\w*$/, got "${captureName}"`);
|
|
2686
|
+
if (minLetters !== void 0 && (!Number.isInteger(minLetters) || minLetters < 1)) issues.minLetters = invalidDictionaryEntryIssue("minLetters must be an integer >= 1");
|
|
2687
|
+
if (maxLetters !== void 0 && (!Number.isInteger(maxLetters) || maxLetters < (minLetters ?? 2))) issues.maxLetters = invalidDictionaryEntryIssue(`maxLetters must be an integer >= ${minLetters ?? 2}`);
|
|
2688
|
+
if (Object.keys(issues).length === 0) return false;
|
|
2689
|
+
result.dictionaryEntry = issues;
|
|
2690
|
+
return true;
|
|
2691
|
+
};
|
|
2639
2692
|
const formatValidationIssue = (_type, issue, loc) => {
|
|
2640
2693
|
if (!issue) return null;
|
|
2641
2694
|
if (issue.type === "missing_braces") return `${loc}: Missing {{}} around token "${issue.token}"`;
|
|
@@ -2670,7 +2723,8 @@ const validateRules = (rules) => rules.map((rule) => {
|
|
|
2670
2723
|
const endsWithIssues = applyRulePatternValidation(result, "lineEndsWith", rule.lineEndsWith);
|
|
2671
2724
|
const templateIssues = validateTemplateRule(rule, result);
|
|
2672
2725
|
const regexIssues = validateRegexRule(rule, result);
|
|
2673
|
-
|
|
2726
|
+
const dictionaryEntryIssues = validateDictionaryEntryRule(rule, result);
|
|
2727
|
+
return startsWithIssues || startsAfterIssues || endsWithIssues || templateIssues || regexIssues || dictionaryEntryIssues ? result : void 0;
|
|
2674
2728
|
});
|
|
2675
2729
|
/**
|
|
2676
2730
|
* Formats a validation result array into a list of human-readable error messages.
|
|
@@ -2687,8 +2741,12 @@ const validateRules = (rules) => rules.map((rule) => {
|
|
|
2687
2741
|
*/
|
|
2688
2742
|
const formatValidationReport = (results) => {
    // Flat list of messages, in rule order and then in issue-type order.
    const messages = [];
    results.forEach((result, index) => {
        // Rules without issues contribute nothing.
        if (!result) return;
        // Rule numbers shown to the user are 1-based.
        const ruleNumber = index + 1;
        for (const [type, issues] of Object.entries(result)) {
            messages.push(...formatValidationIssues(type, issues, ruleNumber));
        }
    });
    return messages;
};
|
|
2746
|
+
/**
 * Formats all issues recorded for one pattern type of one rule.
 *
 * `dictionaryEntry` issues stored as a plain object are formatted per field
 * with a `Rule N, dictionaryEntry.<field>` location. Every other shape is
 * treated as a single issue or an array of issues located at `Rule N, <type>`.
 * Issues that format to `null` are dropped.
 *
 * @param type - pattern type key the issues belong to
 * @param issues - a single issue, an array of issues, or a field-to-issue object
 * @param ruleNumber - 1-based rule number used in the location prefix
 * @returns the formatted, non-null messages
 */
const formatValidationIssues = (type, issues, ruleNumber) => {
    const isFieldMap = type === "dictionaryEntry" && issues && typeof issues === "object" && !Array.isArray(issues);
    // Pair each issue with its location first, then format uniformly.
    const located = isFieldMap
        ? Object.entries(issues).map(([field, issue]) => [issue, `Rule ${ruleNumber}, ${type}.${field}`])
        : (Array.isArray(issues) ? issues : [issues]).map((issue) => [issue, `Rule ${ruleNumber}, ${type}`]);
    return located.map(([issue, loc]) => formatValidationIssue(type, issue, loc)).filter((msg) => msg !== null);
};
|
|
2692
2750
|
//#endregion
|
|
2693
2751
|
//#region src/segmentation/breakpoint-processor.ts
|
|
2694
2752
|
const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
|
|
@@ -3336,6 +3394,7 @@ const buildLineBasedRuleRegex = (rule, fuzzy, capturePrefix) => {
|
|
|
3336
3394
|
if ("lineStartsWith" in rule && Array.isArray(rule.lineStartsWith) && rule.lineStartsWith.length > 0) return buildLineStartsWithRegexSource(rule.lineStartsWith, fuzzy, capturePrefix);
|
|
3337
3395
|
if ("lineEndsWith" in rule && Array.isArray(rule.lineEndsWith) && rule.lineEndsWith.length > 0) return buildLineEndsWithRegexSource(rule.lineEndsWith, fuzzy, capturePrefix);
|
|
3338
3396
|
if ("template" in rule && typeof rule.template === "string") return buildTemplateRegexSource(rule.template, capturePrefix);
|
|
3397
|
+
if ("dictionaryEntry" in rule && rule.dictionaryEntry) return buildArabicDictionaryEntryRegexSource(rule.dictionaryEntry, capturePrefix);
|
|
3339
3398
|
return null;
|
|
3340
3399
|
};
|
|
3341
3400
|
/**
|
|
@@ -3358,7 +3417,7 @@ const buildRuleRegex = (rule, capturePrefix) => {
|
|
|
3358
3417
|
let finalRegex = ruleRegexSource?.regex;
|
|
3359
3418
|
let allCaptureNames = ruleRegexSource?.captureNames ?? [];
|
|
3360
3419
|
if (!finalRegex && "regex" in rule && typeof rule.regex === "string") finalRegex = rule.regex;
|
|
3361
|
-
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, or
|
|
3420
|
+
if (!finalRegex) throw new Error("Rule must specify exactly one pattern type: regex, template, lineStartsWith, lineStartsAfter, lineEndsWith, or dictionaryEntry");
|
|
3362
3421
|
if (allCaptureNames.length === 0) allCaptureNames = extractNamedCaptureNames(finalRegex);
|
|
3363
3422
|
return {
|
|
3364
3423
|
captureNames: allCaptureNames,
|